In [1]:

from yapf.yapflib.yapf_api import FormatCode

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import PowerTransformer

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

import pickle
from scipy.stats import norm
import warnings
warnings.filterwarnings("ignore")




def apply_preprocessing(raw_df, is_training=False):
    """Turn a raw dataset into a model-ready feature matrix and integer target.

    Numeric feature columns are passed through a ``PowerTransformer`` and all
    remaining feature columns through a dense ``OneHotEncoder``. The two
    results are joined side by side on a fresh positional index.

    Training mode (``is_training=True``):
        * ``raw_df`` is split 80/20 (shuffled, ``random_state=50``) and the
          20% hold-out is written to ``Dataset/test.csv``.
        * Both transformers are fitted on the 80% part and pickled to
          ``pickle/PowerTransformer.pkl`` and ``pickle/OneHotEncoder.pkl``.
        * The pre-processed 80% part is returned.

    Inference mode (``is_training=False``):
        * The pickled transformers are loaded and applied to ALL of
          ``raw_df``. Nothing is split and nothing is written to disk, so a
          previously saved ``Dataset/test.csv`` is left untouched.

    Args:
        raw_df: DataFrame holding the feature columns plus a target column
            named ``'y'`` whose values can be cast to ``int``.
        is_training: Fit and persist the transformers when true; otherwise
            reuse the persisted ones.

    Returns:
        Tuple ``(pp_X, y)``: the pre-processed feature DataFrame and the
        target Series, both re-indexed from 0.

    Raises:
        KeyError: If ``raw_df`` has no ``'y'`` column.
        FileNotFoundError: If a required directory (``Dataset/``, ``pickle/``)
            or pickle file does not exist.
    """
    if is_training:
        # Only a training call may carve out (and persist) the hold-out set;
        # doing this on every call would overwrite it during inference.
        train, test = train_test_split(raw_df, shuffle=True, test_size=0.2, random_state=50)
        test.to_csv('Dataset/test.csv', header=True, index=False, sep=',')

        features, target = train.drop(columns='y'), train['y']
        numeric_part = features.select_dtypes(include=[np.number])
        categorical_part = features.select_dtypes(exclude=[np.number])

        # 1) fit + 2) transform
        power = PowerTransformer()
        numeric_values = power.fit_transform(numeric_part)

        try:
            encoder = OneHotEncoder(sparse_output=False, drop=None)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, drop=None)
        categorical_values = encoder.fit_transform(categorical_part)

        # 3) save the fitted transformers for later inference calls
        with open('pickle/OneHotEncoder.pkl', 'wb') as output_file:
            pickle.dump(encoder, output_file)
        with open('pickle/PowerTransformer.pkl', 'wb') as output_file:
            pickle.dump(power, output_file)
    else:
        features, target = raw_df.drop(columns=['y']), raw_df['y']
        numeric_part = features.select_dtypes(include=[np.number])
        categorical_part = features.select_dtypes(exclude=[np.number])

        # 1) load -- NOTE: unpickling executes arbitrary code; these files
        # must only ever come from a trusted training run.
        with open('pickle/OneHotEncoder.pkl', 'rb') as input_file:
            encoder = pickle.load(input_file)
        with open('pickle/PowerTransformer.pkl', 'rb') as input_file:
            power = pickle.load(input_file)

        # 2) transform
        numeric_values = power.transform(numeric_part)
        categorical_values = encoder.transform(categorical_part)

    # The transformers return plain arrays; wrap them back into labelled
    # frames (both get a default 0..n-1 index, so the index merge is 1:1).
    numeric_frame = pd.DataFrame(numeric_values, columns=numeric_part.columns)
    categorical_frame = pd.DataFrame(categorical_values, columns=encoder.get_feature_names_out())
    pp_X = pd.merge(numeric_frame, categorical_frame, how='inner',
                    left_index=True, right_index=True)

    # Non-inplace reset so the caller's frame is never modified.
    target = target.reset_index(drop=True).astype('int')

    return pp_X, target
        
        
        


def apply_preprocessing_V2(raw_df, is_training=False):
    """Pre-process ``raw_df`` into a feature matrix and an integer target.

    Numeric feature columns are passed through a ``PowerTransformer`` and all
    remaining feature columns through a dense ``OneHotEncoder``; the results
    are joined side by side on a fresh positional index. No train/test split
    is performed here: the whole of ``raw_df`` is processed.

    Args:
        raw_df: DataFrame holding the feature columns plus a target column
            named ``'y'`` whose values can be cast to ``int``.
        is_training: When true, fit both transformers on ``raw_df`` and pickle
            them to ``pickle/PowerTransformer.pkl`` and
            ``pickle/OneHotEncoder.pkl``. When false, load those pickles and
            only apply them.

    Returns:
        Tuple ``(pp_X, y)``: the pre-processed feature DataFrame and the
        target Series, both re-indexed from 0.

    Raises:
        KeyError: If ``raw_df`` has no ``'y'`` column.
        FileNotFoundError: If the ``pickle/`` directory or a required pickle
            file does not exist.
    """
    features = raw_df.drop(columns='y')
    # Non-inplace reset so the caller's frame is never modified.
    y = raw_df['y'].reset_index(drop=True).astype('int')

    numeric_part = features.select_dtypes(include=[np.number])
    categorical_part = features.select_dtypes(exclude=[np.number])
    # Remember the labels now: the transformers below return bare arrays
    # that no longer carry a `.columns` attribute.
    numeric_columns = numeric_part.columns

    if is_training:
        # 1) fit + 2) transform
        power = PowerTransformer()
        numeric_values = power.fit_transform(numeric_part)

        try:
            encoder = OneHotEncoder(sparse_output=False, drop=None)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, drop=None)
        categorical_values = encoder.fit_transform(categorical_part)

        # 3) save the fitted transformers for later inference calls
        with open('pickle/OneHotEncoder.pkl', 'wb') as output_file:
            pickle.dump(encoder, output_file)
        with open('pickle/PowerTransformer.pkl', 'wb') as output_file:
            pickle.dump(power, output_file)
    else:
        # 1) load -- NOTE: unpickling executes arbitrary code; these files
        # must only ever come from a trusted training run.
        with open('pickle/OneHotEncoder.pkl', 'rb') as input_file:
            encoder = pickle.load(input_file)
        with open('pickle/PowerTransformer.pkl', 'rb') as input_file:
            power = pickle.load(input_file)

        # 2) transform
        numeric_values = power.transform(numeric_part)
        categorical_values = encoder.transform(categorical_part)

    # Wrap the arrays back into labelled frames; both get a default 0..n-1
    # index, so the index merge pairs rows one to one.
    numeric_frame = pd.DataFrame(numeric_values, columns=numeric_columns)
    categorical_frame = pd.DataFrame(categorical_values, columns=encoder.get_feature_names_out())
    pp_X = pd.merge(left=numeric_frame, right=categorical_frame, how='inner',
                    left_index=True, right_index=True)

    return pp_X, y


In [2]:
#version 3 

def apply_preprocessing_V3(raw_df, is_training=False):
    """Pre-process ``raw_df`` into a feature matrix and an integer target.

    Obtains a ``PowerTransformer`` (numeric columns) and a dense
    ``OneHotEncoder`` (all other columns) -- freshly fitted in training mode,
    unpickled otherwise -- and then runs one shared transform step. The whole
    of ``raw_df`` is processed; no train/test split happens here.

    Args:
        raw_df: DataFrame holding the feature columns plus a target column
            named ``'y'`` whose values can be cast to ``int``.
        is_training: When true, fit both transformers on ``raw_df`` and pickle
            them to ``pickle/PowerTransformer.pkl`` and
            ``pickle/OneHotEncoder.pkl``. When false, load those pickles.

    Returns:
        Tuple ``(pp_X, y)``: the pre-processed feature DataFrame and the
        target Series, both re-indexed from 0.

    Raises:
        KeyError: If ``raw_df`` has no ``'y'`` column.
        FileNotFoundError: If the ``pickle/`` directory or a required pickle
            file does not exist.
    """
    features = raw_df.drop(columns='y')
    # Non-inplace reset so the caller's frame is never modified.
    y = raw_df['y'].reset_index(drop=True).astype('int')

    numeric_part = features.select_dtypes(include=[np.number])
    categorical_part = features.select_dtypes(exclude=[np.number])

    if is_training:
        # 1) fit
        transformer = PowerTransformer()
        transformer.fit(numeric_part)

        try:
            encoder = OneHotEncoder(sparse_output=False, drop=None)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, drop=None)
        encoder.fit(categorical_part)

        # 2) save the fitted transformers for later inference calls
        with open('pickle/OneHotEncoder.pkl', 'wb') as output_file:
            pickle.dump(encoder, output_file)
        with open('pickle/PowerTransformer.pkl', 'wb') as output_file:
            pickle.dump(transformer, output_file)
    else:
        # 1) load -- NOTE: unpickling executes arbitrary code; these files
        # must only ever come from a trusted training run.
        with open('pickle/OneHotEncoder.pkl', 'rb') as input_file:
            encoder = pickle.load(input_file)
        with open('pickle/PowerTransformer.pkl', 'rb') as input_file:
            transformer = pickle.load(input_file)

    # 3) transform -- shared by both modes, so both branches above must bind
    # the same two names (`transformer`, `encoder`).
    numeric_values = transformer.transform(numeric_part)
    categorical_values = encoder.transform(categorical_part)

    # Column labels come from the original frame / the encoder, because the
    # transformed arrays themselves are unlabelled.
    numeric_frame = pd.DataFrame(numeric_values, columns=numeric_part.columns)
    categorical_frame = pd.DataFrame(categorical_values, columns=encoder.get_feature_names_out())

    # Both frames have a default 0..n-1 index, so the merge pairs rows 1:1.
    pp_X = pd.merge(left=numeric_frame, right=categorical_frame, how='inner',
                    left_index=True, right_index=True)

    return pp_X, y

In [3]:
# version 4

def apply_preprocessing_V4(raw_df, is_training=False):
    """Pre-process ``raw_df``, taking the LAST column as the target.

    Unlike the name-based variants, the target is selected by position: every
    column but the last is a feature, and the last column is the target.
    Numeric features are passed through a ``PowerTransformer`` and all other
    features through a dense ``OneHotEncoder``; the results are joined side by
    side on a fresh positional index. The whole of ``raw_df`` is processed.

    The returned feature frame carries real column labels (the original
    numeric column names followed by the encoder's feature names). Leaving
    both parts with default integer labels makes the merge emit a mix of
    suffixed strings (``'0_x'``, ``'0_y'``) and bare integers.

    Args:
        raw_df: DataFrame whose last column is the target (castable to
            ``int``) and whose other columns are the features.
        is_training: When true, fit both transformers on ``raw_df`` and pickle
            them to ``pickle/PowerTransformer.pkl`` and
            ``pickle/OneHotEncoder.pkl``. When false, load those pickles.

    Returns:
        Tuple ``(pp_X, y)``: the pre-processed feature DataFrame and the
        target as a single-column DataFrame, both re-indexed from 0.

    Raises:
        FileNotFoundError: If the ``pickle/`` directory or a required pickle
            file does not exist.
    """
    features = raw_df.iloc[:, :-1]
    # `-1:` (not `-1`) keeps the target two-dimensional. The reset is done
    # non-inplace because an iloc result may be tied to the caller's frame.
    y = raw_df.iloc[:, -1:].reset_index(drop=True).astype('int')

    numeric_part = features.select_dtypes(include=[np.number])
    categorical_part = features.select_dtypes(exclude=[np.number])

    if is_training:
        # 1) fit
        transformer = PowerTransformer()
        transformer.fit(numeric_part)

        try:
            encoder = OneHotEncoder(sparse_output=False, drop=None)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, drop=None)
        encoder.fit(categorical_part)

        # 2) save the fitted transformers for later inference calls
        with open('pickle/OneHotEncoder.pkl', 'wb') as output_file:
            pickle.dump(encoder, output_file)
        with open('pickle/PowerTransformer.pkl', 'wb') as output_file:
            pickle.dump(transformer, output_file)
    else:
        # 1) load -- NOTE: unpickling executes arbitrary code; these files
        # must only ever come from a trusted training run.
        with open('pickle/OneHotEncoder.pkl', 'rb') as input_file:
            encoder = pickle.load(input_file)
        with open('pickle/PowerTransformer.pkl', 'rb') as input_file:
            transformer = pickle.load(input_file)

    # 3) transform -- shared by both modes, so both branches above must bind
    # the same two names (`transformer`, `encoder`).
    numeric_values = transformer.transform(numeric_part)
    categorical_values = encoder.transform(categorical_part)

    numeric_frame = pd.DataFrame(numeric_values, columns=numeric_part.columns)
    categorical_frame = pd.DataFrame(categorical_values, columns=encoder.get_feature_names_out())

    # Both frames have a default 0..n-1 index, so the merge pairs rows 1:1.
    pp_X = pd.merge(left=numeric_frame, right=categorical_frame, how='inner',
                    left_index=True, right_index=True)

    return pp_X, y