# Cleaning Pipeline

In [1]:
#Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute, preprocessing, pipeline, model_selection, compose

In [2]:
# Seed for the train/test split so that Restart & Run All reproduces the same partition.
RANDOM_STATE = 42

# Load the raw training data.
df = pd.read_csv('spaceTrain.csv')

# Separate predictors from the label. The identifier / free-text columns
# (PassengerId, Cabin, Name) are dropped and not used as features.
features = df.drop(columns=['PassengerId', 'Cabin', 'Name', 'Transported'])
target = df['Transported']

# Columns whose dtype is not `object` are treated as numerical; everything else
# is treated as categorical.
numerical_features = [feat for feat in features.columns if features[feat].dtype != 'O']
categorical_features = [feat for feat in features.columns if feat not in numerical_features]
ordinal_features = ['VIP', 'CryoSleep']
nominal_features = ['HomePlanet', 'Destination']

# Hold out 20% of the rows for testing, stratified on the target so both parts
# keep the same class balance. The fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=RANDOM_STATE,
)

In [3]:
def fix_outlier(X, n_cols=6, whisker=1.5):
    """Cap outliers in the leading columns of a 2-D array at the IQR fences.

    For each of the first ``n_cols`` columns, values below
    ``Q1 - whisker * IQR`` are raised to that bound and values above
    ``Q3 + whisker * IQR`` are lowered to it. Rows are never removed, so the
    output has the same shape as the input.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data. It is copied into a NumPy array first, so the caller's
        object is not modified. The first ``n_cols`` columns must be
        convertible to float.
    n_cols : int, default=6
        Number of leading columns to process. If the array has fewer columns,
        only the existing ones are processed.
    whisker : float, default=1.5
        Multiplier applied to the inter-quartile range to obtain the fences.

    Returns
    -------
    numpy.ndarray
        A copy of ``X`` with the selected columns capped. NaN entries are
        left as NaN.
    """
    capped = np.array(X, copy=True)

    for col in range(min(n_cols, capped.shape[1])):
        values = capped[:, col].astype(float)

        # Nothing to cap in an empty or all-NaN column (and nanpercentile
        # would only emit a warning and return NaN for it).
        if np.isnan(values).all():
            continue

        # nanpercentile ignores missing values; plain percentile would return
        # NaN as soon as one value is missing, which disables the capping.
        q1, q3 = np.nanpercentile(values, [25.0, 75.0])
        fence = (q3 - q1) * whisker

        # np.clip leaves NaN entries untouched.
        capped[:, col] = np.clip(values, q1 - fence, q3 + fence)

    return capped

# Wrap fix_outlier so it can be used as a scikit-learn transformer step.
handle_outlier = preprocessing.FunctionTransformer(func=fix_outlier)


In [4]:
# Preview the first five training rows; the positional column indices used by
# the column transformers refer to this column order.
x_train.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
6893,Mars,True,TRAPPIST-1e,29.0,False,0.0,0.0,0.0,0.0,0.0
6248,Earth,False,PSO J318.5-22,20.0,False,0.0,90.0,561.0,2.0,0.0
5357,Earth,False,TRAPPIST-1e,44.0,False,69.0,0.0,17.0,176.0,2105.0
218,Earth,True,PSO J318.5-22,27.0,False,0.0,,0.0,0.0,0.0
3400,Earth,True,TRAPPIST-1e,4.0,False,0.0,0.0,0.0,0.0,0.0


In [5]:
# --- Step 1: imputation -----------------------------------------------------
# Positions refer to the column order of x_train shown in the preview above:
# 0 HomePlanet, 1 CryoSleep, 2 Destination, 3 Age, 4 VIP,
# 5 RoomService, 6 FoodCourt, 7 ShoppingMall, 8 Spa, 9 VRDeck
RAW_NUMERIC_COLS = [3, 5, 6, 7, 8, 9]
RAW_BOOLEAN_COLS = [1, 4]
RAW_NOMINAL_COLS = [0, 2]

mean_imputer = impute.SimpleImputer(strategy='mean')
mode_imputer = impute.SimpleImputer(strategy='most_frequent')
constant_imputer = impute.SimpleImputer(strategy='constant', fill_value='not_specified')

imputer = compose.make_column_transformer(
    (mean_imputer, RAW_NUMERIC_COLS),
    (mode_imputer, RAW_BOOLEAN_COLS),
    (constant_imputer, RAW_NOMINAL_COLS),
    remainder='passthrough',
)

# --- Step 2: encoding -------------------------------------------------------
# A ColumnTransformer emits its outputs in the order the transformers are
# listed, so after the imputer the layout is:
# 0-5 numeric columns, 6-7 CryoSleep/VIP, 8-9 HomePlanet/Destination.
IMPUTED_BOOLEAN_COLS = [6, 7]
IMPUTED_NOMINAL_COLS = [8, 9]

ordinal_encoder = preprocessing.OrdinalEncoder()
one_hot_encoder = preprocessing.OneHotEncoder(drop='first')

encoding = compose.make_column_transformer(
    (ordinal_encoder, IMPUTED_BOOLEAN_COLS),
    (one_hot_encoder, IMPUTED_NOMINAL_COLS),
    remainder='passthrough',
)

# --- Full pipeline: impute first, then encode -------------------------------
cleaning_pipeline = pipeline.make_pipeline(imputer, encoding)

In [10]:
import joblib

# Serialize the cleaning pipeline to disk so it can be reloaded elsewhere.
# NOTE(review): no fit call appears in the visible cells, so the object saved
# here looks unfitted — confirm that this is intended.
joblib.dump(cleaning_pipeline, filename='clean.joblib')

['clean.joblib']

In [9]:
# To reload the saved pipeline in a later session:
# clean_pipe = joblib.load('clean.joblib')