In [593]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [594]:
train = pd.read_csv('train.csv').dropna()

In [595]:
train.shape

(6606, 14)

In [596]:
X_train = train.drop('Transported',axis=1)
X_target = train.Transported

In [597]:
col_n = X_train.select_dtypes(include= np.number)

In [598]:
from sklearn.impute import SimpleImputer

# NUMERICAL DATA

## Approach 1

In [599]:
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(col_n))
imputed_X_train.columns = col_n.columns

In [600]:
imputed_X_train.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [601]:
imputed_X_train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,39.0,0.0,0.0,0.0,0.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0
4,16.0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...
6601,41.0,0.0,6819.0,0.0,1643.0,74.0
6602,18.0,0.0,0.0,0.0,0.0,0.0
6603,26.0,0.0,0.0,1872.0,1.0,0.0
6604,32.0,0.0,1049.0,0.0,353.0,3235.0


## Approach 2

In [602]:
imp_truefalse_train = X_train.copy()

# below line returns the columns having null values
cols_with_missing = [col for col in imp_truefalse_train.columns
                     if imp_truefalse_train[col].isnull().any()]
for col in cols_with_missing:
    imp_truefalse_train[col + '_was_missing'] = imp_truefalse_train[col].isnull()

In [603]:
imp_truefalse_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


# CATEGORCAL DATA

In [604]:
X_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [605]:
X_train.Name.nunique()

6590

In [606]:
low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 15 and 
                        X_train[cname].dtype == "object"]

In [607]:
low_cardinality_cols

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# ORDINAL VARIABLE 
* We dont have any ordinal variable in this data so just try to encode destination (for the sake of understanding the function)

In [608]:
from sklearn.preprocessing import OrdinalEncoder

In [609]:
cat = X_train[low_cardinality_cols]
cat.Destination.value_counts()

TRAPPIST-1e      4576
55 Cancri e      1407
PSO J318.5-22     623
Name: Destination, dtype: int64

In [610]:
oe = OrdinalEncoder()
cat['Destination']  = pd.DataFrame(oe.fit_transform(X_train[['Destination']]))

In [611]:
cat.Destination.value_counts()

2.0    3468
0.0    1073
1.0     458
Name: Destination, dtype: int64

# FOR FINDIND GOOD AND BAD COLUMNS IN THE DATA

In [612]:
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

NameError: name 'X_valid' is not defined

# OneHotEncoder

In [613]:
cat.shape

(6606, 4)

In [614]:
cat.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars'], dtype=object)

In [622]:
cat.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP
0,Europa,False,2.0,False
1,Earth,False,2.0,False
2,Europa,False,2.0,True
3,Europa,False,2.0,False
4,Earth,False,2.0,False


In [615]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [618]:
ohe = OneHotEncoder(sparse=False)

In [626]:
hp_dummies = pd.DataFrame(ohe.fit_transform(cat[['HomePlanet']]))

In [640]:
ohe.categories_[0]

array(['Earth', 'Europa', 'Mars'], dtype=object)

In [642]:
hp_dummies.columns = ohe.categories_[0]

In [643]:
hp_dummies

Unnamed: 0,Earth,Europa,Mars
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
6601,0.0,1.0,0.0
6602,1.0,0.0,0.0
6603,1.0,0.0,0.0
6604,0.0,1.0,0.0
