In [111]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Loading Training and Test data

In [112]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [113]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [114]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [115]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [116]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Cleaning Training and Test Data

In [117]:
# Since Passenger Id is not significant we are dropping it from both test and train data 

In [118]:
df_train.drop('PassengerId',axis=1,inplace=True)
df_test.drop('PassengerId',axis=1,inplace=True)

In [119]:
#Checking for any null values in our datasets

In [120]:
df_train.isna().sum() #TRAIN DATA

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [121]:
df_test.isna().sum() #TEST DATA

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [122]:
# We see more than 70% data is Nan or Null for Cabin in both train and Test data 
# So we can drop Cabin as well from the datasets 

In [123]:
df_train.drop('Cabin',axis=1,inplace=True)  #TRAIN
df_test.drop('Cabin',axis=1,inplace=True)   #TEST 

In [124]:
# For Embarked column we have 2 missing values in Train data set so we will just impute these 2 values with 
#the most frequently occuring value in Embarked column
# No need in test data as there is no missing value in it for Embarked 

In [125]:
df_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [126]:
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [127]:
df_train['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [128]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [129]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 29.5+ KB


# Handling Categoriocal Variables

In [130]:
# Converting male to 0 and female as 1 

In [131]:
df_train.loc[df_train['Sex'] == 'male','Sex'] = 0    #TRAIN
df_train.loc[df_train['Sex'] == 'female','Sex']=1 
df_train['Sex'] = df_train['Sex'].astype(np.int64) 

In [132]:
df_test.loc[df_test['Sex'] == 'male','Sex'] = 0   #TEST
df_test.loc[df_test['Sex'] == 'female','Sex']=1 
df_test['Sex'] = df_test['Sex'].astype(np.int64) 

# NOTE :
-->whenever we change it data using assignment operator it  gets converted to 'Object' type so we use astype
   to convert it into int


In [133]:
# Now we see that Ticket also has more than 80% unique values which wont add any significance 
# So we will drop them 

In [134]:
print(df_train['Ticket'].nunique())
print(df_test['Ticket'].nunique())

681
363


In [135]:
df_train.drop('Ticket',axis=1,inplace=True)    #TRAIN 
df_test.drop('Ticket',axis=1,inplace=True)     #TEST

# Using One Hot Encoding for [Embarked and Name]

In [136]:
from sklearn.preprocessing import OneHotEncoder

In [137]:
# EMBARKED For TRAIN dataset

In [138]:
e_encoder = OneHotEncoder(sparse=False)   # Object creation
e_encoder.fit(df_train[['Embarked']])     # Fitting the object
e_one_hot_np = e_encoder.transform(df_train[['Embarked']])   # Transforming the object
e_one_hot_np = e_one_hot_np.astype(np.int64)   # Converting datatype to int from  Object
e_one_hot_df = pd.DataFrame(e_one_hot_np,columns=['Emb_1','Emb_2','Emb_3'])  #Embarked dataframe created
df_train = pd.concat([df_train,e_one_hot_df],axis=1)  # concatenating the EMbarked datframe with train dataframe
df_train.drop('Embarked',axis=1,inplace=True)   # dropping Embarked column

In [139]:
# EMBARKED For TEST dataset

In [140]:
et_encoder = OneHotEncoder(sparse=False)
et_encoder.fit(df_test[['Embarked']])
et_one_hot_np = et_encoder.transform(df_test[['Embarked']])
et_one_hot_np = et_one_hot_np.astype(np.int64)
et_one_hot_df = pd.DataFrame(et_one_hot_np,columns=['Emb_1','Emb_2','Emb_3'])  #Embarked dataframe created
df_test = pd.concat([df_test,et_one_hot_df],axis=1)
df_test.drop('Embarked',axis=1,inplace=True)

In [141]:
# NAME for Train dataset

In [142]:
df_train.loc[df_train["Name"].str.contains("Miss. "), "Name"] = "Miss"
df_train.loc[df_train["Name"].str.contains("Mrs. "), "Name"] = "Mrs"
df_train.loc[df_train["Name"].str.contains("Ms. "), "Name"] = "Ms"
df_train.loc[df_train["Name"].str.contains("Master. "), "Name"] = "Master"
df_train.loc[df_train["Name"].str.contains("Mr. |Mr "), "Name"] = "Mr"

print(df_train["Name"].value_counts())

Mr                                                          520
Miss                                                        182
Mrs                                                         125
Master                                                       40
Harper, Rev. John                                             1
Sagesser, Mlle. Emma                                          1
Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)      1
Pain, Dr. Alfred                                              1
Stahelin-Maeglin, Dr. Max                                     1
Kirkland, Rev. Charles Leonard                                1
Uruchurtu, Don. Manuel E                                      1
Leader, Dr. Alice (Farnham)                                   1
Brewe, Dr. Arthur Jackson                                     1
Weir, Col. John                                               1
Reuchlin, Jonkheer. John George                               1
Carter, Rev. Ernest Courtenay           

In [143]:
# Name for TEST data set

In [144]:
df_test.loc[df_test['Name'].str.contains('Miss. '),'Name'] = 'Miss'
df_test.loc[df_test["Name"].str.contains("Mrs. "),'Name'] = "Mrs"
df_test.loc[df_test["Name"].str.contains("Ms. "),'Name'] = "Ms"
df_test.loc[df_test["Name"].str.contains("Master. "),'Name'] = "Master"
df_test.loc[df_test["Name"].str.contains("Mr. "),'Name'] = "Mr"

print(df_test["Name"].value_counts())

Mr                               240
Miss                              78
Mrs                               72
Master                            21
Dodge, Dr. Washington              1
Ms                                 1
Astor, Col. John Jacob             1
Peruschitz, Rev. Joseph Maria      1
Lahtinen, Rev. William             1
Oliva y Ocana, Dona. Fermina       1
Gracie, Col. Archibald IV          1
Name: Name, dtype: int64


In [145]:
# Now we have a Ms value in both train and test dataset so we will replace that with Miss

In [146]:
df_train.loc[df_train["Name"].str.contains("Ms"), "Name"] = "Miss"    #TRAIN
print(df_train["Name"].value_counts())

Mr                                                          520
Miss                                                        183
Mrs                                                         125
Master                                                       40
Harper, Rev. John                                             1
Carter, Rev. Ernest Courtenay                                 1
Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)      1
Pain, Dr. Alfred                                              1
Stahelin-Maeglin, Dr. Max                                     1
Kirkland, Rev. Charles Leonard                                1
Uruchurtu, Don. Manuel E                                      1
Leader, Dr. Alice (Farnham)                                   1
Brewe, Dr. Arthur Jackson                                     1
Weir, Col. John                                               1
Reuchlin, Jonkheer. John George                               1
Bateman, Rev. Robert James              

In [147]:
df_test.loc[df_test["Name"].str.contains("Ms"), "Name"] = "Miss"   #TEST
print(df_test["Name"].value_counts())

Mr                               240
Miss                              79
Mrs                               72
Master                            21
Dodge, Dr. Washington              1
Astor, Col. John Jacob             1
Peruschitz, Rev. Joseph Maria      1
Lahtinen, Rev. William             1
Oliva y Ocana, Dona. Fermina       1
Gracie, Col. Archibald IV          1
Name: Name, dtype: int64


In [148]:
#Apart from Mr,Mrs,Miss,Master we have individul names who doesn't have a title for their name .
# So we will club them together and into NaN values first 
# Then we will sort them replace them with any of the four categories based on then (Sex and Age) accordingly


In [149]:
df_train.loc[~df_train["Name"]. \
            str.contains("Miss|Mrs|Master|Mr"), "Name"] = "NaN"
print(df_train["Name"].value_counts())

Mr        520
Miss      183
Mrs       125
Master     40
NaN        23
Name: Name, dtype: int64


In [150]:
df_test.loc[~df_test["Name"].str.contains("Miss|Mrs|Master|Mr",na=True), "Name"] = "NaN"
print(df_test["Name"].value_counts())

Mr        240
Miss       79
Mrs        72
Master     21
NaN         6
Name: Name, dtype: int64


In [151]:
# we are considering a threshold of 25 years so 
# anyone >25 years will be Mr for male and Mrs for female
# anyone <= 25 years will be Master for male and Miss for female

In [152]:
df_train.loc[(df_train["Name"] == "NaN") & ((df_train["Age"]>25) |(df_train["Age"] == "NaN"))&
            (df_train["Sex"] == 1), "Name"] = "Mrs"
df_train.loc[(df_train["Name"] == "NaN") & ((df_train["Age"]>25) |(df_train["Age"] == "NaN"))&
            (df_train["Sex"] == 0), "Name"] = "Mr"
df_train.loc[(df_train["Name"] == "NaN") & ((df_train["Age"]<25) |(df_train["Age"] == "NaN"))&
            (df_train["Sex"] == 1), "Name"] = "Miss"
df_train.loc[(df_train["Name"] == "NaN") & ((df_train["Age"]<25) |(df_train["Age"] == "NaN"))&
            (df_train["Sex"] == 0), "Name"] = "Master"
print(df_train["Name"].value_counts())

Mr        537
Miss      185
Mrs       127
Master     41
NaN         1
Name: Name, dtype: int64


  res_values = method(rvalues)


In [153]:
df_test.loc[(df_test["Name"] == "NaN") & ((df_test["Age"]>25) |(df_test["Age"] == "NaN"))&
            (df_test["Sex"] == 1), "Name"] = "Mrs"
df_test.loc[(df_test["Name"] == "NaN") & ((df_test["Age"]>25) |(df_test["Age"] == "NaN"))&
            (df_test["Sex"] == 0), "Name"] = "Mr"
df_test.loc[(df_test["Name"] == "NaN") & ((df_test["Age"]<25) |(df_test["Age"] == "NaN"))&
            (df_test["Sex"] == 1), "Name"] = "Miss"
df_test.loc[(df_test["Name"] == "NaN") & ((df_test["Age"]<25) |(df_test["Age"] == "NaN"))&
            (df_test["Sex"] == 0), "Name"] = "Master"
print(df_test["Name"].value_counts())

Mr        245
Miss       79
Mrs        73
Master     21
Name: Name, dtype: int64


In [154]:
#Still we see 1 NaN for TRAIN dataset

In [155]:
df_train[df_train["Name"]=='NaN']  # finding the index location we see its age is as well NaN

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Emb_1,Emb_2,Emb_3
766,0,1,,0,,0,0,39.6,1,0,0


In [156]:
df_train.iloc[766,2] = 'Mr' 
# we are repalcing that name with Mr directly since it is Male and has not survived as chances of 
# male survival are less and this index has 0 as survived value

In [157]:
print(df_train["Name"].value_counts())

Mr        538
Miss      185
Mrs       127
Master     41
Name: Name, dtype: int64


In [158]:
# Encoding Name for TRAIN dataset

In [159]:
n_encoder = OneHotEncoder(sparse=False)
n_encoder.fit(df_train[['Name']])
name_one_hot_np = n_encoder.transform(df_train[['Name']])
name_one_hot_np = name_one_hot_np.astype(np.int64)
name_one_hot_np
name_one_hot_df = pd.DataFrame(name_one_hot_np,columns=['Name_1','Name_2','Name_3','Name_4'])
df_train = pd.concat([df_train,name_one_hot_df],axis=1)
df_train.drop('Name',axis=1,inplace=True)

In [160]:
# Encoding Name for TEST dataset

In [161]:
nt_encoder = OneHotEncoder(sparse=False)
nt_encoder.fit(df_test[['Name']])
namet_one_hot_np = nt_encoder.transform(df_test[['Name']])
namet_one_hot_np = namet_one_hot_np.astype(np.int64)
namet_one_hot_df = pd.DataFrame(namet_one_hot_np,columns=['Name_1','Name_2','Name_3','Name_4'])
df_test = pd.concat([df_test,namet_one_hot_df],axis=1)
df_test.drop('Name',axis=1,inplace=True)

# Regressor using XGBoost to impute AGE values 

In [162]:
# For TRAIN data

In [163]:
reg_age = df_train.copy()
reg_age_train_X = reg_age[reg_age['Age'].notnull().copy()]
reg_age_train_Y = reg_age_train_X[['Age']]
reg_age_train_X.drop("Age", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [164]:
# For Test data

In [165]:
reg_age_test = df_test.copy()
reg_age_train_X_test = reg_age_test[reg_age_test['Age'].notnull().copy()]
reg_age_train_Y_test = reg_age_train_X_test[['Age']]
reg_age_train_X_test.drop("Age", axis=1, inplace=True)

In [166]:
import xgboost as xgb
from xgboost import plot_importance

In [167]:
# For TRAIN data
reg = xgb.XGBRegressor(colsample_bylevel = 0.7,
                      colsample_bytree = 0.5,
                      learning_rate =0.3,
                      max_depth =5,
                      min_child_weight = 1.5,
                      n_estimators = 18,
                      subsample =0.9)

In [168]:
# For TEST data
reg_test = xgb.XGBRegressor(colsample_bylevel = 0.7,
                      colsample_bytree = 0.5,
                      learning_rate =0.3,
                      max_depth =5,
                      min_child_weight = 1.5,
                      n_estimators = 18,
                      subsample =0.9)

In [169]:
reg.fit(reg_age_train_X,reg_age_train_Y) # train model fitting

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=5,
             min_child_weight=1.5, missing=nan, monotone_constraints='()',
             n_estimators=18, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [170]:
reg_test.fit(reg_age_train_X_test,reg_age_train_Y_test)  #test model fitting

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=5,
             min_child_weight=1.5, missing=nan, monotone_constraints='()',
             n_estimators=18, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [171]:
age_pred = reg.predict(df_train.loc[:,df_train.columns !='Age'])
age_pred_test = reg_test.predict(df_test.loc[:,df_test.columns !='Age'])

In [172]:
df_train.loc[df_train["Age"].isnull(),"Age"] = age_pred[df_train["Age"].isnull()]
df_test.loc[df_test["Age"].isnull(),"Age"] = age_pred_test[df_test["Age"].isnull()]

In [173]:
df_train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Emb_1       0
Emb_2       0
Emb_3       0
Name_1      0
Name_2      0
Name_3      0
Name_4      0
dtype: int64

In [174]:
df_test.isnull().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      1
Emb_1     0
Emb_2     0
Emb_3     0
Name_1    0
Name_2    0
Name_3    0
Name_4    0
dtype: int64

In [175]:
# We see there is a null value in Fare column
#We will find the Pclass for that index and then impute it with the average fare value for that Pclass

In [176]:
df_test[df_test['Fare'].isnull()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_1,Emb_2,Emb_3,Name_1,Name_2,Name_3,Name_4
152,3,0,60.5,0,0,,0,0,1,0,0,1,0


In [177]:
#Finding avg Pcalss=3 fare and then imputing it at Nan value
df_test.loc[df_test['Pclass'] == 3,'Fare'].sum()/len(df_test.loc[df_test['Pclass'] == 3,'Fare']) 

12.402523394495415

In [178]:
df_test['Fare'].fillna(12.40,inplace=True)

In [179]:
df_test.isnull().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
Emb_1     0
Emb_2     0
Emb_3     0
Name_1    0
Name_2    0
Name_3    0
Name_4    0
dtype: int64

In [180]:
df_test.loc[152,:]

Pclass     3.0
Sex        0.0
Age       60.5
SibSp      0.0
Parch      0.0
Fare      12.4
Emb_1      0.0
Emb_2      0.0
Emb_3      1.0
Name_1     0.0
Name_2     0.0
Name_3     1.0
Name_4     0.0
Name: 152, dtype: float64

In [181]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_1,Emb_2,Emb_3,Name_1,Name_2,Name_3,Name_4
0,0,3,0,22.0,1,0,7.25,0,0,1,0,0,1,0
1,1,1,1,38.0,1,0,71.2833,1,0,0,0,0,0,1
2,1,3,1,26.0,0,0,7.925,0,0,1,0,1,0,0
3,1,1,1,35.0,1,0,53.1,0,0,1,0,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1,0,0,1,0


In [182]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_1,Emb_2,Emb_3,Name_1,Name_2,Name_3,Name_4
0,3,0,34.5,0,0,7.8292,0,1,0,0,0,1,0
1,3,1,47.0,1,0,7.0,0,0,1,0,0,0,1
2,2,0,62.0,0,0,9.6875,0,1,0,0,0,1,0
3,3,0,27.0,0,0,8.6625,0,0,1,0,0,1,0
4,3,1,22.0,1,1,12.2875,0,0,1,0,0,0,1
