In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pickle


In [2]:
url = "datasets/titanic_data.csv"
data = pd.read_csv(url)

In [3]:
df = data.copy()
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
df.shape

(891, 12)

In [6]:
# file 2
df2 = pd.read_csv('./datasets/titanic.csv')
df2.tail(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [7]:
df2.shape

(891, 15)

In [8]:
df2.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [10]:
# features of interest 

In [11]:
cols = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone']

cols_to_keep = ['survived', 'pclass', 'sex', 'age', 'fare', 'embarked', 'class']

df2 = df2[cols_to_keep]

In [12]:
df2.dtypes

survived      int64
pclass        int64
sex          object
age         float64
fare        float64
embarked     object
class        object
dtype: object

In [13]:
# remove Nans
df2.isnull().sum()

survived      0
pclass        0
sex           0
age         177
fare          0
embarked      2
class         0
dtype: int64

In [14]:
df2["age"] = df2.age.fillna(df2.age.mean())

In [15]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       891 non-null    float64
 4   fare      891 non-null    float64
 5   embarked  889 non-null    object 
 6   class     891 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 48.9+ KB


In [16]:
df2.isnull().sum()

survived    0
pclass      0
sex         0
age         0
fare        0
embarked    2
class       0
dtype: int64

In [17]:
df2.embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [18]:
df2.embarked.dropna(inplace=True)

In [19]:
df2[df2["embarked"] == np.nan]

Unnamed: 0,survived,pclass,sex,age,fare,embarked,class


In [20]:
df2[df2["embarked"] == np.nan]

Unnamed: 0,survived,pclass,sex,age,fare,embarked,class


In [21]:
df2=df2.dropna(subset=['embarked'])

In [22]:
df2.embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [23]:
df2.head(3)

Unnamed: 0,survived,pclass,sex,age,fare,embarked,class
0,0,3,male,22.0,7.25,S,Third
1,1,1,female,38.0,71.2833,C,First
2,1,3,female,26.0,7.925,S,Third


In [24]:
df2.columns

Index(['survived', 'pclass', 'sex', 'age', 'fare', 'embarked', 'class'], dtype='object')

In [25]:
# train test split 

In [26]:
X = df2.drop("survived", axis=1)
y = df2.survived

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Using train_test_split to Split Data into Training and Testing Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)

In [29]:
# df = pd.DataFrame(y)
# print(df.value_counts())

In [30]:
df2.dtypes

survived      int64
pclass        int64
sex          object
age         float64
fare        float64
embarked     object
class        object
dtype: object

In [31]:
# sex, embarked and class columns are objects, they will be encoded

In [32]:
# Encoders 
from sklearn.preprocessing import LabelEncoder
le_sex = LabelEncoder()
X_train['sex'] = le_sex.fit_transform(X_train['sex'])
X_train['sex'].unique()

array([0, 1])

In [33]:
le_embarked = LabelEncoder()
X_train['embarked'] = le_embarked.fit_transform(X_train['embarked'])
X_train['embarked'].unique()

array([0, 2, 1])

In [34]:
le_class = LabelEncoder()
X_train['class'] = le_class.fit_transform(X_train['class'])
X_train['class'].unique()

array([2, 0, 1])

In [35]:
# le_alone = LabelEncoder()
# X_train['alone'] = le_alone.fit_transform(X_train['alone'])
# X_train['alone'].unique()

In [36]:
# Regression. Random Forest Regressor 

In [37]:
from sklearn.ensemble import RandomForestRegressor
rand_forest_regressor = RandomForestRegressor()

In [38]:
rand_forest_regressor.fit(X_train,y_train.values)

In [39]:
rand_forest_regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [40]:
y_pred = rand_forest_regressor.predict(X_train)

In [41]:
from sklearn.metrics import mean_squared_error

error = np.sqrt(mean_squared_error(y_train, y_pred))
print("{:.02f}".format(error))

0.17


# GridCVSearch

In [42]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

regressor =  DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring="neg_mean_squared_error")
gs.fit(X_train, y_train.values)

In [43]:
regresor_best = gs.best_estimator_

regresor_best.fit(X_train, y_train.values)
y_pred = regresor_best.predict(X_train)
error = np.sqrt(mean_squared_error(y_train, y_pred))
print("{:.02f}".format(error))

0.34


## Testing with test set

In [44]:
X_test.head()

Unnamed: 0,pclass,sex,age,fare,embarked,class
811,3,male,39.0,24.15,S,Third
514,3,male,24.0,7.4958,S,Third
786,3,female,18.0,7.4958,S,Third
591,1,female,52.0,78.2667,C,First
561,3,male,40.0,7.8958,S,Third


In [45]:
X_test['sex'] = le_sex.fit_transform(X_test['sex'])
X_test['embarked'] = le_embarked.fit_transform(X_test['embarked'])
X_test['class'] = le_class.fit_transform(X_test['class'])
# X_test['alone'] = le_alone.fit_transform(X_test['alone'])

In [46]:
y_pred = regresor_best.predict(X_test)

In [47]:
error = np.sqrt(mean_squared_error(y_test, y_pred))
print("{:.02f}".format(error))

0.37


In [48]:
# random forest model 
y_pred_2 = rand_forest_regressor.predict(X_test)

In [49]:
error = np.sqrt(mean_squared_error(y_test, y_pred_2))
print("{:.02f}".format(error))

0.38


In [50]:
# new value

In [51]:
X_test.columns

Index(['pclass', 'sex', 'age', 'fare', 'embarked', 'class'], dtype='object')

In [52]:
# 'pclass', 'sex', 'age', 'fare', 'embarked', 'class'

new_value = [2, 'female', 35, 70, 'S', 'First']

In [53]:
new_value

[2, 'female', 35, 70, 'S', 'First']

In [54]:
# convert to an array
nw_val_arr = np.array([new_value]) 

In [55]:
nw_val_arr

array([['2', 'female', '35', '70', 'S', 'First']], dtype='<U11')

In [56]:
print(nw_val_arr[:,0])
print(nw_val_arr[:,1])
print(nw_val_arr[:,2])
print(nw_val_arr[:,3])
print(nw_val_arr[:,4])
print(nw_val_arr[:,5])

['2']
['female']
['35']
['70']
['S']
['First']


In [57]:
# do the label encoding 
nw_val_arr[:,1] = le_sex.transform(nw_val_arr[:,1])
nw_val_arr[:,4] = le_embarked.transform(nw_val_arr[:,4])
nw_val_arr[:,5] = le_class.transform(nw_val_arr[:,5])
nw_val_arr = nw_val_arr.astype(float)
nw_val_arr

array([[ 2.,  0., 35., 70.,  2.,  0.]])

In [58]:
# predict

In [59]:
y_pred_3 = rand_forest_regressor.predict(nw_val_arr)



In [60]:
y_pred_3

array([1.])

In [61]:
# 1 means survived, 0 means died