In [267]:
import pandas as pd
import numpy as np

In [304]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, confusion_matrix

In [287]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [288]:
train_data = pd.read_csv(r'F:\Manipal -DS Course\Raw Data\Datasets\Titanic\train.csv')
train_data.head(2)
#train_data.shape

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [289]:
test_data = pd.read_csv(r'F:\Manipal -DS Course\Raw Data\Datasets\Titanic\test.csv')
test_data.head(2)
#test_data.shape

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [290]:
# Filling Missing Values: [Age Column]
train_data['Age'] = train_data['Age'].fillna(method='ffill')
test_data['Age'] = test_data['Age'].fillna(method='ffill')

In [291]:
p_name = train_data[['PassengerId','Name']]
y_sample = train_data['Survived']

train_drop = train_data.drop(['PassengerId','Survived','Name','Ticket','Fare','Cabin','Embarked'], axis=1)
train_dum = pd.get_dummies(train_drop, drop_first=True)
col = ['Pclass','Sex_male','Age','SibSp','Parch']
data_sample = train_dum.reindex(columns=col)
#data_sample = train_dum[['PassengerId','Pclass','Age','Sex_male','SibSp','Parch']]
data_sample.columns

Index(['Pclass', 'Sex_male', 'Age', 'SibSp', 'Parch'], dtype='object')

In [292]:
test_drop = test_data.drop(['PassengerId','Name','Ticket','Fare','Cabin','Embarked'], axis=1)
test_dum = pd.get_dummies(test_drop, drop_first=True)
col2 = ['Pclass','Sex_male','Age','SibSp','Parch']
test_sample = test_dum.reindex(columns=col2)
#data_sample = data_dum[['PassengerId','Pclass','Age','Sex_male','SibSp','Parch']]
test_sample.columns

Index(['Pclass', 'Sex_male', 'Age', 'SibSp', 'Parch'], dtype='object')

In [293]:
# null_value = train_data.isnull().sum()
# percent = (null_value/len(train_data))*100
# percent

In [298]:
# sns.heatmap(train_data.isnull(), cbar=False )

In [295]:
# mean = round(train_data[train_data['Sex']=='female'].mean(),0)
# mean['Age']
# # train_data['mean_fill'] = train_data['Age'].fillna(28)
# # train_data.to_csv('SAMPLE.csv')

In [296]:
# sns.heatmap(data_sample.corr(), annot= True, cmap='Reds', linewidths=0.2)

In [297]:
# count = pd.value_counts(y_sample)
# count.plot(kind='bar', rot=0)
# plt.xticks(range(2), ['Died', 'Survived'])
# plt.show()

# GridSearchCV:

In [306]:
log = LogisticRegression()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()
bag = BaggingClassifier()
boost = AdaBoostClassifier()

In [300]:
param_rf = {'n_estimators':[50,100,150,200], 'criterion':['gini','entropy'], 'max_depth':[3,4,5,6], 
             'max_features':['auto', 'sqrt', 'log2']} 

param_bag = {'n_estimators':[50,100,150,200]}

param_boost = {'n_estimators':[50,100,150,200], 'learning_rate':[0.2,0.3,0.4,0.5],
               'algorithm':['SAMME','SAMME.R']}

In [307]:
grid_model = GridSearchCV(estimator=boost, param_grid=param_boost, cv=5, verbose=0)
grid_model.fit(data_sample, y_sample)
print('Best Parameters are: ', grid_model.best_params_)
print('Best Training Score: ', grid_model.best_score_*100)

Best Parameters are:  {'algorithm': 'SAMME.R', 'learning_rate': 0.3, 'n_estimators': 50}
Best Training Score:  81.03195028560668


# Random Forrest:83

In [302]:
model_rf= RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=4, max_features='sqrt', random_state=48)
model_rf.fit(data_sample, y_sample)
pred_rf = model_rf.predict(test_sample)
print("Accuracy Score for RandomForestClassifier: ", model_rf.score(test_sample,pred_rf)*100)

Accuracy Score for RandomForestClassifier:  100.0


# Bagging:79.80


In [255]:
model_bag= BaggingClassifier(base_estimator=tree, n_estimators=200, random_state=48)
model_bag.fit(data_sample, y_sample)
pred_bag = model_bag.predict(test_sample)
print("Accuracy Score for BaggingClassifier: ", model_bag.score(test_sample,pred_bag)*100)

Accuracy Score for BaggingClassifier:  100.0


# Boosting:81

In [310]:
model_boost = AdaBoostClassifier(base_estimator=tree, n_estimators=50, learning_rate=0.3, algorithm='SAMME.R',
                                random_state=48)
model_boost.fit(data_sample,y_sample)
pred_boost = model_boost.predict(test_sample)
print("Accuracy Score for AdaBoostClassifier: ", model_boost.score(test_sample, pred_boost)*100)

Accuracy Score for AdaBoostClassifier:  100.0


# Prediction:

In [311]:
output = pd.DataFrame({'PassengerId':test_data['PassengerId'],'Survived': pred_boost})
#output
output.to_csv('naren_submission01.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
