# 1. Import Libraries

In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
import itertools
import plotly.express as px
import itertools

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingRegressor, VotingClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


# 2. Data Loading and Exploration

Load data from the train and test CSV files.

In [141]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [142]:
# Read the training and test CSV files of the Titanic dataset from the input directory.
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

In [143]:
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

## Data Exploration

### Training data

In [144]:
# Preview the first rows of the training set.
train.head()

In [145]:
# (rows, columns) of the training set.
train.shape

In [146]:
# Column labels of the training set.
train.columns

In [147]:
# Summary statistics of the numeric columns, transposed to one row per column.
train.describe().T

In [148]:
# Summary statistics of the object-dtype columns.
train.describe(include=['O'])

In [149]:
# Column dtypes and non-null counts.
train.info()

---

### Testing data

In [150]:
# Preview the first rows of the test set.
test.head()

In [151]:
# (rows, columns) of the test set.
test.shape

In [152]:
# Column labels of the test set.
test.columns

In [153]:
# Summary statistics of the numeric columns, transposed to one row per column.
test.describe().T

In [154]:
# Summary statistics of the object-dtype columns.
test.describe(include=['O'])

In [155]:
# Column dtypes and non-null counts.
test.info()

### Observation
**Training data**
- Number of columns in train data: 12
- Number of rows in train data: 891

**Testing data**
- Number of columns in test data: 11
- Number of rows in test data: 418

---

# 3. Exploratory Data Analysis

In [156]:
# Name the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='Survived', data=train).set_title("TARGET DISTRIBUTION")

train['Survived'].value_counts()

---

### Correlation Matrix

In [157]:
# Correlate the numeric columns only: string columns cannot be correlated and
# make DataFrame.corr() raise on pandas >= 2.0.
corr = train.select_dtypes(include='number').corr()
fig = px.imshow(corr, text_auto=True, aspect='auto')
fig.show()

### Pclass distribution

Pclass contains the data on ticket class whereby **1- first class, 2- second class, 3- third class**.

In [158]:
# Survival counts split by ticket class, followed by the raw class sizes.
sns.set_style('whitegrid')
sns.countplot(data=train, x='Pclass', hue='Survived')

pclass_counts = train['Pclass'].value_counts()
print(pclass_counts)

### Age distribution

In [159]:
# Age histogram (1-year bins) with a KDE overlay, coloured by survival.
plt.figure(figsize=(10, 4))
hist_options = dict(x='Age', hue='Survived', binwidth=1, kde=True)
sns.histplot(data=train, **hist_options)
plt.title('Age distribution')
plt.xlabel('Age (years)')

### Siblings & spouse on board distribution

In [160]:
# Survival counts by the SibSp column (siblings / spouses aboard).
plt.figure(figsize=(10, 5))

sns.set_style('whitegrid')
sns.countplot(data=train, x='SibSp', hue='Survived')

### Parch distribution

In [161]:
# Survival counts by the Parch column (parents / children aboard).
plt.figure(figsize=(10, 5))

sns.set_style('whitegrid')
sns.countplot(data=train, x='Parch', hue='Survived')

---

### Sex distribution

In [162]:
# Survival counts split by sex.
sns.countplot(data=train, x='Sex', hue='Survived')

### Embarked distribution

In [163]:
# Passenger counts per port (C = Cherbourg, Q = Queenstown, S = Southampton).
# Printed explicitly because only the last expression of a cell is displayed.
print(train['Embarked'].value_counts())

sns.countplot(x='Survived', hue='Embarked', data=train)

---

# 4. Feature Engineering

### Converting Embarked to numerical

In [164]:
# Encode the port of embarkation as an integer code:
# C (Cherbourg) -> 1, S (Southampton) -> 2, Q (Queenstown) -> 3.
embarked_codes = {'C': 1, 'S': 2, 'Q': 3}
train['Embarked'] = train['Embarked'].replace(embarked_codes)
test['Embarked'] = test['Embarked'].replace(embarked_codes)

### Converting Sex to numerical

In [165]:
# Encode sex as an integer code: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
train['Sex'] = train['Sex'].replace(sex_codes)

test['Sex'] = test['Sex'].replace(sex_codes)
test.head()

### Bin age feature into groups

In [166]:
# Bucket Age into labelled groups. Rows with a missing Age match no mask and
# keep NaN. The column is created with object dtype so that the string labels
# can be assigned without an incompatible-dtype assignment into a float column.
age = train['Age']
age_brackets = [
    (age <= 12, '0-12'),
    ((age > 12) & (age < 18), '13-17'),
    ((age >= 18) & (age <= 25), '18-25'),
    ((age > 25) & (age <= 30), '26-30'),
    ((age > 30) & (age <= 50), '31-50'),
    (age > 50, '51+'),
]
train['Age_group'] = pd.Series(np.nan, index=train.index, dtype=object)
for in_bracket, label in age_brackets:
    train.loc[in_bracket, 'Age_group'] = label

In [167]:
# Bucket Age into labelled groups. Rows with a missing Age match no mask and
# keep NaN. The column is created with object dtype so that the string labels
# can be assigned without an incompatible-dtype assignment into a float column.
age = test['Age']
age_brackets = [
    (age <= 12, '0-12'),
    ((age > 12) & (age < 18), '13-17'),
    ((age >= 18) & (age <= 25), '18-25'),
    ((age > 25) & (age <= 30), '26-30'),
    ((age > 30) & (age <= 50), '31-50'),
    (age > 50, '51+'),
]
test['Age_group'] = pd.Series(np.nan, index=test.index, dtype=object)
for in_bracket, label in age_brackets:
    test.loc[in_bracket, 'Age_group'] = label


In [168]:
# Survival counts per age group, with the groups shown in ascending age order.
age_group_order = ['0-12', '13-17', '18-25', '26-30', '31-50', '51+']
plt.figure(figsize=(10, 4))
g = sns.countplot(data=train, x='Age_group', hue='Survived', order=age_group_order)
plt.title('Age group distribution')

### Bin fare feature into ranges

In [169]:
# Bucket Fare into labelled ranges using contiguous, left-closed bins.
# Fare is continuous, so integer-only bounds such as "<= 19" / ">= 20" would
# leave values like 19.5 without a label. Each label covers
# [lower edge, next edge): '0-19' is every fare below 20 and '501+' is every
# fare of 500 or more. Missing fares stay NaN.
# The resulting column has a categorical dtype ordered from cheapest to dearest.
fare_edges = [-np.inf, 20, 40, 60, 80, 100, 200, 300, 400, 500, np.inf]
fare_labels = ['0-19', '20-39', '40-59', '60-79', '80-99', '100-199',
               '200-299', '300-399', '400-499', '501+']
train['Fare_range'] = pd.cut(train['Fare'], bins=fare_edges, labels=fare_labels, right=False)

In [170]:
# Bucket Fare into labelled ranges using contiguous, left-closed bins.
# Fare is continuous, so integer-only bounds such as "<= 19" / ">= 20" would
# leave values like 19.5 without a label. Each label covers
# [lower edge, next edge): '0-19' is every fare below 20 and '501+' is every
# fare of 500 or more. Missing fares stay NaN.
# The resulting column has a categorical dtype ordered from cheapest to dearest.
fare_edges = [-np.inf, 20, 40, 60, 80, 100, 200, 300, 400, 500, np.inf]
fare_labels = ['0-19', '20-39', '40-59', '60-79', '80-99', '100-199',
               '200-299', '300-399', '400-499', '501+']
test['Fare_range'] = pd.cut(test['Fare'], bins=fare_edges, labels=fare_labels, right=False)

In [171]:
# Survival counts per fare range, with the ranges shown from cheapest to dearest.
fare_range_order = ['0-19', '20-39', '40-59', '60-79', '80-99',
                    '100-199', '200-299', '300-399', '400-499', '501+']
plt.figure(figsize=(20, 4))

g = sns.countplot(data=train, x='Fare_range', hue='Survived', order=fare_range_order)
# The chart shows fare ranges, so the title says so.
plt.title('Fare range distribution')

In [None]:
# Joint distribution of Fare and Ticket: count the rows for each (Fare, Ticket)
# pair, pivot Ticket into columns, and use 0 where a pair never occurs.
HPD_gb=train.groupby(['Fare','Ticket'])['Ticket'].size().unstack().fillna(0)

# Heatmap of those counts (after the transpose: Ticket on the rows, Fare on the columns).
# NOTE(review): Fare and Ticket both look high-cardinality, so this annotated
# heatmap is probably unreadable — confirm it is still wanted.
plt.figure(figsize=(10,4))
sns.heatmap(HPD_gb.T, annot=True, fmt='g', cmap='coolwarm')

# 5. Preprocessing

### Missing values

In [None]:
# Missing-value map of the training set: the heatmap is drawn over the boolean null mask.
sns.heatmap(train.isnull(),yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Same missing-value map for the test set.
sns.heatmap(test.isnull(),yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Null count per column in the training set.
train.isnull().sum()

In [None]:
# Null count per column in the test set.
test.isnull().sum()

In [None]:
# Reload the raw test set from the same input path used for the initial load,
# and keep the passenger ids for the submission file.
df_test = pd.read_csv("../input/titanic/test.csv")
test_id = df_test["PassengerId"]

# Median age per ticket class, learned from the training set only.
age_by_pclass = train.groupby('Pclass')['Age'].median()


def impute_age(cols):
    """Return the row's Age, or the training median age of its Pclass when Age is missing.

    cols: a row holding 'Age' and 'Pclass', as passed by DataFrame.apply(axis=1).
    """
    age, pclass = cols['Age'], cols['Pclass']
    # The row arrives as floats, so cast the class back to int for the lookup.
    return age_by_pclass.loc[int(pclass)] if pd.isnull(age) else age


# Impute both frames: train['Age'] is not filled anywhere else in this
# notebook, and several of the classifiers fitted later reject NaN features.
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)
df_test['Age'] = df_test[['Age', 'Pclass']].apply(impute_age, axis=1)

In [None]:
# Drop the Cabin column from both frames. Reassigning with errors='ignore'
# (rather than an in-place drop) keeps the cell safe to re-run.
train = train.drop(columns='Cabin', errors='ignore')
df_test = df_test.drop(columns='Cabin', errors='ignore')

In [None]:
# Re-draw the missing-value map of the training set.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# Missing-value map of the reloaded test frame.
sns.heatmap(df_test.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# Only the last expression of a cell is displayed, so the intermediate result
# is printed explicitly.
train.info()
print(train['Sex'].unique())
train['Embarked'].unique()

In [None]:
# Column dtypes and non-null counts of the reloaded test frame.
df_test.info()

In [None]:
# One-hot encode Sex and Embarked, dropping the first level of each.
# Earlier cells replace the labels in `train` with integer codes, while
# df_test is reloaded with the original strings. Mapping the codes back first
# guarantees the dummy columns are named 'male', 'Q' and 'S' in both frames,
# which is what the column selection further down expects.
sex_labels = {0: 'female', 1: 'male'}
embarked_labels = {1: 'C', 2: 'S', 3: 'Q'}

Sex = pd.get_dummies(train['Sex'].replace(sex_labels), drop_first=True)
Sex_test = pd.get_dummies(df_test['Sex'].replace(sex_labels), drop_first=True)
Embarked = pd.get_dummies(train['Embarked'].replace(embarked_labels), drop_first=True)
Embarked_test = pd.get_dummies(df_test['Embarked'].replace(embarked_labels), drop_first=True)

In [None]:
# Remove the raw text columns from the test frame. errors='ignore' keeps the
# cell safe to re-run.
df_test = df_test.drop(columns=['Name', 'Sex', 'Embarked', 'Ticket'], errors='ignore')
# head must be called; a bare `df_test.head` only shows the bound method.
df_test.head()

In [None]:
# Drop the raw text columns from the training frame in place.
raw_text_columns = ['Name', 'Sex', 'Embarked', 'Ticket']
train.drop(columns=raw_text_columns, inplace=True)
train.head()

In [None]:
# Append the dummy columns to the test frame, side by side.
test_parts = [df_test, Sex_test, Embarked_test]
df_test = pd.concat(test_parts, axis='columns')
df_test.head()

In [None]:
# info must be called; a bare `df_test.info` only shows the bound method.
df_test.info()

In [None]:
# Build the modelling frame from the cleaned training data plus its dummy
# columns. `df` is not defined before this cell, so the concat starts from `train`.
df = pd.concat([train, Sex, Embarked], axis=1)
df.head()

In [None]:
# Keep only the modelling columns, with the Survived target as the last column.
model_columns = ["PassengerId", "Pclass", "Age", "SibSp", "Parch", "Fare",
                 "male", "Q", "S", "Survived"]
df = df.loc[:, model_columns]
df.head()

In [None]:
# Baseline: fit each classifier with its default settings on an 80/20 split
# and record its accuracy on the hold-out part.
y = df.Survived
x = df.drop('Survived', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

models = [
    LinearSVC(),
    SVC(kernel='rbf'),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
]
model_names = ['LinearSVM', 'rbfSVM', 'KNeighbors', 'RandoomForestClassifier',
               'DecisionTree', 'GradientBoostingClassifier', 'GaussianNb']
accu = []

# `model` is the position within `models`; `clf` is the estimator at that position.
for model, clf in enumerate(models):
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    accu.append(accuracy_score(pred, y_test))

# From here on `models` holds the table data rather than the estimator list.
models = {'model': model_names, 'Accuracy': accu}
model_df = pd.DataFrame(models)

model_df

In [None]:
def feature_scaling(x_train, x_test, y_train, y_test, name_scaler):
    """Score the baseline classifiers on an already-scaled train/test split.

    Each classifier is fitted on (x_train, y_train) and its accuracy on
    (x_test, y_test) is stored in the global ``model_df`` as a new column
    named ``name_scaler``.

    Parameters:
        x_train, x_test: scaled feature matrices.
        y_train, y_test: matching target vectors.
        name_scaler: label of the column added to ``model_df``.

    Returns:
        None. ``model_df`` is modified in place.
    """
    # Same estimators, in the same order, as the baseline cell that builds
    # model_df, so the scores line up with its rows.
    models = [LinearSVC(), SVC(kernel='rbf'), KNeighborsClassifier(), RandomForestClassifier(),
              DecisionTreeClassifier(), GradientBoostingClassifier(), GaussianNB()]
    accu_score = []
    for clf in models:
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        accu_score.append(accuracy_score(y_test, pred))
    model_df[name_scaler] = np.array(accu_score)
    

In [None]:
# Compare scalers. The data is split first and each scaler is fitted on the
# training rows only, so neither the hold-out rows nor the Survived target
# leak into the scaling statistics.
scalers = [MinMaxScaler(), StandardScaler()]
names = ['Accuracy_MinMax', 'Accuracy_Standard']
x = df.drop('Survived', axis=1)
Y = df.Survived.to_numpy()
for name, scaler in zip(names, scalers):
    x_train, x_test, y_train, y_test = train_test_split(x, Y, test_size=0.2, random_state=42)
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    feature_scaling(x_train, x_test, y_train, y_test, name)
model_df

In [None]:
# Fresh 80/20 split and an unfitted MinMaxScaler for the tuning experiments.
x, y = df.drop(columns='Survived'), df['Survived']
scaler = MinMaxScaler()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Fit the scaler on the training rows only, then apply it to the hold-out rows.
x_train = scaler.fit(x_train).transform(x_train)
x_test = scaler.transform(x_test)
# Parallel lists filled by the evaluation cells: model label and hold-out accuracy.
model = []
test_accuracy = []

In [None]:
#KNN model

# Randomly sample n_neighbors from 1..50 with 10-fold cross-validation.
# n_jobs is an execution setting, not a hyper-parameter, so it is set on the
# estimator instead of being searched. random_state makes the sampled
# candidates reproducible.
params_dict = {'n_neighbors': list(range(1, 51))}
Knn_clf = RandomizedSearchCV(estimator=KNeighborsClassifier(n_jobs=-1), param_distributions=params_dict,
                             scoring='accuracy', cv=10, random_state=42)
Knn_clf.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the search.
Knn_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
Knn_clf.best_score_

In [None]:
# Accuracy of the tuned model on the hold-out split.
pred=Knn_clf.predict(x_test)
accuracy=accuracy_score(y_test, pred)

# Record the result for the final comparison table.
# NOTE(review): re-running this cell appends a duplicate entry.
model.append('KNN')
test_accuracy.append(accuracy)
print("KNN Accuracy:", accuracy)

In [None]:
#SVM model

# Randomly sample C / gamma / kernel combinations with 10-fold cross-validation.
# random_state makes the sampled candidates reproducible.
params_dict = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['linear', 'rbf']}
svm_clf = RandomizedSearchCV(estimator=SVC(), param_distributions=params_dict,
                             scoring='accuracy', cv=10, random_state=42)
svm_clf.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the search.
svm_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
svm_clf.best_score_

In [None]:
# Accuracy of the tuned model on the hold-out split.
pred=svm_clf.predict(x_test)
accuracy=accuracy_score(y_test, pred)

# Record the result for the final comparison table.
# NOTE(review): re-running this cell appends a duplicate entry.
model.append('SVM')
test_accuracy.append(accuracy)
print("SVM Accuracy:", accuracy)

In [None]:
# decision tree model

# Randomly sample the split criterion and the maximum depth with 10-fold
# cross-validation. Both the tree and the search are seeded so the result is
# reproducible.
params_dict = {'criterion': ['gini', 'entropy'],
               'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}
dt_clf = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_distributions=params_dict,
                            scoring='accuracy', cv=10, random_state=42)
dt_clf.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the search.
dt_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
dt_clf.best_score_

In [None]:
# Accuracy of the tuned model on the hold-out split.
pred=dt_clf.predict(x_test)
accuracy=accuracy_score(y_test, pred)

# Record the result for the final comparison table.
# NOTE(review): re-running this cell appends a duplicate entry.
model.append('Decision Tree')
test_accuracy.append(accuracy)
print("Decision Tree Accuracy", accuracy)

In [None]:
#random forest model

# Randomly sample the forest size and max_features with 10-fold cross-validation.
# 'auto' is not searched: for classifiers it was an alias of 'sqrt' and recent
# scikit-learn releases no longer accept it. Both the forest and the search
# are seeded so the result is reproducible.
params_dict = {'n_estimators': [100, 200, 300, 400, 500], 'max_features': ['sqrt', 'log2']}
rf_clf = RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                            param_distributions=params_dict, scoring='accuracy', cv=10, random_state=42)
rf_clf.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the search.
rf_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
rf_clf.best_score_

In [None]:
# Accuracy of the tuned model on the hold-out split.
pred=rf_clf.predict(x_test)
accuracy=accuracy_score(y_test, pred)

# Record the result for the final comparison table.
# NOTE(review): re-running this cell appends a duplicate entry.
model.append('Random Forest')
test_accuracy.append(accuracy)
print("Random Forest Accuracy:", accuracy)

In [None]:
#gradient boosting model

# Search the number of boosting stages with 10-fold cross-validation.
# scoring is set explicitly to match the other searches, and both the model
# and the search are seeded so the result is reproducible.
params_dict = {'n_estimators': [100, 200, 300, 400, 500, 600, 700]}
gb_clf = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                            param_distributions=params_dict, scoring='accuracy', cv=10, random_state=42)
gb_clf.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the search.
gb_clf.best_params_

In [None]:
# Mean cross-validated score of that candidate.
gb_clf.best_score_

In [None]:
# Accuracy of the tuned model on the hold-out split.
pred = gb_clf.predict(x_test)
accuracy = accuracy_score(y_test, pred)

# Record the result for the final comparison table (label spelled correctly).
# NOTE(review): re-running this cell appends a duplicate entry.
model.append('GradientBoosting')
test_accuracy.append(accuracy)
print("GradientBoosting Accuracy:", accuracy)

In [None]:
#adaboost classifier

# Grid-search the number of boosting rounds with 10-fold cross-validation.
# `algorithm` is left at the library default because the 'SAMME.R' option is
# deprecated in recent scikit-learn releases. The base tree and the booster
# are seeded so the result is reproducible.
# NOTE(review): max_depth=100 gives a near-unrestricted base tree, which is an
# unusual weak learner for boosting — confirm this is intended.
params_dict = {'n_estimators': list(range(1, 201, 20))}
ada_base_tree = DecisionTreeClassifier(criterion='entropy', max_depth=100, random_state=42)
ada_clf = GridSearchCV(AdaBoostClassifier(ada_base_tree, random_state=42),
                       param_grid=params_dict, scoring='accuracy', cv=10)
ada_clf.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the search.
ada_clf.best_params_

In [None]:
# Mean cross-validated score of that candidate.
ada_clf.best_score_

In [None]:
# Accuracy of the tuned model on the hold-out split.
pred=ada_clf.predict(x_test)
accuracy=accuracy_score(y_test, pred)

# Record the result for the final comparison table.
# NOTE(review): re-running this cell appends a duplicate entry.
model.append('AdaboostClassifier')
test_accuracy.append(accuracy)
print("AdaBoost Classifier accuracy:",accuracy)

In [None]:
#final evaluation
# Comparison table: one row per recorded model label with its hold-out accuracy.
final=pd.DataFrame({'model':model,'Test Accuracy': test_accuracy})

In [None]:
# Display the comparison table.
final

In [None]:
df_test.isnull().any() # check for null values

In [None]:
df_test['Fare']=df_test['Fare'].fillna((df_test['Fare'].mean())) # replace missing fares with the mean fare of df_test

In [None]:
df_test.isnull().any() # check again after the fill

In [None]:
# Predict with the tuned search that has the best cross-validated score.
# A bare `clf` at this point is only whatever estimator the baseline loop
# left behind, fitted on unscaled features, so none of the tuning would be used.
tuned_searches = [Knn_clf, svm_clf, dt_clf, rf_clf, gb_clf, ada_clf]
best_search = max(tuned_searches, key=lambda search: search.best_score_)

# The tuned models were fitted on scaled features: put df_test in the training
# column order and pass it through the same fitted scaler before predicting.
submission_features = scaler.transform(df_test[x.columns])
submission_pred = best_search.predict(submission_features)

# NOTE: `df` is rebound to the submission table from here on.
df = pd.DataFrame({"PassengerId": test_id.values, "Survived": submission_pred})

In [None]:
df.to_csv("Submission.csv",index=False) # save the predictions to a CSV file, without the index column

In [None]:
# Read the file back and show its column dtypes and non-null counts.
df=pd.read_csv('Submission.csv')
df.info()

In [None]:
# Display the submission table as read back from disk.
df