# Introduction

In [None]:
# Importations

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_theme() 


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  classification_report
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn import model_selection
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier


In [None]:
# Load the train and test CSV files.
# Column 1 of each file becomes the row index. Malformed rows are skipped with
# a warning instead of raising: `on_bad_lines='warn'` (pandas >= 1.3) replaces
# `error_bad_lines=False`, which was deprecated in 1.3 and removed in 2.0.

train = pd.read_csv('../input/train.csv',
                    sep=',', header=0, index_col=1, on_bad_lines='warn')


valid = pd.read_csv('../input/test.csv',
                    sep=',', header=0, index_col=1, on_bad_lines='warn')


In [None]:
# Region used below to filter both the train and the test rows.
# The value is a `{{cookiecutter.*}}` template placeholder — presumably
# substituted when the project is generated; confirm before running as-is.
# Example of a concrete value: region='France'
region='{{cookiecutter.region}}'

In [None]:
# Inspect the raw training data, then keep only the rows of the selected
# region. The filter and the column drop are chained so that `train` is a new
# frame rather than a slice modified in place, which avoids pandas'
# SettingWithCopyWarning when a column is added to it later on.
train.info()
train = train[train['region'] == region].drop(columns=['region'])
train.head()

In [None]:
# Inspect the raw test data, then keep only the rows of the selected region.
# The filter and the column drop are chained so that `valid` is a new frame
# rather than a slice modified in place, which avoids pandas'
# SettingWithCopyWarning when a column is added to it later on.
valid.info()
valid = valid[valid['region'] == region].drop(columns=['region'])
valid.head()

In [None]:
# We can see some missing values in both files


In [None]:
# Tag every row with the frame it came from, so the two sets can be separated
# again after the shared preprocessing, then stack them row-wise.

for frame, origin in ((train, 'train'), (valid, 'valid')):
    frame['df'] = origin

df = pd.concat([train, valid], axis='index')


In [None]:
# Summary of the combined frame: column dtypes and non-null counts.
df.info()



In [None]:

# Cabin, Name and Ticket are not used as features, so they are removed.

unused_features = ['Cabin', 'Name', 'Ticket']
df = df.drop(columns=unused_features)

# Statistics

In [None]:
# Numeric columns: print the summary statistics of each one

num_col = ['Age', 'Fare']
for name in num_col:
    summary = df[name].describe()
    print(name, ":", summary)


In [None]:
# Remaining columns (everything not listed in num_col): print value counts

other_cols = [name for name in df.columns if name not in num_col]
for name in other_cols:
    print(name, ":", df[name].value_counts())

In [None]:
# Re-check dtypes and non-null counts now that Cabin, Name and Ticket are gone.
df.info()

In [None]:
# Number of passengers per survival outcome, training rows only.
# NOTE(review): this figure is written under '../../output/' whereas the other
# plot cells use '../../../output/' — confirm which directory is intended.
survival_title = "Variable survival "+region
p = sns.countplot(data=train, x='Survived')
p.set(title=survival_title)
plt.gcf().savefig('../../output/survival.png')
plt.show(block=False)

In [None]:
# Age distribution: 15-bin histogram with a KDE curve overlaid

age_title = "Distribution of the variable age "+region
p = sns.displot(df['Age'], kind='hist', bins=15, kde=True)
p.set(title=age_title)
plt.gcf().savefig('../../../output/age_hist.png')
plt.show(block=False)

In [None]:

# Box plot of Age for each survival outcome
age_survival_title = "Age distribution as a function of survival "+region
p = sns.boxplot(data=df, x='Survived', y='Age')
p.set(title=age_survival_title)
plt.gcf().savefig('../../../output/age_survival.png')
plt.show(block=False)

In [None]:
# Survival counts per passenger class.
# The Axes returned by countplot is bound to `p` so that the title is applied
# to THIS figure; without the assignment `p` still refers to the plot created
# by an earlier cell and the saved image has no title.
p = sns.countplot(x='Pclass', hue='Survived', data=df)
p.set(title = "Crossing between class and survival "+region)
plt.gcf().savefig('../../../output/class_survival.png')
plt.show(block=False)

In [None]:
# Survival counts per gender.
# The Axes returned by countplot is bound to `p` so that the title is applied
# to THIS figure; without the assignment `p` still refers to the plot created
# by an earlier cell and the saved image has no title.
p = sns.countplot(x='Sex', hue='Survived', data=df)
p.set(title = "Cross between gender and survival "+region)
plt.gcf().savefig('../../../output/gender_survival.png')
plt.show(block=False)

In [None]:
# Survival counts per port of embarkation (training rows only).
# The Axes returned by countplot is bound to `p` so that the title is applied
# to THIS figure; without the assignment `p` still refers to the plot created
# by an earlier cell and the saved image has no title.
p = sns.countplot(x='Embarked', hue='Survived', data=train)
p.set(title = "Crossing between embarkation and survival "+region)
plt.gcf().savefig('../../../output/embark_survival.png')
plt.show(block=False)

# Preprocessing

In [None]:
# Fill missing values.
#
# Each column is reassigned from `fillna` instead of calling
# `replace(..., inplace=True)` on a column selection: the in-place form is a
# chained operation that emits a FutureWarning on recent pandas and has no
# effect once copy-on-write is enabled. It also relied on the `np.NaN` alias,
# which was removed in NumPy 2.0.

# AGE: missing values are replaced by the median
df['Age'] = df['Age'].fillna(df['Age'].median())


# EMBARKED: missing values are replaced by the mode (most frequent value)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


# FARE: missing values are replaced by the median
df['Fare'] = df['Fare'].fillna(df['Fare'].median())


In [None]:
# Check the non-null counts again after filling Age, Embarked and Fare.
df.info()

In [None]:
# One-hot encode the frame with pandas' default column selection

df_discret = pd.get_dummies(df)

df_discret.shape

In [None]:
# Preview the first rows of the encoded frame.
df_discret.head()

In [None]:
# Separate the encoded frame back into its training and validation rows,
# using the indicator columns derived from the 'df' origin marker.

is_train = df_discret['df_train'] == 1
is_valid = df_discret['df_valid'] == 1

train2 = df_discret[is_train]
valid2 = df_discret[is_valid]

for part in (train2, valid2):
    print(part.shape)

In [None]:
# Remove the origin indicator columns from both frames. The validation frame
# also loses 'Survived', which is the column predicted for it further down.

origin_flags = ['df_train', 'df_valid']

train2 = train2.drop(columns=origin_flags)

valid2 = valid2.drop(columns=origin_flags + ['Survived'])

In [None]:
# Hold out 20% of the labelled rows as a test set, stratified on the target

target = train2['Survived']
features = train2.drop(columns='Survived')

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=50,
)


In [None]:
# Standardise the features. The scaler is fitted on the training split only;
# the same fitted transformation is then applied to the test split and to the
# validation frame.

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
valid_scaled = scaler.transform(valid2)

# Creation of 4 models


In [None]:
# Logistic Regression with the library's default settings

clf_lr = LogisticRegression()
clf_lr.fit(X_train_scaled, y_train)

y_pred_test_lr = clf_lr.predict(X_test_scaled)

# Confusion table (truth in rows, prediction in columns) and per-class report
confusion_lr = pd.crosstab(y_test, y_pred_test_lr,
                           rownames=['Truth'], colnames=['Prediction'])

print("Results on test")
print(confusion_lr)
print(classification_report(y_test, y_pred_test_lr))


In [None]:
# K-nearest neighbours with the library's default settings (5 neighbours)

knn = neighbors.KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

y_pred_test_knn = knn.predict(X_test_scaled)

# Confusion table (truth in rows, prediction in columns) and per-class report
confusion_knn = pd.crosstab(y_test, y_pred_test_knn,
                            rownames=['Truth'], colnames=['Prediction'])

print("Results on test")
print(confusion_knn)
print(classification_report(y_test, y_pred_test_knn))

In [None]:
# Random Forest (100 trees by default).
# A fixed random_state makes the fitted forest, and therefore the reported
# scores, reproducible across runs; the value matches the seed already used
# for the train/test split.

rf = ensemble.RandomForestClassifier(random_state=50)

rf.fit(X_train_scaled,y_train)

y_pred_test_rf = rf.predict(X_test_scaled)

# Confusion table (truth in rows, prediction in columns) and per-class report
print("Results on test")
print(pd.crosstab(y_test, y_pred_test_rf, rownames=['Truth'], colnames=['Prediction']))
print (classification_report(y_test, y_pred_test_rf))


In [None]:
# GradientBoostingClassifier (100 estimators by default).
# A fixed random_state makes the fitted model, and therefore the reported
# scores, reproducible across runs; the value matches the seed already used
# for the train/test split.

clf = GradientBoostingClassifier(random_state=50)

clf.fit(X_train_scaled,y_train)

y_pred_test_gbc = clf.predict(X_test_scaled)

# Confusion table (truth in rows, prediction in columns) and per-class report
print("Results on test")
print(pd.crosstab(y_test, y_pred_test_gbc, rownames=['Truth'], colnames=['Prediction']))
print (classification_report(y_test, y_pred_test_gbc))


In [None]:
# Combine the four models above in a voting ensemble to see whether it
# improves on the individual classifiers.

base_models = [
    ('lr', clf_lr),
    ('knn', knn),
    ('rf', rf),
    ('clf', clf),
]
vc = VotingClassifier(base_models)
vc.fit(X_train_scaled, y_train)

y_pred_test_vc = vc.predict(X_test_scaled)

# Confusion table (truth in rows, prediction in columns) and per-class report
confusion_vc = pd.crosstab(y_test, y_pred_test_vc,
                           rownames=['Truth'], colnames=['Prediction'])
print(confusion_vc)
print(classification_report(y_test, y_pred_test_vc))

# Conclusion

In [None]:

# Score of every fitted model on the held-out test split, one line per model
for label, model in (
    ("Score - régression logistique :", clf_lr),
    ("Score - KNN :", knn),
    ("Score - random forest :", rf),
    ("Score - gradient boosting classifier:", clf),
    ("Score - voting classifier :", vc),
):
    print(label, model.score(X_test_scaled, y_test))




# Kaggle Submission 

In [None]:
import datetime
import os

# Predict the validation rows with the voting classifier, attach the labels as
# an integer 'Survived' column and turn the index back into a regular column.
prediction_final = vc.predict(valid_scaled)
valid2 = (
    valid2
    .assign(Survived=prediction_final)
    .astype({'Survived': 'int64'})
    .reset_index()
)

# The submission holds only the passenger identifier and the predicted label
submission = valid2[['PassengerId', 'Survived']].copy()

# Current time as text, with spaces, dots and colons turned into underscores
# so that it can be embedded in a file name
ct = str(datetime.datetime.now()).translate(str.maketrans(' .:', '___'))

# One output directory per region, created on first use
path = '../output/'+region
if not os.path.exists(path):
    os.makedirs(path)
    print("The new directory is created!")

# Write the submission as a CSV file inside the region's output directory
filename = path+'/Titanic Predictions vc '+ct+'.csv'
submission.to_csv(filename, index=False)

print('Saved file: ' + filename)



In [None]:
# Write the full validation frame (features plus the predicted 'Survived'
# column set in the submission cell) to CSV, without the row index.
# NOTE(review): this path ('../../../output/') differs from the submission
# directory ('../output/<region>') — confirm both locations are intended.
valid2.to_csv('../../../output/output.csv',index=False)