In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

In [None]:
# Read the training and test CSVs into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Print the training frame's column summary (info() writes it to stdout).
train_df.info()

In [None]:
# Summary statistics for the training frame, using describe()'s default column selection.
train_df.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
train_df.describe(include=[object])

# ANALYSIS (GROUP BY)

In [None]:
# Mean of 'Survived' for each 'Pclass' value.
(
    train_df[['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .mean()
)

In [None]:
# Mean of 'Survived' for each ('Sex', 'Pclass') combination.
(
    train_df[['Sex', 'Pclass', 'Survived']]
    .groupby(['Sex', 'Pclass'], as_index=False)
    .mean()
)

In [None]:
# Number of non-null 'Survived' entries for each 'Pclass' value.
(
    train_df[['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .count()
)

In [None]:
# Mean and count of 'Survived' per 'Pclass' in one table.
# The aggregations must be passed as a list: agg('mean', 'count') would forward
# 'count' as a positional argument to the 'mean' aggregation instead of
# requesting a second aggregation.
train_df.groupby('Pclass', as_index=False)['Survived'].agg(['mean', 'count'])

In [None]:
# Mean of 'Survived' for each 'Sex' value.
(
    train_df[['Sex', 'Survived']]
    .groupby('Sex', as_index=False)
    .mean()
)

In [None]:
# Mean of 'Survived' for each 'SibSp' value.
(
    train_df[['SibSp', 'Survived']]
    .groupby('SibSp', as_index=False)
    .mean()
)

In [None]:
# Mean of 'Survived' for each 'Parch' value.
(
    train_df[['Parch', 'Survived']]
    .groupby('Parch', as_index=False)
    .mean()
)

In [None]:
# Feature engineering: Family_Size = SibSp + Parch + 1
# (the +1 presumably counts the passenger themselves).
# Each frame must be computed from its own columns: deriving the test column
# from train_df would align on row index and copy values that belong to
# different passengers.
train_df['Family_Size'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['Family_Size'] = test_df['SibSp'] + test_df['Parch'] + 1

In [None]:
# Show the first five training rows to inspect the columns added so far.
train_df.head(n=5)

In [None]:
# Mean of 'Survived' for each 'Family_Size' value.
(
    train_df[['Family_Size', 'Survived']]
    .groupby('Family_Size', as_index=False)
    .mean()
)

In [None]:
# Collapse Family_Size into three labelled buckets. Sizes that are not keys of
# the map (e.g. 9 or 10) come out of .map() as NaN.
family_map = {
    1: 'Small', 2: 'Small', 3: 'Small', 4: 'Small',
    5: 'Medium', 6: 'Medium',
    7: 'Large', 8: 'Large', 11: 'Large',
}
train_df['Family_Size_Grouped'] = train_df['Family_Size'].map(family_map)
# The test frame is mapped from its own Family_Size, not the training frame's.
test_df['Family_Size_Grouped'] = test_df['Family_Size'].map(family_map)


In [None]:
# Mean of 'Survived' for each family-size bucket.
(
    train_df[['Family_Size_Grouped', 'Survived']]
    .groupby('Family_Size_Grouped', as_index=False)
    .mean()
)

In [None]:
# Mean of 'Survived' for each 'Embarked' value.
(
    train_df[['Embarked', 'Survived']]
    .groupby('Embarked', as_index=False)
    .mean()
)

In [None]:
# Histogram of 'Age' (bins 10 wide), one panel per 'Survived' value.
sns.displot(
    data=train_df,
    x='Age',
    col='Survived',
    binwidth=10,
    height=5,
)

In [None]:
# Split 'Age' into 8 quantile bins; each frame's bin edges are derived from its
# own values, so the train and test intervals are not necessarily the same.
train_df['Age_cut'] = pd.qcut(x=train_df['Age'], q=8)
test_df['Age_cut'] = pd.qcut(x=test_df['Age'], q=8)

In [None]:
# Mean of 'Survived' within each age quantile bin.
(
    train_df
    .groupby(['Age_cut'], as_index=False)['Survived']
    .agg('mean')
)

In [None]:
# Ordinal-encode 'Age' in both frames with one shared set of right-closed bins:
# values <= 16 -> 0, (16, 20.125] -> 1, (20.125, 24] -> 2, (24, 28] -> 3,
# (28, 32.312] -> 4, (32.312, 38] -> 5, (38, 47] -> 6, and everything above 47 -> 7.
# The top bin is open-ended so a value beyond the old upper edge (80) is still
# encoded instead of silently keeping its raw value. Missing ages stay NaN.
# NOTE: 'Age' is overwritten in place, so run this cell once per fresh load of
# the data -- the codes 0-7 all fall in the first bin and would be re-encoded as 0.
AGE_BIN_EDGES = [-np.inf, 16, 20.125, 24.0, 28.0, 32.312, 38.0, 47.0, np.inf]
for frame in (train_df, test_df):
    # labels=False yields the integer bin position; cast to float to keep the
    # column's float dtype (NaN for missing ages).
    frame['Age'] = pd.cut(frame['Age'], bins=AGE_BIN_EDGES, labels=False).astype(float)

In [None]:
# Inspect the first 20 training rows.
train_df.head(n=20)

In [None]:
# Histogram of 'Fare' (bins 80 wide), one panel per 'Survived' value.
sns.displot(
    data=train_df,
    x='Fare',
    col='Survived',
    binwidth=80,
    height=5,
)

In [None]:
# Split 'Fare' into 6 quantile bins; each frame's bin edges are derived from its
# own values.
train_df['Fare_cut'] = pd.qcut(x=train_df['Fare'], q=6)
test_df['Fare_cut'] = pd.qcut(x=test_df['Fare'], q=6)

In [None]:
# Mean of 'Survived' within each fare quantile bin.
(
    train_df
    .groupby(['Fare_cut'], as_index=False)['Survived']
    .agg('mean')
)

In [None]:
# Ordinal-encode 'Fare' in both frames with one shared set of right-closed bins:
# values <= 7.775 -> 0, (7.775, 8.662] -> 1, (8.662, 14.454] -> 2,
# (14.454, 26] -> 3, (26, 52.369] -> 4, and everything above 52.369 -> 5.
# The top bin is open-ended on purpose: interval edges printed by qcut are
# rounded, so a fare slightly above the old 512.329 edge would otherwise keep
# its raw value inside a 0-5 coded column. Missing fares stay NaN.
# NOTE: 'Fare' is overwritten in place, so run this cell once per fresh load of
# the data -- the codes 0-5 all fall in the first bin and would be re-encoded as 0.
FARE_BIN_EDGES = [-np.inf, 7.775, 8.662, 14.454, 26.0, 52.369, np.inf]
for frame in (train_df, test_df):
    # labels=False yields the integer bin position; cast to float to keep the
    # column's float dtype.
    frame['Fare'] = pd.cut(frame['Fare'], bins=FARE_BIN_EDGES, labels=False).astype(float)

In [None]:
# Display the raw 'Name' column.
train_df.loc[:, 'Name']

In [None]:
 # Title = the text between the first ',' and the following '.' in 'Name',
 # with surrounding whitespace removed. The vectorised .str.strip() also
 # passes missing values through instead of raising.
 train_df['Title'] = train_df['Name'].str.split(pat=',', expand=True)[1].str.split(pat='.', expand=True)[0].str.strip()
 test_df['Title'] = test_df['Name'].str.split(pat=',', expand=True)[1].str.split(pat='.', expand=True)[0].str.strip()


In [None]:
# Mean of 'Survived' for each extracted title.
(
    train_df[['Title', 'Survived']]
    .groupby('Title', as_index=False)
    .mean()
)

In [None]:
#military - Capt, Col, Major
#noble - Jonkheer, the Countess, Don, Lady, Sir
#unmarried female - Mlle, Ms, Mme

In [None]:
# One shared title -> group mapping, applied identically to train and test so
# the two frames cannot drift apart. Titles not listed here are left unchanged.
# NOTE(review): the notes in this notebook list Mlle/Ms/Mme as a separate
# female group, but they are folded into 'Noble' here -- confirm that is intended.
TITLE_GROUPS = {
    **dict.fromkeys(['Capt', 'Col', 'Major'], 'Military'),
    **dict.fromkeys(
        ['Jonkheer', 'the Countess', 'Don', 'Lady', 'Sir', 'Mlle', 'Ms', 'Mme'],
        'Noble',
    ),
}

train_df['Title'] = train_df['Title'].replace(TITLE_GROUPS)
test_df['Title'] = test_df['Title'].replace(TITLE_GROUPS)

In [None]:
# Mean of 'Survived' per title, recomputed on the current 'Title' values.
(
    train_df[['Title', 'Survived']]
    .groupby('Title', as_index=False)
    .mean()
)

In [None]:
# Character count of each passenger's name.
# (The column keeps its existing spelling 'Name_Lenght', which other cells reference.)
train_df['Name_Lenght'] = train_df['Name'].map(len)
test_df['Name_Lenght'] = test_df['Name'].map(len)


In [None]:
# Overlay kernel-density estimates of name length for non-survivors (red) and
# survivors (blue). Each curve carries its own label so the legend cannot get
# out of step with the plotting order.
died_name_len = train_df.loc[train_df['Survived'] == 0, 'Name_Lenght'].dropna()
lived_name_len = train_df.loc[train_df['Survived'] == 1, 'Name_Lenght'].dropna()
g = sns.kdeplot(died_name_len, color='Red', fill=True, label='Not Survived')
g = sns.kdeplot(lived_name_len, ax=g, color='Blue', fill=True, label='Survived')
g.set_xlabel('Name_Lenght')
# A KDE's y-axis is a density, not a frequency (the previous label was also misspelled).
g.set_ylabel('Density')
g = g.legend()

In [None]:
# Split name length into 8 quantile bins; each frame's bin edges are derived
# from its own values.
train_df['Name_LenghtGB'] = pd.qcut(x=train_df['Name_Lenght'], q=8)
test_df['Name_LenghtGB'] = pd.qcut(x=test_df['Name_Lenght'], q=8)

In [None]:
# Mean of 'Survived' within each name-length quantile bin.
(
    train_df
    .groupby(['Name_LenghtGB'], as_index=False)['Survived']
    .agg('mean')
)

In [None]:
# Ordinal-encode name length into a new 'Name_Size' column in both frames with
# one shared set of right-closed bins: lengths <= 18 -> 0, (18, 20] -> 1,
# (20, 23] -> 2, (23, 25] -> 3, (25, 27.25] -> 4, (27.25, 30] -> 5,
# (30, 38] -> 6, and everything above 38 -> 7.
# The top bin is open-ended so a name longer than the old upper edge (82) is
# encoded as 7 rather than left as NaN.
NAME_LEN_BIN_EDGES = [-np.inf, 18, 20, 23.0, 25.0, 27.25, 30.0, 38.0, np.inf]
for frame in (train_df, test_df):
    # labels=False yields the integer bin position; cast to float so the column
    # has the same dtype as when it was filled through masked assignments.
    frame['Name_Size'] = pd.cut(
        frame['Name_Lenght'], bins=NAME_LEN_BIN_EDGES, labels=False
    ).astype(float)

In [None]:
# Show the first five training rows, including the name-derived columns.
train_df.head(n=5)

In [None]:
# Display the raw 'Ticket' column.
train_df.loc[:, 'Ticket']

In [None]:
# Ticket_Number = the last whitespace-separated token of 'Ticket'.
# Uses the vectorised string accessor instead of building a pd.Series per row
# inside apply(), which produced a one-column DataFrame and was much slower.
train_df['Ticket_Number'] = train_df['Ticket'].str.split().str[-1]
test_df['Ticket_Number'] = test_df['Ticket'].str.split().str[-1]

In [None]:
# Display the derived 'Ticket_Number' column.
train_df.loc[:, 'Ticket_Number']

In [None]:
# Passenger count and mean of 'Survived' per ticket number, most frequent first.
(
    train_df
    .groupby(['Ticket_Number'], as_index=False)['Survived']
    .agg(['count', 'mean'])
    .sort_values(by='count', ascending=False)
)
