In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the Titanic train/test sets.
# (Attach the dataset to the notebook first, then copy its file paths in here.)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

**Data Analysis**

In [None]:
# Column names of the train data, separated from the heading by a blank line.
print("Columns of the train data", "", train_df.columns.values, sep="\n")

# First 5 rows of the train data.
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame (shows where values are missing).
train_df.info()

In [None]:
# First 5 rows of the test data.
test_df.head()

In [None]:
# Column dtypes and non-null counts of the test frame (shows where values are missing).
test_df.info()

In [None]:
# For more detailed information about each field, use pandas_profiling reports.
# NOTE(review): filterwarnings('ignore') silences every warning for the rest of the
# notebook, including deprecation and convergence warnings raised by later cells.

import pandas_profiling as pp
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Profiling report for the training frame.
# presumably the html style option stretches the report to the full page width — confirm
# against the installed pandas_profiling version.
pp.ProfileReport(train_df, title = 'Profiling report of "Train" set', html = {'style':{'full_width': True}})

In [None]:
# Profiling report for the test frame.
# presumably the html style option stretches the report to the full page width — confirm
# against the installed pandas_profiling version.
pp.ProfileReport(test_df, title = 'Profiling report of "Test" set', html = {'style':{'full_width': True}})

In [None]:
# Mean survival rate per sex, highest first.
sex_survival = train_df[["Sex", "Survived"]].groupby("Sex", as_index=False).mean()
sex_survival.sort_values(by='Survived', ascending=False)

# Observation: women have a higher survival rate than men.

In [None]:
# Mean survival rate per passenger class, highest first.
(
    train_df[['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

# Observation: the higher the class, the better the survival rate.

In [None]:
# Mean survival rate per number of siblings/spouses aboard, highest first.
(
    train_df[["SibSp", "Survived"]]
    .groupby("SibSp", as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean survival rate per number of parents/children aboard, highest first.
parch_survival = train_df[["Parch", "Survived"]].groupby("Parch", as_index=False).mean()
parch_survival.sort_values(by='Survived', ascending=False)

In [None]:
# Before an algorithm can be used the data has to be prepared: replace the nulls
# (where possible), make every column numeric, and remove the attributes the model
# does not need.

# Encode Sex as male = 1, female = 0.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# train_df["Sex"] acts on an intermediate Series and, under pandas Copy-on-Write,
# never reaches train_df. Re-running the cell is harmless (nothing left to replace).
train_df["Sex"] = train_df["Sex"].replace({"male": 1, "female": 0}).astype(int)
train_df.head()

In [None]:
# Encode Sex in the test frame as male = 1, female = 0.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# test_df["Sex"] acts on an intermediate Series and, under pandas Copy-on-Write,
# never reaches test_df. Re-running the cell is harmless (nothing left to replace).
test_df["Sex"] = test_df["Sex"].replace({"male": 1, "female": 0}).astype(int)
test_df.head()

**Data Preparation**

In [None]:
# The 2 missing Embarked values get the most common port.

common_value = 'S'
embarked_missing = train_df["Embarked"].isna()
train_df.loc[embarked_missing, "Embarked"] = common_value

In [None]:
# Fill the nulls of the Age column with random integers drawn from
# [mean - std, mean + std) of the training ages.

data = [train_df, test_df]  # both frames get the same treatment

# Take both statistics from the training set, before any filling, so the two frames
# are imputed consistently and nothing is learned from the test set.
mean = train_df["Age"].mean()
std = train_df["Age"].std()
# Fixed seed so Restart & Run All reproduces the same imputed ages.
rng = np.random.default_rng(42)

for dataset in data:
    is_null = dataset["Age"].isnull()
    # One random age per missing entry; bounds are truncated to whole years.
    rand_age = rng.integers(int(mean - std), int(mean + std), size=is_null.sum())
    dataset.loc[is_null, "Age"] = rand_age
    # Cast this frame's own column. Assigning train_df["Age"] here would overwrite
    # test_df's ages with the training ages through index alignment.
    dataset["Age"] = dataset["Age"].astype(int)


# Sanity check: no missing ages should remain.
train_df["Age"].isnull().sum()

In [None]:
# Fill the 1 missing Fare value with the mean of Fare.
# Only the Fare column is touched: a frame-wide fillna would also write the mean fare
# into the nulls of every other column (e.g. Cabin).
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].mean())

In [None]:
# Re-check dtypes and non-null counts of the training frame after the fills above.


train_df.info()

In [None]:
# Re-check dtypes and non-null counts of the test frame after the fills above.
test_df.info()

In [None]:
# Drop the columns the model does not need.

train_df = train_df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [None]:
# Same for the test frame; PassengerId is kept here because the submission file needs it.
test_df = test_df.drop(columns=["Name", "Ticket", "Cabin"])

In [None]:
# Encode the Embarked column: C = 0 ; Q = 1 ; S = 2

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(train_df["Embarked"])
train_df["Embarked"] = le.transform(train_df["Embarked"])
print(train_df["Embarked"])

In [None]:
# Training frame after encoding Embarked.
train_df.head()

In [None]:
# Encode the test ports with `le`, the encoder already fitted on the training data, so
# both frames are guaranteed to share one mapping (C = 0 ; Q = 1 ; S = 2). Fitting a
# second encoder on the test column would assign different codes whenever its set of
# ports differed from the training one; transform() raises on an unseen port instead.
test_df["Embarked"] = le.transform(test_df["Embarked"])
print(test_df["Embarked"])

In [None]:
# Test frame after encoding Embarked.
test_df.head()

In [None]:
# Discretize Age into ordinal bands so the features end up on a comparable scale and a
# possible bias is avoided:
#   0: <= 16 | 1: (16, 32] | 2: (32, 48] | 3: (48, 64] | 4: > 64

combine = [train_df, test_df]
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in combine:
    # labels=False yields the integer band index (intervals are right-inclusive).
    # Ages above 64 now land in band 4; previously that last band was selected but
    # never assigned, so those rows kept their raw age.
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_edges, labels=False)
train_df.head()

In [None]:
# Build a new attribute from 2 columns: siblings/spouses + parents/children + the passenger.

for frame in combine:
    frame['FamilySize'] = 1 + frame['SibSp'] + frame['Parch']

# Mean survival rate per family size, highest first.
(
    train_df[['FamilySize', 'Survived']]
    .groupby('FamilySize', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Flag passengers travelling without family (FamilySize == 1) as 1, everyone else as 0.
for frame in combine:
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

# Mean survival rate for each value of the flag.
train_df[['IsAlone', 'Survived']].groupby('IsAlone', as_index=False).mean()

In [None]:
# Parch and SibSp are dropped from both frames now that FamilySize / IsAlone exist.
train_df, test_df = (
    frame.drop(columns=['Parch', 'SibSp']) for frame in (train_df, test_df)
)
# drop() returned new frames, so the list of frames is rebuilt as well.
combine = [train_df, test_df]

train_df.head()

In [None]:
# Test frame after dropping Parch and SibSp.
test_df.head()

*Create the new feature Age\*Class*

In [None]:
# Interaction feature: Age multiplied by Pclass.
for frame in combine:
    frame['Age*Class'] = frame['Age'] * frame['Pclass']

# Show the new column next to its two inputs.
train_df[['Age*Class', 'Age', 'Pclass']].head(10)

In [None]:
# Discretize Fare into four ordinal bands:
#   0: <= 7.91 | 1: (7.91, 14.454] | 2: (14.454, 31] | 3: > 31
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]

for frame in combine:
    # labels=False yields the integer band index; intervals are right-inclusive.
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(int)

train_df.head()

In [None]:
# Standardize the features so every variable has the same scale and contributes equally
# to the model fit (μ=0, σ=1).

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# The feature matrices are built from train_df / test_df right here: X_train and X_test
# are not defined yet at this point of the notebook, so reading them raised a NameError
# on Restart & Run All. The scaler is fitted on the training features only and then
# applied to the test features.
X_train = sc.fit_transform(train_df.drop("Survived", axis=1))
X_test = sc.transform(test_df.drop("PassengerId", axis=1))
# NOTE(review): the modelling section further down reassigns X_train / X_test from the
# unscaled frames; move this cell after that assignment if the models are meant to
# train on the standardized values.

**Data Modelling**

In [None]:
# machine learning 
# These 3 models were chosen because this is a supervised classification and regression problem:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Split the training frame into features and target; the test features are everything
# except the passenger id.
X_train, Y_train = train_df.drop(columns="Survived"), train_df["Survived"]
X_test = test_df.drop(columns="PassengerId")

# Shapes of the three pieces.
(X_train.shape, Y_train.shape, X_test.shape)

In [None]:
# Logistic regression baseline.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy on the training data itself, in percent (not a held-out estimate).
train_accuracy = logreg.score(X_train, Y_train)
acc_log = round(train_accuracy * 100, 2)
acc_log

In [None]:
# Table of the logistic-regression coefficients per feature, largest first.
# The feature names are the train_df columns minus the first one
# (assumes Survived is the first column — TODO confirm).
feature_names = train_df.columns.delete(0)
coeff_df = pd.DataFrame({'Feature': feature_names})
coeff_df = coeff_df.assign(Correlation=pd.Series(logreg.coef_[0]))

coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
#random_forest = RandomForestClassifier(n_estimators=100)
#random_forest.fit(X_train, Y_train)

#Y_pred = random_forest.predict(X_test)

#random_forest.score(X_train, Y_train)
#acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

In [None]:
#acc_random_forest    # accuracy of 87.65 but kaggle score of 0.74162 V28

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter tuning: 5-fold cross-validated grid search over C and gamma for an SVC.

svm = SVC()
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5)

In [None]:
# Run the grid search on the training data.
grid.fit(X_train, Y_train)

In [None]:
# Estimator that achieved the best cross-validated score.
grid.best_estimator_

In [None]:
# Parameter combination of that best estimator.
grid.best_params_

In [None]:
# Final model: RBF-kernel support vector classifier, used for the submission predictions.
# NOTE(review): the parameters found by the grid search (grid.best_params_) are not
# passed in here — confirm that is intended.
classifier = SVC(random_state=0, kernel='rbf').fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Training accuracy of the SVC in percent. It is stored under its own name so that
# `classifier` keeps pointing at the fitted model: rebinding `classifier` to the rounded
# score would throw the model away and make this cell fail when it is run a second time.
acc_svc = round(classifier.score(X_train, Y_train) * 100, 2)
acc_svc

#Support Vector Classifier with the best kaggle score
#LOWEST accuracy 80.81 but best kaggle score 0.77751!!

In [None]:
# Submission table: the test passenger ids next to the predicted survival labels.
submission = test_df[["PassengerId"]].assign(Survived=Y_pred)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('./submission.csv', index=False)