In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the training and test CSV files into separate frames, then display
# the (rows, columns) shape of the training frame.
train_csv_path = '/kaggle/input/titanic/train.csv'
test_csv_path = '/kaggle/input/titanic/test.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)
df_train.shape

In [None]:
# Preview the first 10 rows of the training frame.
df_train.head(10)

In [None]:
# List the columns that contain at least one missing value
has_missing = df_train.isnull().any()
columns_with_missing = has_missing[has_missing].index.tolist()
print("Columns with missing values: ")
print(columns_with_missing)

In [None]:
# Print the number of missing values in each column.
print(df_train.isnull().sum())

There are three ways to handle null values:

1. Remove the rows that contain NaN
2. Set NaN to a hard-coded value
3. Impute NaN values based on other rows

For Embarked we use option 1 and drop the rows where it is missing.

In [None]:
# Drop the rows whose 'Embarked' value is missing and report the row count
# before and after. The counts are taken from df_train (the notebook never
# defines a frame called `df`). `.copy()` makes the filtered result an
# independent frame so later column assignments do not trigger
# chained-assignment warnings.
print("Before dropping - " + str(len(df_train)) + " rows")
df_train = df_train[df_train['Embarked'].notna()].copy()
print("After dropping - " + str(len(df_train)) + " rows")

For Cabin we will do 3. Impute NaN values based on other rows.

In [None]:
# Show how often each distinct Cabin value occurs.
df_train['Cabin'].value_counts()

In [None]:
# Fill missing Cabin entries with the cabin label that occurs most often,
# printing the number of missing entries before and after the fill.
cabin_counts = df_train['Cabin'].value_counts()
most_frequent_cabin = cabin_counts.idxmax()
print("Most Frequent Cabin = " + most_frequent_cabin)

cabin_missing_before = int(df_train['Cabin'].isna().sum())
print("NA count before fill = " + str(cabin_missing_before))

df_train['Cabin'] = df_train['Cabin'].fillna(most_frequent_cabin)

cabin_missing_after = int(df_train['Cabin'].isna().sum())
print("NA count after fill = " + str(cabin_missing_after))

Since Age is numerical, let us impute its value by filling it with the mean age of passengers on the ship.

In [None]:
# Fill missing Age entries with the mean of the known ages, printing the
# number of missing entries before and after the fill.
mean_age = df_train['Age'].mean()
print("Mean age of passengers = " + str(mean_age))

age_missing_before = int(df_train['Age'].isna().sum())
print("NA count before fill = " + str(age_missing_before))

df_train['Age'] = df_train['Age'].fillna(mean_age)

age_missing_after = int(df_train['Age'].isna().sum())
print("NA count after fill = " + str(age_missing_after))

In [None]:
# Summary statistics (as produced by DataFrame.describe) for df_train
df_train.describe()

In [None]:
# Histograms of the df_train columns, arranged on a 3x4 grid of subplots.
hist = df_train.hist(figsize=(10,10),layout=(3,4))

In [None]:
# Pairwise relationship plots for df_train (seaborn pairplot).
sns.pairplot(df_train)
plt.show()

Correlation between data

In [None]:

# Pairwise correlation of the numeric columns only. df_train still holds
# string columns at this point (Name, Sex, Ticket, ...), and pandas >= 2.0
# raises when DataFrame.corr() meets non-numeric data, so they are excluded
# explicitly.
corr = df_train.select_dtypes(include='number').corr()

# Colour the table from cool (low) to warm (high) values.
corr.style.background_gradient(cmap='coolwarm')


Correlations observed in the Titanic data:

- Pclass and Fare
- Age and Parch
- Age and SibSp
- Age and Pclass
- Pclass and Survived

In [None]:
# RelativeCount = siblings/spouses (SibSp) plus parents/children (Parch)
relative_count = df_train['SibSp'].add(df_train['Parch'])
df_train['RelativeCount'] = relative_count
df_train['RelativeCount'].describe()

In [None]:
# Recompute the correlation table now that RelativeCount exists. Only numeric
# columns are used: pandas >= 2.0 raises when DataFrame.corr() meets the
# string columns that df_train still contains.
corr = df_train.select_dtypes(include='number').corr()

corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Display the first 100 rows of df_train.
# NOTE(review): this is a large output; a smaller preview may be enough.
df_train.head(100)

In [None]:
# Histogram of the RelativeCount column only.
hist = df_train.hist(figsize=(10,10),column='RelativeCount')

**Was a passenger travelling alone?**

numpy.where(): Return elements chosen from x or y depending on condition.
https://docs.scipy.org/doc/numpy/reference/generated/numpy.where.html

In [None]:
# A passenger travels alone ("Yes") when SibSp + Parch is not greater than 0.
has_relatives = (df_train['SibSp'] + df_train['Parch']) > 0
df_train['TravelAlone'] = np.where(has_relatives, "No", "Yes")
df_train.head()

In [None]:
# Count how many passengers fall into each TravelAlone value.
df_train['TravelAlone'].value_counts()

See the pandas plotting documentation for the available plot types.

In [None]:
# Individual ages are too fine-grained to visualise, so group them into
# coarse buckets (feature engineering).
def bucket_age(age):
    """Return the age-bucket label for `age`.

    Buckets are half-open: [.., 15) -> "<15", [15, 30) -> "15-30",
    [30, 45) -> "30-45", [45, 60) -> "45-60". Anything that is not below 60
    (including exactly 60, and NaN, for which every comparison is False)
    is labelled ">60".
    """
    upper_bounds = ((15, "<15"), (30, "15-30"), (45, "30-45"), (60, "45-60"))
    for bound, label in upper_bounds:
        if age < bound:
            return label
    return ">60"

# Label every passenger with an age bucket
df_train['AgeBucket'] = df_train['Age'].map(bucket_age)

# Pie chart of how many passengers fall into each bucket
age_bucket_counts = df_train['AgeBucket'].value_counts()
pie = age_bucket_counts.plot.pie(title='AgeBucket Distribution', legend=True, autopct='%1.1f%%')

Box Plots
A boxplot is a standardized way of displaying the distribution of data based on a five number summary (“minimum”, first quartile (Q1), median, third quartile (Q3), and “maximum”). It can tell you about your outliers and what their values are. It can also tell you if your data is symmetrical, how tightly your data is grouped, and if and how your data is skewed.


Reference

https://www.khanacademy.org/math/statistics-probability/summarizing-quantitative-data/box-whisker-plots/a/box-plot-review
https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51

In [None]:
##### Box Plots

#A boxplot is a standardized way of displaying the distribution of data based on a five number summary (“minimum”, first quartile (Q1), median, third quartile (Q3), and “maximum”). It can tell you about your outliers and what their values are. It can also tell you if your data is symmetrical, how tightly your data is grouped, and if and how your data is skewed.

#![alt text](https://miro.medium.com/max/18000/1*2c21SkzJMf3frPXPAR_gZA.png)

#Reference 
#- https://www.khanacademy.org/math/statistics-probability/summarizing-quantitative-data/box-whisker-plots/a/box-plot-review
#- https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51


**Sex and Fare Relationship**

In [None]:
# Box plot of Fare for each value of Sex.
sns.catplot(x="Sex", y="Fare", kind="box", data=df_train)

How does the Pclass relate to the Survival rate?

In [None]:
# Let's start off with a pie chart of the Pclass distribution
# (share of passengers in each class, labelled with percentages).
pie = df_train['Pclass'].value_counts().plot(kind="pie",title='PClass Distribution',legend=True,autopct='%1.1f%%')

In [None]:
# Box plot with RelativeCount on the x axis and the AgeBucket labels on the y axis.
sns.catplot(x="RelativeCount", y="AgeBucket", kind="box", data=df_train)

In [None]:
# Box plot of Age for each RelativeCount value.
sns.catplot(x="RelativeCount", y="Age", kind="box", data=df_train)


In [None]:
# Preview the first 5 rows of df_train, including the engineered columns.
df_train.head(5)


In [None]:
# Preview the first 10 rows of df_train.
df_train.head(10)


In [None]:
# NAME

# Name length
# The author reports that name length shows a clear pattern and improves the
# result, so it is kept as a feature.
# The raw character count is binned into steps of 15 characters, numbered
# from 1 (0-14 chars -> 1, 15-29 chars -> 2, ...).
name_char_count = df_train['Name'].map(len)
df_train['Name_Length'] = name_char_count // 15 + 1

# Mean survival per name-length bin, as a table and as a bar plot
survival_by_length = df_train[['Name_Length','Survived']].groupby(['Name_Length'], as_index = False).mean()
print(survival_by_length)
plt.subplots(figsize=(15, 6))
sns.barplot(data=df_train, x='Name_Length', y='Survived')


In [None]:
# Name may look like a useless column, but it contains something important:
# the passenger's title (e.g. 'Mr', 'Mrs', 'Capt').

# EXTRACTING TITLE FROM NAME
import re


def extract_title(name):
    """Return the run of letters/spaces that follows the first ', ' in `name`.

    For 'Braund, Mr. Owen Harris' this is 'Mr'. When the pattern is absent
    (for instance when the cell is run again and the column already holds
    titles) the value is returned unchanged instead of raising on a failed
    match.
    """
    match = re.search(', ([A-Za-z ]*)', name)
    return match.group(1) if match else name


# Build a new column instead of writing into the array returned by `.values`:
# that array can be a read-only view of the frame (pandas copy-on-write).
df_train['Name'] = df_train['Name'].map(extract_title)
plt.subplots(figsize=(15, 6))
sns.barplot(data=df_train,x='Name',y='Survived')
# The bar plot shows the mean of Survived per title.

In [None]:
# Phase 1 (preprocessing the information) is complete.
# Phase 2 encodes the data and drops the unwanted columns.

# Encoding string values to numbers
from sklearn.preprocessing import LabelEncoder

#SEX
lb_Sex = LabelEncoder()
lb_Sex.fit(df_train['Sex'])
df_train['Sex'] = lb_Sex.transform(df_train['Sex'])



In [None]:

#TITLE
# Replace each title string in 'Name' by an integer code.
lb_Title = LabelEncoder()
lb_Title.fit(df_train['Name'])
df_train['Name'] = lb_Title.transform(df_train['Name'])

# DROPPING THE EXTRA COLUMNS
unused_columns = ['SibSp', 'Parch', 'Ticket', 'Fare', 'Age', 'PassengerId', 'Cabin']
df_train = df_train.drop(columns=unused_columns)




In [None]:
# Preview the frame after encoding and dropping columns.
df_train.head()


In [None]:
# Remove the 'Embarked' column from the feature frame.
df_train = df_train.drop(columns=['Embarked'])

In [None]:
# Preview the frame after removing 'Embarked'.
df_train.head()


In [None]:
#RETRIEVING THE TEST AND THE TRAIN SETS
# df_train was built from train.csv only, so no row has a missing 'Survived'.
# Selecting the rows where it is null therefore produced an EMPTY test frame
# and discarded the data read from test.csv. The test features are instead
# rebuilt from test.csv with the same steps that were applied to df_train.
#
# 'TravelAlone' and 'AgeBucket' are still string labels at this point; they
# are turned into integer codes here (for both frames) so that the feature
# matrices are fully numeric.
TRAVEL_ALONE_CODES = {"No": 0, "Yes": 1}
AGE_BUCKET_CODES = {"<15": 0, "15-30": 1, "30-45": 2, "45-60": 3, ">60": 4}


def encode_labels(column, codes):
    """Replace the values of `column` that are keys of `codes` by their code.

    Values that are not keys of `codes` (e.g. integers that were already
    encoded when the cell is run a second time) pass through unchanged.
    """
    return column.map(lambda value: codes.get(value, value))


def build_test_features(raw_test):
    """Return the feature frame for the raw test.csv rows.

    Uses the objects fitted on the training data earlier in the notebook:
    `lb_Sex`, `lb_Title`, `mean_age` and `bucket_age`. Titles that the title
    encoder never saw are given the most common encoded training title.
    """
    titles = raw_test['Name'].str.extract(r', ([A-Za-z ]*)', expand=False)
    title_codes = {title: code for code, title in enumerate(lb_Title.classes_)}
    fallback_title_code = df_train['Name'].mode().iloc[0]

    relatives = raw_test['SibSp'] + raw_test['Parch']
    travel_alone = pd.Series(np.where(relatives > 0, "No", "Yes"), index=raw_test.index)
    age_bucket = raw_test['Age'].fillna(mean_age).apply(bucket_age)

    return pd.DataFrame({
        'Pclass': raw_test['Pclass'],
        'Name': titles.map(title_codes).fillna(fallback_title_code).astype(np.int64),
        'Sex': lb_Sex.transform(raw_test['Sex']),
        'RelativeCount': relatives,
        'TravelAlone': encode_labels(travel_alone, TRAVEL_ALONE_CODES),
        'AgeBucket': encode_labels(age_bucket, AGE_BUCKET_CODES),
        'Name_Length': (raw_test['Name'].str.len() / 15).astype(np.int64) + 1,
    }, index=raw_test.index)


# Keep only labelled rows for training (a no-op filter for train.csv data).
df_train = df_train[df_train.Survived.notnull()].copy()
df_train['TravelAlone'] = encode_labels(df_train['TravelAlone'], TRAVEL_ALONE_CODES)
df_train['AgeBucket'] = encode_labels(df_train['AgeBucket'], AGE_BUCKET_CODES)

# Re-read test.csv so the cell gives the same result when it is run again.
df_test = build_test_features(pd.read_csv('/kaggle/input/titanic/test.csv'))
# Same columns, in the same order, as the training features.
df_test = df_test[df_train.columns.drop('Survived')]

In [None]:
# Split the training frame into the target vector and the feature matrix,
# and convert everything to NumPy arrays.
train_features = df_train.drop(columns=['Survived'])
y_train = df_train['Survived'].values
x_train = train_features.values
x_test = df_test.values

In [None]:
# Feature scaling: fit the min/max on the training features only, then apply
# the same transformation to the test features.
from sklearn.preprocessing import MinMaxScaler
sc_x = MinMaxScaler(feature_range=(-1, 1))
sc_x.fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)


In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
# dict_K collects the per-fold scores returned by get_acc, keyed by model name.
dict_K = {}
# NOTE(review): `dic` is not referenced anywhere else in the visible notebook.
dic = {}

#Kfold Validation
def get_acc(Xtrain,Ytrain,model):
    """Return the accuracy of `model` on each fold of a 4-fold split.

    Parameters
    ----------
    Xtrain : 2-D array indexed as Xtrain[rows, :]
    Ytrain : 1-D array of labels, indexed with the same row indices
    model  : estimator with fit(X, y) and predict(X); it is refitted on every
             fold, so it is left fitted on the last training fold

    Returns
    -------
    list of float, one accuracy per fold.

    Accuracy is the confusion-matrix diagonal divided by the total count. The
    trace/sum form equals (cm[0,0] + cm[1,1]) / total for a 2x2 matrix, but
    also works when a fold contains a single class (1x1 matrix, where
    cm[1,1] would raise IndexError) or more than two classes.
    """
    from sklearn.model_selection import KFold
    acc = []
    k = KFold(n_splits=4)
    for train, test in k.split(Xtrain, y=Ytrain):
        model.fit(Xtrain[train, :], Ytrain[train])
        y_pred = model.predict(Xtrain[test, :])
        cm = confusion_matrix(y_true=Ytrain[test], y_pred=y_pred)
        # The small epsilon is kept from the original formula.
        acc.append(np.trace(cm) / (cm.sum() + 1e-5))
    return acc

In [None]:
# Preview the training frame before fitting the model.
df_train.head()

In [1]:
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the cross-validation scores and the predictions
# reproducible from one run to the next.
classifier = RandomForestClassifier(n_estimators=25, criterion='entropy', random_state=42)
# Per-fold accuracies from 4-fold cross-validation
dict_K['Random_forest'] = get_acc(x_train, y_train, classifier)
# Refit on the full training set before predicting the test set
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [None]:
# Preparing the CSV for submission
# The passenger ids come from test.csv (the notebook never defines
# `dataset_gd`). Building the frame from two equal-length columns raises a
# ValueError if the number of predictions does not match the number of test
# passengers.
test_passenger_ids = pd.read_csv('/kaggle/input/titanic/test.csv', usecols=['PassengerId'])['PassengerId']
p = pd.DataFrame({
    'PassengerId': test_passenger_ids.values,
    'Survived': y_pred.astype(np.int64),
})
p.to_csv('Tit_pred.csv',index=False)