In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test splits from the local input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)


In [None]:
# One boolean per column: True when the column holds at least one missing value.
# Author's observation from the output: several features have rows with no value recorded.
train.isnull().any()

In [None]:
# Impute (fill in) the missing values; each column gets a method suited to its type.
#   Age      -> the mean Age of the same frame
#   Embarked -> forward fill, i.e. copy the nearest non-missing value from the rows above
# Series.ffill() is used instead of fillna(method='ffill'), which pandas 2.x deprecates.
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Embarked'] = train['Embarked'].ffill()
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Embarked'] = test['Embarked'].ffill()

In [None]:
# Re-check for missing values after imputation.
# Cabin has so many missing values that imputing it would mostly inject bias,
# so that column is dropped later rather than filled.
train.isnull().any()

In [None]:
#Next we want to find out how strongly each column is related to the survival of the passenger.
#For this we can plot the data.
#Categorical data such as Sex can be given numerical values like 0 or 1.
#We can also normalize the data.
#The Ticket number is most likely unrelated to survival, so we can drop this column as well.
#train.drop('Ticket',axis=1,inplace=True)

In [None]:
# PIVOTING FEATURES
# Done after the missing values are handled. Grouping a feature against Survived
# gives the mean survival rate per feature value; this suits features with only a
# few distinct values (Pclass, Sex, SibSp), not continuous ones.

# Survival rate per passenger class, highest first.
(
    train.loc[:, ['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)


In [None]:
# Survival rate per sex, highest first.
(
    train.loc[:, ['Sex', 'Survived']]
    .groupby('Sex', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)


In [None]:
# Survival rate per number of siblings/spouses aboard, highest first.
(
    train.loc[:, ['SibSp', 'Survived']]
    .groupby('SibSp', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Continuous features such as Age and Fare are easier to judge visually.
# One histogram of Age (20 bins) per value of Survived, side by side.
survival_with_age = sns.FacetGrid(data=train, col='Survived')
survival_with_age.map(plt.hist, 'Age', bins=20)

In [None]:
# Age histograms split by outcome (columns) and passenger class (rows).
# `height` is the current name of FacetGrid's facet-size argument; the old
# `size` keyword is rejected by recent seaborn releases.
survival_with_age_in_fixed_PClass = sns.FacetGrid(
    train, col='Survived', row='Pclass', height=2.5, aspect=1.6
)
survival_with_age_in_fixed_PClass.map(plt.hist, 'Age', alpha=1, bins=10)
survival_with_age_in_fixed_PClass.add_legend()
# Author's reading of the plot: most passengers in Pclass = 3 did not survive,
# and most passengers in Pclass = 1 survived.

In [None]:
# Relate passenger class, sex and port of embarkation to survival in one figure:
# one facet row per Embarked value; inside each facet Pclass is on the x axis,
# Survived on the y axis, and Sex is the hue (one line per sex, shown in the legend).
# `height` replaces the removed `size` keyword. `order` and `hue_order` are fixed
# explicitly because FacetGrid.map draws every facet independently; without them
# the categories can be ordered differently from facet to facet.
# Sex is still the raw 'male'/'female' string at this point in the notebook.
survival_with_sex_in_pclass = sns.FacetGrid(train, row='Embarked', height=2.2, aspect=1.6)
survival_with_sex_in_pclass.map(
    sns.pointplot, 'Pclass', 'Survived', 'Sex',
    palette='bright', order=[1, 2, 3], hue_order=['male', 'female'],
)
survival_with_sex_in_pclass.add_legend()

In [None]:
# Fare takes many distinct values with no obvious pattern, so it is summarised as
# a bar per Sex in every Embarked (row) x Survived (column) facet, with 95%
# confidence-interval error bars. Later cells band Fare into ranges.
# `height` replaces the removed `size` keyword; `order` pins the bar order so each
# independently drawn facet shows the sexes in the same position.
banding_fare = sns.FacetGrid(train, row='Embarked', col='Survived', height=2.2, aspect=1.6)
banding_fare.map(
    sns.barplot, 'Sex', 'Fare',
    alpha=.5, ci=95, capsize=0.1, order=['male', 'female'],
)
banding_fare.add_legend()

In [None]:
# Next we extract the titles from the passenger names: a title may relate to the
# survival rate even though the full name probably does not.
# The extraction has to run on both the train and the test data, so both frames are
# collected in one list. The list stores references to the frames (not copies), so
# in-place changes made while looping over it also show up in `train` and `test`.
all_data = [train,test]


In [None]:
# Pull the title out of every Name and store it in a new Title column.
# The pattern matches a space, captures a run of letters, and requires a period
# right after it (the "Mr" in a name shaped like "Surname, Mr. Given").
# It is written as a raw string so that "\." reaches the regex engine unchanged
# instead of being treated as an (invalid) Python string escape.
for frame in all_data:
    frame['Title'] = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# crosstab counts how often each Title occurs for each Sex.
# Author's observations from the table:
#   - Master, Mr, Miss and Mrs are far more common than Capt, Col, Don, etc.
#   - Mme and Mlle are French forms, so they are merged into Mrs / Miss next.
#   - The remaining infrequent titles are grouped under 'Rare'.
pd.crosstab(train['Title'], train['Sex'])

In [None]:
# Collapse the Title column to a handful of categories in both frames.
infrequent_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                     'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {rare_title: 'Rare' for rare_title in infrequent_titles}
# Map the variant spellings / French forms onto the common English titles.
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in all_data:
    frame['Title'] = frame['Title'].replace(title_aliases)
    

In [None]:
# Survival rate per title; only the training data carries the Survived label.
# Author's reading of the result: Miss and Mrs survive more often (women survived
# more in general), Mrs more than Miss, and Master more than Mr.
# The Name column is no longer needed after this and is dropped in a later cell.
(
    train.loc[:, ['Title', 'Survived']]
    .groupby('Title', as_index=False)
    .mean()
)

In [None]:
# Convert categorical columns to numbers by assigning one integer per category.
# This is applied to Sex here and to Embarked in the next cell.
# Sex: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
for frame in all_data:
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)


In [None]:
# Embarked: C -> 1, Q -> 2, S -> 3 in both frames.
port_codes = {'C': 1, 'Q': 2, 'S': 3}
for frame in all_data:
    frame['Embarked'] = frame['Embarked'].map(port_codes).astype(int)


In [None]:
# Show both frames (train first, then test) to inspect the encoded Sex and Embarked columns.
all_data

In [None]:
#Looking back, completing the Age feature by filling the NA values with the overall mean Age may not be the best approach.
#A better way would be to see how other features relate to a given Age. For example, if Sex=female and Title=Miss, perhaps the age should be less than 18.
#Our method of filling Age with the overall mean of the column may therefore have given us inaccurate data.
#To implement the method mentioned, refer to "https://www.kaggle.com/code/startupsci/titanic-data-science-solutions"
#for dataset in combine:
 #   for i in range(0, 2):
  #      for j in range(0, 3):
   #         guess_df = dataset[(dataset['Sex'] == i) & \
    #                              (dataset['Pclass'] == j+1)]['Age'].dropna()
#
 #           # age_mean = guess_df.mean()
  #          # age_std = guess_df.std()
   #         # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)
#
 #           age_guess = guess_df.median()
#
  #          # Convert random age float to nearest .5 age
 #           guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
 #           
  #  for i in range(0, 2):
   #     for j in range(0, 3):
    #        dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
     #               'Age'] = guess_ages[i,j]
#
 #   dataset['Age'] = dataset['Age'].astype(int)
#
#train_df.head()
#from the above chunk of code, just remove the first # mark.

In [None]:
# Age is continuous, so group it into bands.
# bins=5 asks cut() for five equal-width intervals over the observed Age range.
train['AgeBand'] = pd.cut(train['Age'], bins=5)
# Survival rate per age band, youngest band first.
(
    train.loc[:, ['AgeBand', 'Survived']]
    .groupby('AgeBand', as_index=False)
    .mean()
    .sort_values('AgeBand', ascending=True)
)


In [None]:
# Replace each Age with the index of the band it falls in, for both frames:
#   0: Age <= 16   1: (16, 32]   2: (32, 48]   3: (48, 64]   4: Age > 64
# The previous version only *selected* the Age > 64 rows without assigning a
# value, so the oldest passengers kept their raw ages next to band indices 0-3.
# Cutting on explicit edges assigns all five bands in a single step.
age_band_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for frame in all_data:
    # labels=False returns the integer position of the band instead of an Interval.
    frame['Age'] = pd.cut(frame['Age'], bins=age_band_edges, labels=False)
train.head()

In [None]:
# Fare is banded the same way: four equal-width intervals stored in FareBand.
train['FareBand'] = pd.cut(train['Fare'], bins=4)
# Survival rate per fare band, cheapest band first.
(
    train.loc[:, ['FareBand', 'Survived']]
    .groupby('FareBand', as_index=False)
    .mean()
    .sort_values('FareBand', ascending=True)
)


In [None]:
# Replace each Fare with the index of its band in both frames:
#   0: Fare <= 128.082   1: (128.082, 256.165]   2: (256.165, 384.247]   3: Fare > 384.247
# Missing fares stay missing; the cast keeps the column as floats.
fare_band_edges = [-np.inf, 128.082, 256.165, 384.247, np.inf]
for frame in all_data:
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_band_edges, labels=False).astype(float)

all_data

In [None]:
#Now we can drop the features we do not need: Name, Ticket, PassengerId, AgeBand, FareBand and Cabin (which has a lot of missing values)

In [None]:
# Remove the columns that are not used as model features.
# The drop is done in place so the frame referenced by all_data is the one modified.
unused_train_columns = ['Cabin', 'Name', 'PassengerId', 'Ticket', 'FareBand', 'AgeBand']
train.drop(columns=unused_train_columns, inplace=True)

In [None]:
# Inspect the training frame after the unused columns have been dropped.
train

In [None]:
# The goal is a dataset made only of small numeric codes (roughly 0 to 4).
# Title is the last categorical feature left, so it is encoded like Sex and Embarked.
# Step 1: check how each title relates to survival, highest rate first.
(
    train.loc[:, ['Title', 'Survived']]
    .groupby('Title', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
#Step 2 is to reassign
for columns in all_data:
    columns['Title'] = columns['Title'].map( {'Master':0, 'Miss':1, 'Mr':2,'Mrs':3,'Rare':4} ).astype(int)


In [None]:
# Show both frames (train first, then test) now that Title is numeric.
all_data

In [None]:
# Drop the unused columns from the test data as well.
# The AgeBand / FareBand helper columns were only ever added to the training
# frame (where the survival rate is available), so they are not listed here.
unused_test_columns = ['Name', 'PassengerId', 'Cabin', 'Ticket']
test.drop(columns=unused_test_columns, inplace=True)



In [None]:
# Inspect the test frame after the unused columns have been dropped.
test

In [None]:
# Finally, build a new column from several existing ones and see how it relates
# to the survival rate.
# FamilySize = siblings/spouses + parents/children + the passenger themself.
# The arithmetic is vectorised over the whole column, so no loop is needed; the
# earlier loops over the column names recomputed the same result once per column.
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Survival rate per family size, highest first (kept last so the table is displayed).
# Author's reading: intermediate family sizes had the better survival rate.
train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [None]:
#Another way to look at this data is to see if the passenger was traveling alone or not

In [None]:
# Compare survival across family sizes, highest rate first.
(
    train.loc[:, ['FamilySize', 'Survived']]
    .groupby('FamilySize', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)


In [None]:
#Because family size takes values from 1 to 11, we want to bring it into the 0 to 4 range used by the other columns.
#So we will simply consider whether the person was alone or not; that feature can only take the values 0 or 1.
#Let's create this new feature.

In [None]:
# IsAlone is 1 when the passenger's FamilySize is exactly 1, otherwise 0.
for frame in all_data:
    frame['IsAlone'] = frame['FamilySize'].eq(1).astype('int64')

# Survival rate for passengers travelling with family (0) versus alone (1).
train.loc[:, ['IsAlone', 'Survived']].groupby('IsAlone', as_index=False).mean()

In [None]:
# Show both frames (train first, then test) with the new IsAlone column.
all_data

In [None]:
# IsAlone replaces the family columns, so remove them from both frames (in place).
family_columns = ['SibSp', 'Parch', 'FamilySize']
for frame in (test, train):
    frame.drop(columns=family_columns, inplace=True)


In [None]:
# Final training frame: the Survived label plus the encoded feature columns.
train

In [None]:
# Final test frame with the encoded feature columns.
test

In [None]:
#Now that we have clean, normalized training data, we can build a model, train it, and test it out

In [None]:
###### Prepare the data #######
# Target vector and feature matrix for training.
Y_train = train["Survived"]
X_train = train.drop(columns="Survived")
# X_test is another name for the same object as `test`, not a copy.
X_test = test
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Check which test columns still contain missing values before predicting.
test.isnull().any()

In [None]:
# Fill only the missing Fare entries of the test set.
# The previous version assigned train['Fare'] to test['Fare']; pandas aligns that
# assignment on the row index, so every test fare was replaced by the fare of the
# training row with the same index.
# The fill value is the most frequent Fare in the training data, which is always a
# value that actually occurs in the column (Fare holds band indices by this point).
# X_test refers to the same object as `test`, so it sees this change too.
test['Fare'] = test['Fare'].fillna(train['Fare'].mode()[0])

In [None]:
############  LINEAR MODEL ---- Logistic Regression Model #########
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, Y_train)
predict = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_logreg = round(model.score(X_train, Y_train) * 100, 2)
check_logreg

In [None]:
# model.coef_[0] holds one fitted coefficient per input feature. The sign and size
# of a coefficient show how that feature pushes the survival prediction; it is a
# model weight, not a statistical correlation (the column name is kept as-is).
# Feature names come from X_train, the frame the model was fitted on, rather than
# from train.columns with position 0 removed, which only lines up while Survived
# happens to be the first column.
# Assumes `model` is still the logistic regression fitted in the previous cell.
correlation = pd.DataFrame({
    'Feature': X_train.columns,
    'Correlation': model.coef_[0],
})

correlation.sort_values(by='Correlation', ascending=False)


In [None]:
############  LINEAR MODEL ---- Stochastic Gradient Descent Model   ####################
from sklearn.linear_model import SGDClassifier

# SGD visits the training data in random order; a fixed random_state makes the
# fitted model and its score the same on every run.
model = SGDClassifier(random_state=0)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_sgd = round(model.score(X_train, Y_train) * 100, 2)
check_sgd

In [None]:
############# LINEAR MODEL : PERCEPTRON ############
from sklearn.linear_model import Perceptron

# fit() returns the estimator itself, so construction and training can be chained.
model = Perceptron().fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_percp = round(model.score(X_train, Y_train) * 100, 2)
check_percp


In [None]:
############  ENSEMBLE MODEL ---- Random Forest ##################
from sklearn.ensemble import RandomForestClassifier

# Survived is a 0/1 class label, so the classifier variant is the right one.
# The regressor used before returns fractional predictions and its score() is
# R^2, which cannot be compared with the accuracy reported by the other models.
# random_state fixes the bootstrap sampling so the score is reproducible.
model = RandomForestClassifier(n_estimators=150, random_state=0)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Accuracy on the training data, expressed as a percentage with two decimals.
check_rf = round(model.score(X_train, Y_train) * 100, 2)
check_rf

In [None]:
############ Decision Tree #############
from sklearn.tree import DecisionTreeClassifier

# random_state makes the tree, and therefore result.csv, identical on every run.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_tree = round(model.score(X_train, Y_train) * 100, 2)

# Write the predictions to result.csv: a 1-based running index per test row plus
# the predicted label.
df_preds = pd.DataFrame({"PassengerIndex":list(range(1,len(Y_pred)+1)),"Label":Y_pred})
df_preds.to_csv("result.csv",index=False,header=True)

# Kept as the last expression so the notebook displays the score, as in the
# other model cells.
check_tree

In [None]:
######## Support Vector Machine #########
# Works well on small datasets with a yes/no style decision.
# The algorithm looks for a plane that separates the two classes, placing as many
# points as possible on their correct side. The plane may live in a
# high-dimensional space, but it is easiest to picture in 3D.
# The plane is positioned to stay as far as possible from the nearest points on
# either side; the combined distance from those nearest points to the plane is
# the margin, and the nearest points themselves are the SUPPORT VECTORS.
# It is a supervised learning algorithm: the training data must already contain
# the outcome.
# When the classes cannot be separated directly (for example roughly concentric
# points in 2D), an extra feature can raise the dimensionality so that the points
# are offset along it and a plane normal to that new feature can separate them.
from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training can be chained.
model = SVC().fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_svm = round(model.score(X_train, Y_train) * 100, 2)
check_svm

In [None]:
######### LINEAR SVC ##########
from sklearn.svm import LinearSVC

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearSVC().fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_svc = round(model.score(X_train, Y_train) * 100, 2)
check_svc

In [None]:
############ K-Nearest Neighbor ########
# A classifier: for each row to predict, it searches the training set for the
# closest data points and lets them vote on the class.
# K is the number of nearest neighbours consulted. It should be tuned: too low
# and the vote is noisy, too high and classes with few points get outvoted.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be chained.
model = KNeighborsClassifier(n_neighbors=6).fit(X_train, Y_train)
Y_pred = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_knn = round(model.score(X_train, Y_train) * 100, 2)
check_knn

In [None]:
#### NAIVE BAYES ########
# The Multinomial Naive Bayes classifier, another yes/no classification method.
# It multiplies the per-feature probabilities of one outcome versus the other.
# A feature with probability 0 would zero out the whole product, so in that case
# a small extra count is added to every feature to keep each probability non-zero.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, Y_train)
Y_predict = model.predict(X_test)
# Score on the training data, expressed as a percentage with two decimals.
check_nb = round(model.score(X_train, Y_train) * 100, 2)
check_nb

In [None]:
# Collect the training score of every model in one table, best score first.
score_by_model = {
    'Logistic Regression': check_logreg,
    'Stochastic Gradient Descent': check_sgd,
    'Perceptron': check_percp,
    'Random Forest': check_rf,
    'Decision Tree': check_tree,
    'Support Vector Machine': check_svm,
    'Linear Support Vector Machine': check_svc,
    'K Nearest Neighbor': check_knn,
    'Naive Bayes': check_nb,
}
models = pd.DataFrame({
    'Model': list(score_by_model.keys()),
    'Score': list(score_by_model.values()),
})
models.sort_values(by='Score', ascending=False)

In [None]:
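#Decision tree gives the highest score. Note that every score above is computed on the training data itself, which favours models that memorise it, so it is not an estimate of performance on unseen passengers. The result.csv file contains the output predictions from the decision tree model.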