In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import all the Required Libraries. 

# **Reading Data**

The code cell above is attached to every Kaggle notebook by default for preliminary setup. The very first step is to read the data (both the train and the test set); for this purpose we use the pandas library.

In [None]:
# Read the Titanic train and test CSV files into DataFrames

train_csv = '../input/titanic/train.csv'
test_csv = '../input/titanic/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

To get an overview of the data, view its first 5 rows. The line of code below returns the first five rows.

In [None]:
# Preview the first five rows of the training data

df_train.head(n=5)

The code below gives a statistical description of the numeric columns, i.e. count, mean, standard deviation, minimum, quartiles (including the median) and maximum.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

df_train.describe()

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the training data

df_train.info()

# Missing Value Analysis- Training Data

**Count of Missing Values Before Analysis**

In [None]:
# Number of missing (null) values in each feature/column

df_train.isna().sum()

In [None]:
# Percentage of rows with a missing value in the Age and Cabin features:
# number of nulls in the column divided by the total row count, times 100

total_rows = df_train.shape[0]
per_age = df_train['Age'].isnull().sum() / total_rows * 100
per_cabin = df_train['Cabin'].isnull().sum() / total_rows * 100
print("Percentage missing in Age : ", per_age)
print("Percentage Missing in Cabin : ", per_cabin)

**Drop missing values, or, if a column has too many of them, remove that feature column itself**

In [None]:
# Cabin has more than 77% missing values, so remove the whole column
# (columns=... means a column is dropped, not a row).
# drop() without inplace builds a new DataFrame instead of modifying the
# loaded one; the name df_train is then rebound to that new frame.

df_train = df_train.drop(columns=['Cabin'])

In [None]:
# Category-wise count: for every unique value of the Embarked feature,
# show how many rows have that value

df_train['Embarked'].value_counts()

In [None]:
# Fill missing Embarked values with 'S'.
# Why 'S'? As per the value counts above, 'S' occurs far more often than 'C' and 'Q'.
# For numerical data we could fill in the average instead, but Embarked is
# categorical, so an average is not defined.
#
# The filled column is assigned back explicitly. Calling
# df_train['Embarked'].fillna(..., inplace=True) operates on a temporary Series:
# it raises a FutureWarning on recent pandas and leaves df_train unchanged
# once Copy-on-Write is enabled.

df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [None]:
import seaborn as sns  # library using for plots
import matplotlib.pyplot as plt
# White background with grid lines, on a 13x6 inch figure
sns.set_style('whitegrid')
plt.figure(figsize=(13, 6))

# Histogram of Age with 80 bins, so the count for each age (from 1 to 80) is visible
sns.histplot(data=df_train, x='Age', kde=False, bins=80, color='red')

In [None]:
# Age is numerical, so missing values can be filled with the mean or the median
# of the column. Here the median is used.
#
# The filled column is assigned back explicitly. Calling
# df_train['Age'].fillna(..., inplace=True) operates on a temporary Series:
# it raises a FutureWarning on recent pandas and leaves df_train unchanged
# once Copy-on-Write is enabled.

age_median = df_train['Age'].median()
df_train['Age'] = df_train['Age'].fillna(age_median)

**Count of Missing Values After Analysis**

In [None]:
# Re-check the null count per column; it should now be zero for every column
df_train.isna().sum()

**Create New Feature TravelAlone**

* SibSp - It is number of siblings/spouses with passenger
* Parch - It is number of parents/children with passenger

* To reduce the number of features, we can combine them into a single feature, TravelAlone.
* Add the values of the SibSp and Parch columns.
* If the sum is more than 0 (the passenger has family aboard), put 0 in the corresponding cell of TravelAlone.
* If the sum is not more than 0 (no family aboard), put 1 in the corresponding cell of TravelAlone.


In [None]:
# TravelAlone is 1 when the passenger has no siblings/spouses/parents/children
# aboard (SibSp + Parch == 0) and 0 otherwise
relatives_aboard = df_train['SibSp'] + df_train['Parch']
df_train['TravelAlone'] = np.where(relatives_aboard > 0, 0, 1)

# SibSp and Parch are now summarised by TravelAlone, so drop both columns
df_train.drop(['SibSp', 'Parch'], axis=1, inplace=True)

**Dummy Encoding:** This is a technique for converting a categorical variable into numeric variables. For example, suppose we have a Sex column with the values 'M' for male and 'F' for female. We cannot train on or process the characters M and F together with the other numerical data, so we need to convert them into numerical values. For this conversion we use the dummy encoding technique.

* It creates two columns for the Sex feature: Sex_male and Sex_female.
* The Sex_male column gets 1 if the value in the Sex column is 'M' and 0 if it is 'F'.
* The Sex_female column is filled the opposite way.

In [None]:
# Dummy-encode the 'Pclass', 'Sex' and 'Embarked' features, then remove the
# columns that are not used for training:
#   * PassengerId, Name, Ticket - information unique to each passenger, so it
#     is irrelevant for training
#   * Sex_female - Sex has only two categories, so one dummy column is enough:
#     Sex_male == 0 means a female passenger, Sex_male == 1 a male passenger
train_data = (
    pd.get_dummies(df_train, columns=['Pclass', 'Sex', 'Embarked'])
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Sex_female'])
)

final_data = train_data
final_data

In [None]:
# Preview the first five rows of the encoded training data
final_data.head(n=5)

In [None]:
# Feature columns fed to the models (X) and the target column (Y)
train_features = [
    'Age', 'Fare', 'TravelAlone',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
X = final_data.loc[:, train_features]
Y = final_data['Survived']

# Missing Data Analysis of Test Data 

This section is just a recap of the steps performed above. The only difference is that here we preprocess the test data, whereas previously we preprocessed the training data. It serves as a quick revision of the preprocessing steps, applied to the test data.

In [None]:
# Preview the first five rows of the test data
df_test.head(n=5)

In [None]:
# Number of missing (null) values in each column of the test data
df_test.isna().sum()

In [None]:
# Drop the Cabin column from the test data
df_test.drop('Cabin', axis=1, inplace=True)

# Fill missing ages with the median age of the test data. The filled column is
# assigned back explicitly: df_test['Age'].fillna(..., inplace=True) operates
# on a temporary Series, raises a FutureWarning on recent pandas and leaves
# df_test unchanged once Copy-on-Write is enabled.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())


In [None]:
# Fill missing fares with the mean fare of the test data. The filled column is
# assigned back explicitly: df_test['Fare'].fillna(..., inplace=True) operates
# on a temporary Series, raises a FutureWarning on recent pandas and leaves
# df_test unchanged once Copy-on-Write is enabled.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())

In [None]:
# Re-check the null count per column of the test data after filling
df_test.isna().sum()

In [None]:
# TravelAlone is 1 when SibSp + Parch is 0, otherwise 0; the two source
# columns are dropped afterwards
relatives_aboard_test = df_test['SibSp'] + df_test['Parch']
df_test['TravelAlone'] = np.where(relatives_aboard_test > 0, 0, 1)
df_test.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [None]:
# Dummy-encode Pclass, Sex and Embarked in the test data, then drop the
# PassengerId, Name, Ticket and Sex_female columns
test_data = (
    pd.get_dummies(df_test, columns=['Pclass', 'Sex', 'Embarked'])
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Sex_female'])
)


In [None]:
# Display the encoded test data
test_data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Logistic regression: fit on the training split, report accuracy on the held-out split
reg_log = LogisticRegression(solver='lbfgs', max_iter=1000).fit(x_train, y_train)
log_pred = reg_log.predict(x_test)
accuracy_score(y_true=y_test, y_pred=log_pred)

In [None]:
# Linear support vector classifier: fit on the training split, report accuracy
# on the held-out split.
# random_state is fixed so the result is repeatable between runs, consistent
# with the seeded train/test split (the seed is only used when the dual
# solver is selected).
reg_svc = LinearSVC(random_state=0)
reg_svc.fit(x_train, y_train)
svm_pred = reg_svc.predict(x_test)
accuracy_score(y_test, svm_pred)

In [None]:
# Random forest with 100 trees: fit on the training split, report accuracy on
# the held-out split.
# random_state is fixed because the forest is built from random bootstrap
# samples and feature subsets; without a seed both this accuracy and the
# predictions made later with reg_rand change on every run.
reg_rand = RandomForestClassifier(n_estimators=100, random_state=0)
reg_rand.fit(x_train, y_train)
rf_pred = reg_rand.predict(x_test)
# Arguments in (y_true, y_pred) order, like the other model cells
accuracy_score(y_test, rf_pred)

In [None]:
# K-nearest neighbours (9 neighbours, distance-weighted votes): fit on the
# training split, report accuracy on the held-out split
reg_KNN = KNeighborsClassifier(n_neighbors=9, weights='distance').fit(x_train, y_train)
knn_pred = reg_KNN.predict(x_test)
accuracy_score(y_true=y_test, y_pred=knn_pred)

In [None]:
# Multi-layer perceptron with three hidden layers of 11 units each: fit on the
# training split, report accuracy on the held-out split.
# random_state is fixed because weight initialisation and batch shuffling are
# random; without a seed the accuracy changes on every run.
mlp = MLPClassifier(
    hidden_layer_sizes=(11, 11, 11),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=0,
)
mlp.fit(x_train, y_train)

mlp_pred = mlp.predict(x_test)
accuracy_score(y_test, mlp_pred)

In [None]:
# Predict survival for the test passengers with the random forest.
# The columns are selected by name, in the same order used for fitting
# (train_features), so the prediction does not depend on whatever column order
# the test preprocessing happens to produce.
final_pred = reg_rand.predict(test_data[train_features])


In [None]:
# Pair each test PassengerId with its predicted Survived value and write the
# result to submission.csv without the index column
submission_columns = {"PassengerId": df_test['PassengerId'], "Survived": final_pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)