In [202]:
#Importing the required Libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [203]:
#Reading the train and test datasets.
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')

In [204]:
#Checking the rows and columns of the dataframe
df_train.shape

(891, 12)

In [205]:
#Checking the null values in the dataset
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [206]:
#Now as we can see in the above cell we have missing values in the 3 columns namely Age,Cabin,Embarked .
#The Age variable is a continuos variable and we can replace the missing values by mean .

In [207]:
df_train['Age'].fillna(df_train['Age'].mean(),inplace=True)

In [208]:
df_train['Age'].isnull().sum()

0

In [209]:
#Now we will check for the categorical variables i.e Cabin and Embarked


In [210]:
df_train['Cabin'].nunique()

147

In [211]:
#Our Dataframe contains 891 rows in total and the Cabin column has 687 missing values and 147 unique values.
#So it will be in our best intrest to drop this column as it will create lot of uncertainity in the dataset.

In [212]:
df_train.drop(['Cabin'],axis=1,inplace=True)

In [213]:
#Using axis=1 so as to specify that we are deleting a column and inplace=True so it affects our original dataset.
#Now we will try and fix the Embarked column .


In [214]:
df_train['Embarked'].nunique()

3

In [215]:
#The embarked column has only 2 null values and 3 unique so we can use the get_dummies functionn for this column.

In [216]:
Embarked_Dummies=pd.get_dummies(df_train['Embarked'])
#we will always take n-1 columns when we create dummy variables . 

In [217]:
df_train=pd.concat([df_train,Embarked_Dummies],axis=1)

In [218]:
df_train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,0,0


In [219]:
df_train.drop(['Embarked'],axis=1,inplace=True)

In [220]:
#Dropping the name column as it has all 891 unique values.
df_train.drop(['Name'],axis=1,inplace=True)

In [221]:

#Now upon examining the dataset , we find that all null values have been dealt with but the data still has categorical variables 
#So we will quickly convert those into their dummy values
df_train['Ticket'].nunique()

681

In [222]:
df_train.drop(['Ticket'],axis=1,inplace=True)

In [223]:
df_train=pd.get_dummies(df_train,columns=['Sex'])

In [224]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex_female,Sex_male
0,1,0,3,22.0,1,0,7.25,0,0,1,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,0,1,0
2,3,1,3,26.0,0,0,7.925,0,0,1,1,0
3,4,1,1,35.0,1,0,53.1,0,0,1,1,0
4,5,0,3,35.0,0,0,8.05,0,0,1,0,1


In [225]:
#Now we can see all the categorical variables have been dealt with and our dataset has only numerical values.
#Now we will scale our dataset to bring all values under o and 1 and then start the model building process.

In [226]:
from sklearn.preprocessing import MinMaxScaler

In [227]:
scaler=MinMaxScaler()

In [228]:
num_vars=['Age','Fare']
df_train[num_vars]=scaler.fit_transform(df_train[num_vars])

In [229]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex_female,Sex_male
0,1,0,3,0.271174,1,0,0.014151,0,0,1,0,1
1,2,1,1,0.472229,1,0,0.139136,1,0,0,1,0
2,3,1,3,0.321438,0,0,0.015469,0,0,1,1,0
3,4,1,1,0.434531,1,0,0.103644,0,0,1,1,0
4,5,0,3,0.434531,0,0,0.015713,0,0,1,0,1


In [230]:
train_x=df_train.drop(['Survived'],axis=1)
train_y=df_train['Survived']

In [231]:
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.metrics import f1_score

In [232]:
logreg=LogReg()

In [233]:
logreg.fit(train_x,train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Making predictions using the predict function.

In [234]:
train_predict=logreg.predict(train_x)

In [235]:
k=f1_score(train_predict,train_y)
print("Training f1_score",k)

Training f1_score 0.7317073170731707


# Making predictions using the predict_proba function

In [236]:
#Predicting over the training dataset
train_predict=logreg.predict_proba(train_x)
train_predict

array([[0.91277102, 0.08722898],
       [0.09063388, 0.90936612],
       [0.38938488, 0.61061512],
       ...,
       [0.47256565, 0.52743435],
       [0.41327219, 0.58672781],
       [0.87165118, 0.12834882]])

In [237]:
train_preds=train_predict[:,1]
train_preds

array([0.08722898, 0.90936612, 0.61061512, 0.87353449, 0.08262675,
       0.11882832, 0.28689832, 0.08076567, 0.57127271, 0.87041105,
       0.6639418 , 0.83001966, 0.11671861, 0.04290181, 0.68078611,
       0.66522948, 0.08145005, 0.21403555, 0.51685774, 0.68684384,
       0.19477362, 0.19629337, 0.73133849, 0.43396734, 0.51160735,
       0.38959695, 0.13683702, 0.32978183, 0.65202002, 0.09378929,
       0.46209347, 0.93180299, 0.65213515, 0.09747791, 0.48776592,
       0.29523755, 0.13703447, 0.11454044, 0.53286978, 0.71571077,
       0.45796588, 0.75768175, 0.13724962, 0.8871727 , 0.71157487,
       0.09403216, 0.09463989, 0.65270167, 0.08621805, 0.59950792,
       0.05735409, 0.11474594, 0.88503423, 0.74973939, 0.30712   ,
       0.42467267, 0.82528306, 0.14110898, 0.82764526, 0.03796922,
       0.16245292, 0.87151538, 0.28837022, 0.07334738, 0.52898101,
       0.10277278, 0.79421251, 0.12035001, 0.369842  , 0.06257681,
       0.20528167, 0.32536774, 0.2697743 , 0.11909091, 0.09452

In [238]:
for i in range(0,len(train_preds)):
    if(train_preds[i]>0.6):
        train_preds[i]=1
    else:
        train_preds[i]=0

In [239]:
#Now in the above command ,we have segregated the train_preds values .
#If any value >0.6 it is classified as 1 else 0.

In [240]:
f1=f1_score(train_preds,train_y)
print('Training f1 score',f1)

Training f1 score 0.7245409015025043


In [241]:
#Now as we have calculated the f1_score on our training dataset,its time we get down with our test datset.
df_test.head(5)
#We will follow the same sequence of steps that we did with training data to ensure that the test dataset has the same number of columns.

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [242]:
#Checking the rows and columns of the test dataframe.
df_test.shape

(418, 11)

In [243]:
#Now we can see that our training dataset had 891 rows and 12 columns but our test dataset has only 11 columns.
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [244]:
df_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [245]:
df_test['Age'].fillna(df_test['Age'].mean(),inplace=True)

In [246]:
df_test['Survived']=''

In [247]:
df_test.drop(['Survived'],axis = 1, inplace = True)

In [248]:
df_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [249]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [250]:
df_test.shape

(418, 11)

In [165]:
#Now we can see that we have added the new column and now we have same number of rows and columns in the train and test data.
#So we should quickly process our test data set and get started with model building.

In [251]:
df_test['Age'].isnull().sum()

0

In [252]:
#Now we will check for categorical varibles i.e cabin and embarked

In [253]:
df_test['Cabin'].nunique()

76

In [254]:
#Our test dataframe has 491 rows and the cabin column hs 370 null values and 76 unique values
#So it will be in our best intrest that we drop this column as it creates a lot of uncertainity in the dataset.

In [255]:
df_test.drop(['Cabin'],axis=1,inplace=True)

In [256]:
df_test['Embarked'].nunique()

3

In [257]:
Embarked_DummiesTest=pd.get_dummies(df_test['Embarked'])

In [258]:
df_test=pd.concat([df_test,Embarked_DummiesTest],axis=1)

In [259]:
df_test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,C,Q,S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,0,0,1


In [260]:
df_test.drop(['Embarked'],axis=1,inplace=True)

In [261]:
#Dropping the Name column as 491 unique names.
df_test.drop(['Name'],axis=1,inplace=True)

In [262]:

#Now upon examining the dataset , we find that all null values have been dealt with but the data still has categorical variables 
#So we will quickly convert those into their dummy values
df_test['Ticket'].nunique()

363

In [263]:
#Now as the Ticket column has 363 unique values out of 491 rows .
#So we will delete the column altogether.
df_test.drop(['Ticket'],axis=1,inplace=True)

In [264]:
df_test=pd.get_dummies(df_test,columns=['Sex'])

In [265]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex_female,Sex_male
0,892,3,34.5,0,0,7.8292,0,1,0,0,1
1,893,3,47.0,1,0,7.0,0,0,1,1,0
2,894,2,62.0,0,0,9.6875,0,1,0,0,1
3,895,3,27.0,0,0,8.6625,0,0,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1,1,0


In [266]:
df_test['Fare'].isnull().sum()

1

In [267]:
df_test['Fare'].fillna(df_test['Fare'].mean(),inplace=True)

In [268]:
num_vars1=['Age','Fare','PassengerId']
df_test[num_vars1]=scaler.fit_transform(df_test[num_vars1])

In [269]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex_female,Sex_male
0,0.0,3,0.452723,0,0,0.015282,0,1,0,0,1
1,0.002398,3,0.617566,1,0,0.013663,0,0,1,1,0
2,0.004796,2,0.815377,0,0,0.018909,0,1,0,0,1
3,0.007194,3,0.353818,0,0,0.016908,0,0,1,0,1
4,0.009592,3,0.287881,1,1,0.023984,0,0,1,1,0


In [271]:
test_predict=logreg.predict(df_test)


In [101]:
"""
train_predict=logreg.predict(train_x)
k=f1_score(train_predict,train_y)
print("Training f1_score",k)
""""




In [272]:
test_predict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,