# Titanic Survival Prediction

## Importing the libraries

In [436]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the training dataset

In [437]:
trainingDF = pd.read_csv("..\\Datasets\\train.csv", encoding = "utf-8", low_memory = False)

print(trainingDF.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


## Importing the test dataset

In [438]:
testingDF = pd.read_csv("..\\Datasets\\test.csv", encoding = "utf-8", low_memory = False)

print(testingDF.head())

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


## Number of NULL values in the training dataset

In [439]:
print(trainingDF.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Number of NULL values in the testing dataset

In [440]:
print(testingDF.isna().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## Storing 'PassengerId' of testing dataset for later using

In [441]:
testingDFPassengerId = testingDF["PassengerId"].values

## Cleaning the training dataset

In [442]:
trainingDF = trainingDF.drop(["PassengerId", "Name", "Ticket", "Fare", "Cabin"], axis = 1)

print(trainingDF)

     Survived  Pclass     Sex   Age  SibSp  Parch Embarked
0           0       3    male  22.0      1      0        S
1           1       1  female  38.0      1      0        C
2           1       3  female  26.0      0      0        S
3           1       1  female  35.0      1      0        S
4           0       3    male  35.0      0      0        S
..        ...     ...     ...   ...    ...    ...      ...
886         0       2    male  27.0      0      0        S
887         1       1  female  19.0      0      0        S
888         0       3  female   NaN      1      2        S
889         1       1    male  26.0      0      0        C
890         0       3    male  32.0      0      0        Q

[891 rows x 7 columns]


## Cleaning the testing dataset

In [443]:
testingDF = testingDF.drop(["PassengerId", "Name", "Ticket", "Fare", "Cabin"], axis = 1)

print(testingDF)

     Pclass     Sex   Age  SibSp  Parch Embarked
0         3    male  34.5      0      0        Q
1         3  female  47.0      1      0        S
2         2    male  62.0      0      0        Q
3         3    male  27.0      0      0        S
4         3  female  22.0      1      1        S
..      ...     ...   ...    ...    ...      ...
413       3    male   NaN      0      0        S
414       1  female  39.0      0      0        C
415       3    male  38.5      0      0        S
416       3    male   NaN      0      0        S
417       3    male   NaN      1      1        C

[418 rows x 6 columns]


## Imputing NULL values in training dataset

In [444]:
from sklearn.impute import SimpleImputer

categoricalImputer = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
numericalImputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

trainingDF["Age"] = numericalImputer.fit_transform(trainingDF["Age"].values.reshape(-1, 1)).flatten()
# trainingDF["Cabin"] = categoricalImputer.fit_transform(trainingDF["Cabin"].values.reshape(-1, 1)).flatten()
trainingDF["Embarked"] = categoricalImputer.fit_transform(trainingDF["Embarked"].values.reshape(-1, 1)).flatten()

## Imputing NULL values in testing dataset

In [445]:
from sklearn.impute import SimpleImputer

categoricalImputer = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
numericalImputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

testingDF["Age"] = numericalImputer.fit_transform(testingDF["Age"].values.reshape(-1, 1)).flatten()
# testingDF["Cabin"] = categoricalImputer.fit_transform(testingDF["Cabin"].values.reshape(-1, 1)).flatten()

## Check for null values in training and testing datasets

In [446]:
print(trainingDF.isna().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64


In [447]:
print(testingDF.isna().sum())

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64


## Splitting the features and target variables from training dataset

In [448]:
x_train = trainingDF.iloc[:, 1:].values
y_train = trainingDF.iloc[:, 0].values

In [449]:
print(x_train)

[[3 'male' 22.0 1 0 'S']
 [1 'female' 38.0 1 0 'C']
 [3 'female' 26.0 0 0 'S']
 ...
 [3 'female' 29.69911764705882 1 2 'S']
 [1 'male' 26.0 0 0 'C']
 [3 'male' 32.0 0 0 'Q']]


In [450]:
print(y_train)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0
 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0
 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1
 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 1 0 1 0 0 1 0 0 0 

## Converting features to training dataset to numpy array

In [451]:
x_test = testingDF.values

In [452]:
print(x_test)

[[3 'male' 34.5 0 0 'Q']
 [3 'female' 47.0 1 0 'S']
 [2 'male' 62.0 0 0 'Q']
 ...
 [3 'male' 38.5 0 0 'S']
 [3 'male' 30.272590361445783 0 0 'S']
 [3 'male' 30.272590361445783 1 1 'C']]


## Label encoding Sex column in training dataset

In [453]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
x_train[:, 1] = le.fit_transform(x_train[:, 1])

In [454]:
print(x_train)

[[3 1 22.0 1 0 'S']
 [1 0 38.0 1 0 'C']
 [3 0 26.0 0 0 'S']
 ...
 [3 0 29.69911764705882 1 2 'S']
 [1 1 26.0 0 0 'C']
 [3 1 32.0 0 0 'Q']]


## Label encoding Sex column in testing dataset

In [455]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
x_test[:, 1] = le.fit_transform(x_test[:, 1])

In [456]:
print(x_test)

[[3 1 34.5 0 0 'Q']
 [3 0 47.0 1 0 'S']
 [2 1 62.0 0 0 'Q']
 ...
 [3 1 38.5 0 0 'S']
 [3 1 30.272590361445783 0 0 'S']
 [3 1 30.272590361445783 1 1 'C']]


## One-Hot Encoding the Cabin and Embarked columns in training dataset

In [457]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [("encoder_embarked", OneHotEncoder(), [5])], remainder = "passthrough")
x_train = np.array(ct.fit_transform(x_train))

# x_train = x_train.toarray()

In [458]:
print(x_train)

[[0.0 0.0 1.0 ... 22.0 1 0]
 [1.0 0.0 0.0 ... 38.0 1 0]
 [0.0 0.0 1.0 ... 26.0 0 0]
 ...
 [0.0 0.0 1.0 ... 29.69911764705882 1 2]
 [1.0 0.0 0.0 ... 26.0 0 0]
 [0.0 1.0 0.0 ... 32.0 0 0]]


## One-Hot Encoding the Cabin and Embarked columns in testing dataset

In [459]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [("encoder_embarked", OneHotEncoder(), [5])], remainder = "passthrough")
x_test = np.array(ct.fit_transform(x_test))

# x_test = x_test.toarray()

In [460]:
print(x_test)

[[0.0 1.0 0.0 ... 34.5 0 0]
 [0.0 0.0 1.0 ... 47.0 1 0]
 [0.0 1.0 0.0 ... 62.0 0 0]
 ...
 [0.0 0.0 1.0 ... 38.5 0 0]
 [0.0 0.0 1.0 ... 30.272590361445783 0 0]
 [1.0 0.0 0.0 ... 30.272590361445783 1 1]]


## Logistic Regression

### Training the Logistic Regression model on the training dataset

In [461]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state = 42)
classifier.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Predicting the testing dataset values

In [462]:
y_pred = classifier.predict(x_test)

In [463]:
print(y_pred)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 1 1 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


## Dump result to CSV file

In [467]:
result = np.concatenate((testingDFPassengerId.reshape(len(testingDFPassengerId), 1), y_pred.reshape(len(y_pred), 1)), 1)

print(result)

[[ 892    0]
 [ 893    0]
 [ 894    0]
 [ 895    0]
 [ 896    1]
 [ 897    0]
 [ 898    1]
 [ 899    0]
 [ 900    1]
 [ 901    0]
 [ 902    0]
 [ 903    0]
 [ 904    1]
 [ 905    0]
 [ 906    1]
 [ 907    1]
 [ 908    0]
 [ 909    0]
 [ 910    1]
 [ 911    1]
 [ 912    0]
 [ 913    0]
 [ 914    1]
 [ 915    1]
 [ 916    1]
 [ 917    0]
 [ 918    1]
 [ 919    0]
 [ 920    0]
 [ 921    0]
 [ 922    0]
 [ 923    0]
 [ 924    0]
 [ 925    0]
 [ 926    1]
 [ 927    0]
 [ 928    1]
 [ 929    1]
 [ 930    0]
 [ 931    0]
 [ 932    0]
 [ 933    0]
 [ 934    0]
 [ 935    1]
 [ 936    1]
 [ 937    0]
 [ 938    0]
 [ 939    0]
 [ 940    1]
 [ 941    0]
 [ 942    0]
 [ 943    0]
 [ 944    1]
 [ 945    1]
 [ 946    0]
 [ 947    0]
 [ 948    0]
 [ 949    0]
 [ 950    0]
 [ 951    1]
 [ 952    0]
 [ 953    0]
 [ 954    0]
 [ 955    1]
 [ 956    1]
 [ 957    1]
 [ 958    1]
 [ 959    0]
 [ 960    1]
 [ 961    1]
 [ 962    1]
 [ 963    0]
 [ 964    1]
 [ 965    1]
 [ 966    1]
 [ 967    1]
 [ 968    0]

In [468]:
resultDF = pd.DataFrame(result, columns = ["PassengerId", "Survived"])

In [476]:
print(resultDF)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [478]:
resultDF.to_csv("..\\Datasets\\survived_solution.csv", index = False)