<a href="https://colab.research.google.com/github/Lochipi/PredictingtheSurvivalofTitanicPassengers/blob/main/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# for machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# loading dataset and reading it

df = pd.read_csv('titanic.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
len(df)
print("Printing the number of rows:",len(df))

Printing the number of rows: 891


In [4]:
# print names of columns
print(df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [5]:
# print column datatypes
print(df.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [6]:
# print a description of the data
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Print the first 10 columns
print(df.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [8]:
# Check for null values in columns
# print(df.isna())
df.isna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [9]:
# getting the sum of null values in each column
print(df.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [10]:
# Dropping Cabin column

df.drop('Cabin', axis=1, inplace=True)

In [11]:
df.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'], dtype=object)

In [12]:
# Replacing null values with the mean age of the data
# the mean is:
# df['Age'].mean() =calculating mean

df['Age'].fillna(df['Age'].mean(), inplace=True)

In [13]:
# getting boolean values for each column
pd.get_dummies(df['Sex'])

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [14]:
# one value would be enough, dropping female column
pd.get_dummies(df['Sex'],drop_first=True)

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [15]:
# Add the Gender column
df['Gender']=pd.get_dummies(df['Sex'],drop_first=True)

# Verifying the new Gender column
print(df.columns.values,df['Gender'])

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Embarked' 'Gender'] 0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Gender, Length: 891, dtype: uint8


In [16]:
# Dropping all the non-integer columns that are not useful to the model.
df.drop(['Sex','Name','Embarked','Ticket'],axis=1,inplace=True)

In [17]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Gender
0,1,0,3,22.0,1,0,7.25,1
1,2,1,1,38.0,1,0,71.2833,0
2,3,1,3,26.0,0,0,7.925,0
3,4,1,1,35.0,1,0,53.1,0
4,5,0,3,35.0,0,0,8.05,1


In [18]:
# Dropping all non interger column that are not useful in our model

df.drop(['Sex','Name','Embarked','Ticket'],axis=1,inplace=True)

KeyError: ignored

In [19]:
# the columns are dropped.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Gender'],
      dtype='object')

In [20]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Gender
0,1,0,3,22.0,1,0,7.25,1
1,2,1,1,38.0,1,0,71.2833,0
2,3,1,3,26.0,0,0,7.925,0
3,4,1,1,35.0,1,0,53.1,0
4,5,0,3,35.0,0,0,8.05,1


#sepatating the dependent and dependent variable


In [22]:
X = df[['Pclass','Age','SibSp','Parch' ,'Fare','Gender']]  #independent variables
y = df['Survived'] #dependent variable

This code below is splitting a dataset into training and testing sets using the `train_test_split` function from the Scikit-learn library in Python.

The dataset is divided into four variables: `x_train`, `x_test`, `y_train`, and `y_test`.

- `X` represents the input features or independent variables of the dataset.
- `y` represents the dependent variable or output.

The `test_size` parameter sets the proportion of the dataset that will be used as the testing set. In this case, 33% of the data set will be used for testing.

The `random_state` parameter is used to ensure that the same random data points are selected every time the code is run. This is important for reproducibility.

So, this code is creating two sets of data, one for training the model (`x_train` and `y_train`) and one for testing the model (`x_test` and `y_test`). The model will be trained on the training set and then tested on the testing set to evaluate its performance. 

#Data splitting

splitting the dataset into `training` and `testing` datasets.

In [23]:
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size = 0.33, random_state= 42)

#Training model
We train the model using `LogisticRegression` from the `sklearn.linear_models`

In [25]:
#model training
Model = LogisticRegression()
Model.fit(x_train, y_train)

#Predictions
We give the model the testing data `x-test` for it to make predictions on who survived the titanic, this predictions are stored in the variable `predict`

In [26]:
predict = Model.predict(x_test)

#Testing the models performance

We can test the model's performance by using a `confusion_matrix` which outputs a matrix with the values of `true positive, false positive ,true negative and true negative.`

In [27]:
confusion_matrix(y_test,predict)

array([[156,  19],
       [ 34,  86]])

converting to a data frame that can be easily read

In [28]:
pd.DataFrame(confusion_matrix(y_test,predict),columns=['Predicted did not survive','Predicted survived'],index=['Actually did not survive','Actually survived'])

Unnamed: 0,Predicted did not survive,Predicted survived
Actually did not survive,156,19
Actually survived,34,86


Let's generate a `classification_report` which will show the models accuracy according to its precision

In [29]:
# Generating the classification report

print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85       175
           1       0.82      0.72      0.76       120

    accuracy                           0.82       295
   macro avg       0.82      0.80      0.81       295
weighted avg       0.82      0.82      0.82       295



In [31]:
#To improve the models performance you can use more features by including the columns we dropped or using a different model.
#Tadaa! 