You can download the dataset from Kaggle: [data](https://www.kaggle.com/competitions/titanic/data)

In [50]:
#Import Pandas, numPy
import pandas as pd
import numpy as np

In [51]:
# Load the Kaggle test split and preview its first five rows
test = pd.read_csv(filepath_or_buffer="/content/test.csv")
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [52]:
# Load the Kaggle train split (includes the Survived label) and preview it
train = pd.read_csv(filepath_or_buffer="/content/train.csv")
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
# Summarize the train frame: column names, non-null counts and dtypes
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [54]:
# Count how many columns of train share each dtype
train.dtypes.value_counts()

int64      5
object     5
float64    2
dtype: int64

**Since we want a model that generalizes well, columns with very high cardinality are not useful. This applies to string columns; numeric columns (for example 89, 90, 91, ...) naturally have many distinct values, and that high cardinality does not necessarily hinder our analysis.**

In [55]:
# Number of distinct values per column, highest-cardinality columns first
(
    train
    .nunique()
    .sort_values(ascending=False)
)

PassengerId    891
Name           891
Ticket         681
Fare           248
Cabin          147
Age             88
SibSp            7
Parch            7
Pclass           3
Embarked         3
Survived         2
Sex              2
dtype: int64

**For this reason, we can exclude Name, Ticket and Cabin. Of course, if we delete the columns from the train table, we will also need to delete them from the test table. We can't exclude PassengerId yet, because the file we will submit from the 'test' data has just PassengerId and Survived!**

In [56]:
# Remove the high-cardinality text columns from train
train = train.drop(columns=['Name', 'Ticket', 'Cabin'])

In [57]:
# Remove the same high-cardinality text columns from test
test = test.drop(columns=['Name', 'Ticket', 'Cabin'])

In [58]:
# Count missing values per column in train, most affected columns first
train.isnull().sum().sort_values(ascending=False)

Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Sex              0
SibSp            0
Parch            0
Fare             0
dtype: int64

**It is important to check both datasets: even though they share the same variables, each one has its own null values, and any null value in the test table that is not handled the way it was in the train table will cause an error in our evaluation.**

In [59]:
# Count missing values per column in test, most affected columns first
test.isnull().sum().sort_values(ascending=False)

Age            86
Fare            1
PassengerId     0
Pclass          0
Sex             0
SibSp           0
Parch           0
Embarked        0
dtype: int64

In [60]:
# Handle missing values, starting with Age:
# each split is filled with the mean age of that same split.
train_age = train['Age'].mean()
test_age = test['Age'].mean()
train['Age'] = train['Age'].fillna(train_age)
test['Age'] = test['Age'].fillna(test_age)

In [61]:
# Fill the missing Fare values in test with the mean fare of test
test_fare = test['Fare'].mean()
test['Fare'] = test['Fare'].fillna(test_fare)

In [62]:
# Fill the missing Embarked values in train with the most frequent port
train_embarked = train['Embarked'].mode().iloc[0]
train['Embarked'] = train['Embarked'].fillna(train_embarked)

# Let's check that there are no null values left in either dataset


In [63]:
# Confirm that train has no missing values left
train.isna().sum().sort_values(ascending=True)

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [64]:
# Confirm that test has no missing values left
test.isna().sum().sort_values(ascending=True)

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

# Now it is necessary to create a dataset with numbers only!

In [65]:
# Names of the train columns whose dtype is not 'object'
columns_train_numbers = train.select_dtypes(exclude='object').columns
columns_train_numbers

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [66]:
# Keep only the non-object columns of train
train_numbers = train[columns_train_numbers]
train_numbers

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.000000,1,0,7.2500
1,2,1,1,38.000000,1,0,71.2833
2,3,1,3,26.000000,0,0,7.9250
3,4,1,1,35.000000,1,0,53.1000
4,5,0,3,35.000000,0,0,8.0500
...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000
887,888,1,1,19.000000,0,0,30.0000
888,889,0,3,29.699118,1,2,23.4500
889,890,1,1,26.000000,0,0,30.0000


In [67]:
# Names of the test columns whose dtype is not 'object'
columns_test_numbers = test.select_dtypes(exclude='object').columns
columns_test_numbers

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [68]:
# Keep only the non-object columns of test
test_numbers = test[columns_test_numbers]
test_numbers

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.50000,0,0,7.8292
1,893,3,47.00000,1,0,7.0000
2,894,2,62.00000,0,0,9.6875
3,895,3,27.00000,0,0,8.6625
4,896,3,22.00000,1,1,12.2875
...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500
414,1306,1,39.00000,0,0,108.9000
415,1307,3,38.50000,0,0,7.2500
416,1308,3,30.27259,0,0,8.0500


**Before testing any models, we need to separate the training base and the validation base. For this we will use:**

[Train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [69]:
#Import
from sklearn.model_selection import train_test_split

In [70]:
# Features: every numeric column except the identifier and the label.
# Target: the Survived label.
X = train_numbers.drop(columns=['PassengerId', 'Survived'])
y = train['Survived']

In [71]:
# Hold out 33% of the rows for validation; the seed is fixed at 42
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

**The competition is evaluated with this metric:**

> **Metric:** Your score is the percentage of passengers you correctly predict. This is known as *accuracy*.

**For this reason, we will test the accuracy of:**

[Decision Trees](https://scikit-learn.org/stable/modules/tree.html#classification)

[KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier)

[LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

In [72]:
#Import
from sklearn import tree

In [73]:
# Create the decision-tree classifier; random_state is fixed at 42
clf_dt = tree.DecisionTreeClassifier(random_state = 42)

In [74]:
# Fit the decision tree on the training split
clf_dt = clf_dt.fit(X_train, y_train)

In [75]:
# Predict survival for the validation split with the decision tree
y_pred_dt = clf_dt.predict(X_validation)

# Let's do the same job with the KNeighborsClassifier

In [76]:
#Import
from sklearn.neighbors import KNeighborsClassifier

In [77]:
# Create the k-nearest-neighbors classifier with n_neighbors set to 3
clf_knc = KNeighborsClassifier(n_neighbors=3)

In [78]:
# Fit the k-nearest-neighbors classifier on the training split
clf_knc = clf_knc.fit(X_train, y_train)

In [79]:
# Predict survival for the validation split with k-nearest neighbors
y_pred_knc = clf_knc.predict(X_validation)

# Now with the Logistic Regression

In [80]:
#Import
from sklearn.linear_model import LogisticRegression

In [81]:
# Create the logistic-regression classifier; random_state is fixed at 42
clf_lr = LogisticRegression(random_state = 42)

In [82]:
# Fit the logistic regression on the training split
clf_lr = clf_lr.fit(X_train, y_train)

In [83]:
# Predict survival for the validation split with logistic regression
y_pred_lr = clf_lr.predict(X_validation)

**As I said a few cells ago, the metric will be accuracy, so it's time to check the best model to find the best result**


[Accuracy](https://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score)

In [84]:
#Import
from sklearn.metrics import accuracy_score

#Accuracies

In [85]:
# Share of validation passengers the decision tree classified correctly
accuracy_score(y_true=y_validation, y_pred=y_pred_dt)

0.6169491525423729

In [86]:
# Share of validation passengers k-nearest neighbors classified correctly
accuracy_score(y_true=y_validation, y_pred=y_pred_knc)

0.6610169491525424

In [87]:
# Share of validation passengers logistic regression classified correctly
accuracy_score(y_true=y_validation, y_pred=y_pred_lr)

0.7254237288135593

# The best model to use is the Logistic Regression!

**We can also use the confusion_matrix to better visualize the distribution of errors and evaluate the accuracy of a classification.**


[Confusion Matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)

In [88]:
#Import
from sklearn.metrics import confusion_matrix

In [89]:
# Confusion matrix for the decision tree (rows: true class, columns: predicted class)
confusion_matrix(y_true=y_validation, y_pred=y_pred_dt)

array([[125,  50],
       [ 63,  57]])

In [90]:
# Confusion matrix for k-nearest neighbors (rows: true class, columns: predicted class)
confusion_matrix(y_true=y_validation, y_pred=y_pred_knc)

array([[133,  42],
       [ 58,  62]])

In [91]:
# Confusion matrix for logistic regression (rows: true class, columns: predicted class)
confusion_matrix(y_true=y_validation, y_pred=y_pred_lr)

array([[156,  19],
       [ 62,  58]])

In [92]:
# Preview the first five rows of the training features (numeric columns only)
X_train.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
6,1,54.0,0,0,51.8625
718,3,29.699118,0,0,15.5
685,2,25.0,1,2,41.5792
73,3,26.0,1,0,14.4542
882,3,22.0,0,0,10.5167


In [93]:
# Preview the first five rows of the numeric test columns for comparison with X_train
test_numbers.head(5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.5,0,0,7.8292
1,893,3,47.0,1,0,7.0
2,894,2,62.0,0,0,9.6875
3,895,3,27.0,0,0,8.6625
4,896,3,22.0,1,1,12.2875


# It is possible to see that X_train doesn't have the column 'PassengerId'

In [94]:
# Drop the identifier so X_test has the same feature columns as X_train
X_test = test_numbers.drop(columns=['PassengerId'])

In [95]:
# Predict survival for the test data with the fitted logistic regression
y_pred = clf_lr.predict(X_test)

In [96]:
# Store the predictions in a new 'Survived' column of test (modifies test in place)
test['Survived'] = y_pred

In [97]:
# Keep only the 'PassengerId' and 'Survived' columns for the submission.
# They are taken from the cleaned test frame.
result = test.loc[:, ['PassengerId', 'Survived']]

In [98]:
# Write the submission to result.csv without the index column
result.to_csv(path_or_buf='result.csv', index=False)