# Libraries and data imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler



# Data manipulation

## Loading and analizing the data

In [3]:
df_original = pd.read_csv("/kaggle/input/titanic/train.csv")
df_original.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Unique values

In [4]:
print("Pclass: ",df_original["Pclass"].unique())
print("Sex: ",df_original["Sex"].unique())
print("SibSp: ",df_original["SibSp"].unique())
print("Parch: ",df_original["Parch"].unique())
print("SibSp: ",df_original["SibSp"].unique())
print("Parch: ",df_original["Parch"].unique())
print("Embarked: ",df_original["Embarked"].unique())
print("Total of rows", df_original.shape[0])
#To much info to be useful
#print("Ticket: ",df_original["Ticket"].unique())
#print("Cabin: ",df_original["Cabin"].unique())
#
df_original.info()

Pclass:  [3 1 2]
Sex:  ['male' 'female']
SibSp:  [1 0 3 4 2 5 8]
Parch:  [0 1 2 5 3 4 6]
SibSp:  [1 0 3 4 2 5 8]
Parch:  [0 1 2 5 3 4 6]
Embarked:  ['S' 'C' 'Q' nan]
Total of rows 891
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


As we can notice the majorite of the data are complete, this maybe be a problem when training the model since the age are one of the most importants factors to let someone use a lifeboat, 

## Removing the usuless information

In [5]:
cols_to_use = ['Survived', 'Sex','Age','Pclass']
df = df_original[cols_to_use]
df.head()

Unnamed: 0,Survived,Sex,Age,Pclass
0,0,male,22.0,3
1,1,female,38.0,1
2,1,female,26.0,3
3,1,female,35.0,1
4,0,male,35.0,3


## Creating new categories

## Changing the sex for a boolean

In [7]:
df['Sex_binary'] = (df['Sex'] == "male").astype(int)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex_binary'] = (df['Sex'] == "male").astype(int)


Unnamed: 0,Survived,Sex,Age,Pclass,Sex_binary
0,0,male,22.0,3,1
1,1,female,38.0,1,0
2,1,female,26.0,3,0
3,1,female,35.0,1,0
4,0,male,35.0,3,1


## First, Second, Third

In [9]:
df['1st_class'] = (df['Pclass'] == 1).astype(int)
df['2nd_class'] = (df['Pclass'] == 2).astype(int)
df['3rd_class'] = (df['Pclass'] == 3).astype(int)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['1st_class'] = (df['Pclass'] == 1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['2nd_class'] = (df['Pclass'] == 2).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['3rd_class'] = (df['Pclass'] == 3).astype(int)


Unnamed: 0,Survived,Sex,Age,Pclass,Sex_binary,1st_class,2nd_class,3rd_class
0,0,male,22.0,3,1,0,0,1
1,1,female,38.0,1,0,1,0,0
2,1,female,26.0,3,0,0,0,1
3,1,female,35.0,1,0,1,0,0
4,0,male,35.0,3,1,0,0,1


### Age (baby, child, teen, adult)

In [14]:
df['is_baby'] = (df["Age"] <= 6).astype(int)
df['is_child'] = ((df["Age"] > 6) & ( df["Age"] <= 13)).astype(int)
df['is_teen'] = ((df["Age"] >13) & (df["Age"] <=17)).astype(int)
df['is_adult'] = (df["Age"] >17).astype(int)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_baby'] = (df["Age"] <= 6).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_child'] = ((df["Age"] > 6) & ( df["Age"] <= 13)).astype(int)


Unnamed: 0,Survived,Sex,Age,Pclass,Sex_binary,1st_class,2nd_class,3rd_class,is_baby,is_adult,is_child,is_teen
0,0,male,22.0,3,1,0,0,1,0,1,0,0
1,1,female,38.0,1,0,1,0,0,0,1,0,0
2,1,female,26.0,3,0,0,0,1,0,1,0,0
3,1,female,35.0,1,0,1,0,0,0,1,0,0
4,0,male,35.0,3,1,0,0,1,0,1,0,0


# Training

In [20]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6 * len(df)), int(0.8 * len(df))])
print("Train shape: ", train.shape)
print("valid shape: ", valid.shape)
print("test shape: ", test.shape)

Train shape:  (534, 12)
valid shape:  (178, 12)
test shape:  (179, 12)


In [30]:
cols_to_train = ["Sex_binary","1st_class",
                 "2nd_class","3rd_class",
                 "is_baby","is_adult",
                 "is_child","is_teen"]

col_results = ["Survived"]

df_training_features = train[cols_to_train]
df_training_results = train[col_results]

df_test_features = test[cols_to_train]
df_test_results = test[col_results]

## KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [39]:
knn_model = KNeighborsClassifier(n_neighbors=4)
knn_model.fit(df_training_features,df_training_results )

  return self._fit(X, y)


In [36]:
knn_prediction = knn_model.predict(df_test_features)

In [40]:
print(classification_report(df_test_results, knn_prediction))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83       114
           1       0.74      0.52      0.61        65

    accuracy                           0.76       179
   macro avg       0.75      0.71      0.72       179
weighted avg       0.76      0.76      0.75       179



 ## Naive Bayes 

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
nb_model = GaussianNB()
nb_model = nb_model.fit(df_training_features, df_training_results)

  y = column_or_1d(y, warn=True)


In [43]:
pred_nb = nb_model.predict(df_test_features)

In [44]:
print(classification_report(df_test_results, pred_nb))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87       114
           1       0.90      0.54      0.67        65

    accuracy                           0.81       179
   macro avg       0.84      0.75      0.77       179
weighted avg       0.83      0.81      0.80       179



## Log regression

In [45]:
from sklearn.linear_model import LogisticRegression

In [47]:
lr_model = LogisticRegression()
lr_model = lr_model.fit(df_training_features, df_training_results)

  y = column_or_1d(y, warn=True)


In [48]:
y_pred_lr = lr_model.predict(df_test_features)
print(classification_report(df_test_results, y_pred_lr))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       114
           1       0.75      0.60      0.67        65

    accuracy                           0.78       179
   macro avg       0.77      0.74      0.75       179
weighted avg       0.78      0.78      0.78       179



## Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
forest = RandomForestClassifier(n_estimators=100, random_state=100)
forest.fit(df_training_features, df_training_results)

  forest.fit(df_training_features, df_training_results)


In [52]:
forest_predict = forest.predict(df_test_features)
print(classification_report(df_test_results, forest_predict))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86       114
           1       0.89      0.52      0.66        65

    accuracy                           0.80       179
   macro avg       0.84      0.74      0.76       179
weighted avg       0.82      0.80      0.79       179



# Result

The model that achived the best accuracy was the Naive Bayes

## Loading the test csv

In [69]:
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_test.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Cleaning the data

In [71]:
cols_to_use = [ 'Sex','Age','Pclass']
df_result = df_test[cols_to_use]
df_result.head()

Unnamed: 0,Sex,Age,Pclass
0,male,34.5,3
1,female,47.0,3
2,male,62.0,2
3,male,27.0,3
4,female,22.0,3


In [75]:
df_result['is_baby'] = (df_result["Age"] <= 6).astype(int)
df_result['is_child'] = ((df_result["Age"] > 6) & ( df_result["Age"] <= 13)).astype(int)
df_result['is_teen'] = ((df_result["Age"] >13) & (df_result["Age"] <=17)).astype(int)
df_result['is_adult'] = (df_result["Age"] >17).astype(int)
df_result['Sex_binary'] = (df_result['Sex'] == "male").astype(int)
df_result['1st_class'] = (df_result['Pclass'] == 1).astype(int)
df_result['2nd_class'] = (df_result['Pclass'] == 2).astype(int)
df_result['3rd_class'] = (df_result['Pclass'] == 3).astype(int)
df_result.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['is_baby'] = (df_result["Age"] <= 6).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['is_child'] = ((df_result["Age"] > 6) & ( df_result["Age"] <= 13)).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['is_teen'] = ((df_result["Age"] >13) & (df_resu

Unnamed: 0,Sex,Age,Pclass,is_baby,is_child,is_teen,is_adult,Sex_binary,1st_class,2nd_class,3rd_class
0,male,34.5,3,0,0,0,1,1,0,0,1
1,female,47.0,3,0,0,0,1,0,0,0,1
2,male,62.0,2,0,0,0,1,1,0,1,0
3,male,27.0,3,0,0,0,1,1,0,0,1
4,female,22.0,3,0,0,0,1,0,0,0,1


In [78]:
df_result_predict = df_result[cols_to_train]
df_result_predict.head()

Unnamed: 0,Sex_binary,1st_class,2nd_class,3rd_class,is_baby,is_adult,is_child,is_teen
0,1,0,0,1,0,1,0,0
1,0,0,0,1,0,1,0,0
2,1,0,1,0,0,1,0,0
3,1,0,0,1,0,1,0,0
4,0,0,0,1,0,1,0,0


## Predicting

In [87]:
pred_result = nb_model.predict(df_result_predict)
print(pred_result)

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0
 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
 0 0 1 0 1 0 0 1 0 0 0]


## Exporting the result

In [89]:

Survived = pd.Series(pred_result, index=df_test['PassengerId'], name="Survived")
Survived.head()




PassengerId
892    0
893    0
894    0
895    0
896    0
Name: Survived, dtype: int64

In [86]:
Survived.to_csv("Second_model.csv", header=True)