In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

from preprocessing import prepare_dataset


# Reading the dataset

In [2]:
# Load the training CSV, using the PassengerId column as the row index.
data = pd.read_csv('train.csv', index_col='PassengerId')
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Evaluation function

In [3]:
def evaluate_dataset(dataset: pd.DataFrame, random_state=None) -> None:
    """Cross-validate three classifiers on ``dataset`` and print their scores.

    The ``Survived`` column is used as the target and every other column is
    passed to the models as a feature. An SVC, a Perceptron and a random
    forest limited to depth 5 are each scored with shuffled, stratified
    5-fold cross-validation. ``cross_val_score`` is called without a
    ``scoring`` argument, so each estimator's default score is reported
    (mean accuracy for scikit-learn classifiers).

    Args:
        dataset: Frame holding a ``Survived`` column plus the feature columns.
        random_state: Seed forwarded to every model and to the fold splitter.

    Returns:
        None. The per-fold scores and their mean, both rounded to two
        decimals, are only printed.
    """
    X = dataset.drop('Survived', axis=1)
    y = dataset['Survived']

    models = [
        ('SVC', SVC(random_state=random_state)),
        ('Perceptron', Perceptron(random_state=random_state)),
        ('Random Forest', RandomForestClassifier(random_state=random_state, max_depth=5))
    ]

    # The splitter's configuration does not depend on the model, so build it once.
    cv = StratifiedKFold(5, random_state=random_state, shuffle=True)

    for name, model in models:
        scores = cross_val_score(model, X, y, cv=cv)
        # Convert to built-in floats before printing: a list of NumPy scalars
        # is rendered as ``np.float64(0.83)`` by NumPy >= 2, which would make
        # the output unreadable and different from earlier runs.
        rounded_scores = scores.round(2).tolist()

        print(name)
        print(f'Scores: {rounded_scores}')
        print(f'Mean score: {round(sum(scores) / len(scores), 2)}')
        print()

# Testing datasets

## Unprepared dataset

In [4]:
# Baseline: a handful of raw columns with only the minimum encoding needed
# for the models to accept them.
baseline_columns = ['Pclass', 'Age', 'Fare', 'Sex', 'Survived']
unprepared = (
    data[baseline_columns]
    # 'male' becomes 1.0; every other value (including missing) becomes 2.0.
    .assign(Sex=lambda frame: frame['Sex'].eq('male').map({True: 1.0, False: 2.0}))
    # Remaining gaps are filled with a plain zero.
    .fillna(0)
)

evaluate_dataset(unprepared, random_state=17)

SVC
Scores: [0.68, 0.69, 0.67, 0.67, 0.7]
Mean score: 0.68

Perceptron
Scores: [0.73, 0.71, 0.66, 0.57, 0.61]
Mean score: 0.66

Random Forest
Scores: [0.84, 0.8, 0.75, 0.83, 0.82]
Mean score: 0.81



## Prepared dataset

In [5]:
# Build the prepared frame with the prepare_dataset helper imported from the
# local preprocessing module (its individual steps are not shown in this notebook).
improved_dataset = prepare_dataset(data)
# Preview the first rows of the prepared frame.
improved_dataset.head()

Unnamed: 0_level_0,Survived,Pclass,Age,Fare,Deck,FamilySize,Alone,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,-0.565736,-0.502445,8,1,False,False,False,True,False,False,True,False,False,False,True
2,1,1,0.663861,0.786845,2,1,False,True,False,False,False,False,False,True,False,True,False
3,1,3,-0.258337,-0.488854,8,0,True,False,False,True,False,True,False,False,False,True,False
4,1,1,0.433312,0.42073,2,1,False,False,False,True,False,False,False,True,False,True,False
5,0,3,0.433312,-0.486337,8,0,True,False,False,True,False,False,True,False,False,False,True


In [6]:
# Use the same seed as the unprepared run so both evaluations share model seeds and fold splits.
evaluate_dataset(improved_dataset, random_state=17)

SVC
Scores: [0.83, 0.8, 0.8, 0.84, 0.84]
Mean score: 0.82

Perceptron
Scores: [0.82, 0.8, 0.7, 0.78, 0.87]
Mean score: 0.79

Random Forest
Scores: [0.84, 0.81, 0.8, 0.85, 0.83]
Mean score: 0.83

