# comparing different models in prediction of titanic problem
## in this jupyter notebook we will compare several models in order to see which one is performing better

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [4]:
# reading csv files

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# to see how data lookes like
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

as the result of this line of code we can understand that we have to ignore cabin, since it has too many missing values but then we have to do something with age and embarked. then since the sex is not a number as well, we have to convert it to numerical value.

In [6]:
train_data['Sex']

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

# Handling missing values


In [7]:
# handling Age value
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())


#handling sex value (applying map function)
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

#handling Embarked value
Embarked_mapping = {'C' : 0, 'Q' : 1, 'S' : 2}
train_data['Embarked'] = train_data['Embarked'].map(Embarked_mapping)
test_data['Embarked'] = test_data['Embarked'].map(Embarked_mapping)

In [8]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_data[features]
y = train_data['Survived']

In [22]:
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    return scores.mean()

In [23]:
# Logistic Regression
#log_reg = LogisticRegression(max_iter=1000)
#log_reg_score = evaluate_model(log_reg, X, y)

# Decision Tree
tree = DecisionTreeClassifier(random_state=42)
tree_score = evaluate_model(tree, X, y)

# Random Forest
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest_score = evaluate_model(forest, X, y)

# XGBoost (make sure xgboost is installed)
xgb = XGBClassifier(eval_metric='logloss')
xgb_score = evaluate_model(xgb, X, y)


In [24]:
results = pd.DataFrame({
    'Model': ['Decision Tree', 'Random Forest', 'XGBoost'],
    'Accuracy': [ tree_score, forest_score, xgb_score]
})

print(results.sort_values(by='Accuracy', ascending=False))


           Model  Accuracy
2        XGBoost  0.813741
1  Random Forest  0.813722
0  Decision Tree  0.780058
