# Applying classifiers to the Titanic dataset

# loading and exploring data

In [1]:
import pandas as pd
# Path is resolved relative to the notebook's working directory.
# NOTE(review): the recorded output shows this file holds only PassengerId and
# Survived columns, so it looks like a sample-submission file rather than the
# passenger feature table. Confirm this is the intended input before trusting
# any model trained on it.
file_path = 'gender_submission.csv'
data = pd.read_csv(file_path)

# Preview the first rows, then summarise dtypes and non-null counts.
print(data.head())
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()


   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB
None


# preprocessing of the data

In [9]:
# Show the per-column null counts, then keep only rows with no missing values.
print("Missing values per column:", data.isna().sum(), sep="\n")

data = data.dropna(axis=0, how="any")



Missing values per column:
PassengerId    0
Survived       0
dtype: int64


# splitting the data

In [11]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining feature columns.
# NOTE(review): the recorded shapes show X has a single column — verify that
# the loaded file really contains the features meant for modelling.
y = data['Survived']
X = data.drop('Survived', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (334, 1)
Test set size: (84, 1)


# Applying classifiers

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline classifiers with default hyperparameters. The tree-based estimators
# are randomised, so they get a fixed random_state (the same value used for the
# train/test split) to make the reported accuracies repeatable across runs.
log_reg = LogisticRegression()
dec_tree = DecisionTreeClassifier(random_state=42)
rand_forest = RandomForestClassifier(random_state=42)


models = {'Logistic Regression': log_reg,
          'Decision Tree': dec_tree,
          'Random Forest': rand_forest}

# Fit each model on the training split and score it on the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")


Logistic Regression Accuracy: 0.60
Decision Tree Accuracy: 0.52
Random Forest Accuracy: 0.52


# Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; the search tries every combination
# (3 x 3 x 3 = 27 settings).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive 5-fold cross-validated search, fitted on the training split only.
# The forest is seeded so repeated runs select the same best parameters and
# report the same cross-validation score.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}
Best cross-validation score: 0.5450022614201718
