In [308]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
from scipy.stats import randint


Load the training dataset

In [277]:
tr = pd.read_csv('train.csv')
print(tr)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

Load the test dataset

In [278]:
te = pd.read_csv('test.csv')
print(te)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 

Print the columns in the training set so we can identify what columns are going to be useful for this 

In [279]:
tr.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Print the columns in the test set so we can identify what columns it has in common with the training set for further evaluation

In [280]:
te.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Problem 1: I ran into a problem a bit earlier where i had a lot of missing values in my training and test set, so i filled the missing values in the Age column in the training set with the median age by economic class and Sex. 


In [283]:
# Fill missing values with the median age grouped by 'Pclass' and 'Sex'
tr['Age'] = tr.groupby(['Pclass', 'Sex'])['Age'].apply(lambda x: x.fillna(x.median()))

In [284]:
# Clean the 'Sex' column
tr['Sex'] = tr['Sex'].str.strip().str.lower()

# Check unique values again
print(tr['Sex'].unique())

['male' 'female']


In [285]:
# Clean the 'Sex' column
te['Sex'] = te['Sex'].str.strip().str.lower()

# Check unique values again
print(te['Sex'].unique())

['male' 'female']


Problem 2: I ran into another problem where my ml model could not accept a string so i had to convert everything in my Sex columns to 1s and 0s so it would actually work.

In [286]:
te['Sex'] = te['Sex'].map({'male': 0, 'female': 1 })

In [287]:
tr['Sex'] = tr['Sex'].map({'male': 0, 'female': 1 })

Grouped my relevant columns into features and targets. x_train was the features for my training set and y_train was the target for my training set. Then x_test was the features for my test set.

In [288]:
x_train = tr[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]
y_train = tr['Survived']

In [289]:
x_test = te[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

Looked for missing values

In [290]:
print(x_train.isnull().sum())

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
dtype: int64


In [256]:
print(x_test.isnull().sum())

Pclass     0
Sex        0
Age       86
SibSp      0
Parch      0
dtype: int64


created an instance of random forest classifier

In [291]:
rf = RandomForestClassifier()

i used the fit() method to train my model on my training data features and targets

In [292]:
rf.fit(x_train, y_train)

RandomForestClassifier()

Checked for more missing data

In [294]:
print(x_test.isnull().sum())

Pclass     0
Sex        0
Age       86
SibSp      0
Parch      0
dtype: int64


Since i have some missing data in my age column i put the median of the age in each economic group by age into the missing parts

In [295]:
group_medians = x_train.groupby(['Pclass', 'Sex'])['Age'].median()

In [296]:
x_test['Age'] = x_test.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(group_medians[x.name]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Age'] = x_test.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(group_medians[x.name]))


In [297]:
print(x_test.isnull().sum())

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
dtype: int64


Used the cross valuation score to assess its accuracy, i used 5 scores and found the average of those scores so i can reduce variance

In [298]:
cv_scores = cross_val_score(rf, x_train, y_train, cv = 5, scoring = 'accuracy')

In [299]:
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Cross-Validation Accuracy Scores: [0.77653631 0.81460674 0.83146067 0.80898876 0.83146067]
Mean CV Accuracy: 0.8126106333563493


next i predicted the people that survived based on the trained model using the random forest classifier. 

In [302]:
y_pred = rf.predict(x_test)

then i saved the results to a dataframe called submission

In [303]:
submission = pd.DataFrame({
    'PassengerId': te['PassengerId'],  # Assuming 'PassengerId' is in the test set
    'Survived': y_pred
})

submission.to_csv('submission.csv', index=False)

Then i printed the submission just to check on it.

In [307]:
subs = pd.read_csv('submission.csv')
print(subs)

     PassengerId  Survived
0            892         0
1            893         0
2            894         1
3            895         1
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
