In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

In [2]:
try:
    dfTitanicFull = pd.read_csv('train.csv')
    print("PREPROCESSING STEPS:")
    print(f"1. Initial missing values:\n{dfTitanicFull[['Age', 'Embarked', 'Cabin']].isnull().sum()}")
    dfQ6 = dfTitanicFull.copy()
    dfQ6['Age'].fillna(dfQ6['Age'].median(), inplace=True)
    dfQ6['Embarked'].fillna(dfQ6['Embarked'].mode()[0], inplace=True)
    dfQ6['Fare'].fillna(dfQ6['Fare'].median(), inplace=True)
    print("\n2. Filled Age with median, Embarked with mode")
    dfQ6['Sex'] = LabelEncoder().fit_transform(dfQ6['Sex'])
    dfQ6['Embarked'] = LabelEncoder().fit_transform(dfQ6['Embarked'])
    print("3. Encoded Sex and Embarked")
    featuresQ6 = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch']
    XQ6 = dfQ6[featuresQ6]
    YQ6 = dfQ6['Survived']
    XTrainQ6, XTestQ6, yTrainQ6, yTestQ6 = train_test_split(XQ6, YQ6, test_size=0.2, random_state=42)
    dtQ6 = DecisionTreeClassifier(max_depth=5, min_samples_split=20, random_state=42)
    dtQ6.fit(XTrainQ6, yTrainQ6)
    yPredictionQ6 = dtQ6.predict(XTestQ6)
    print(f"\n4. RESULTS AFTER PREPROCESSING:")
    print(f"   Accuracy: {accuracy_score(yTestQ6, yPredictionQ6):.4f}")
    print(f"   Precision: {precision_score(yTestQ6, yPredictionQ6):.4f}")
    print(f"   Recall: {recall_score(yTestQ6, yPredictionQ6):.4f}")
    print("\n5. DISCUSSION:")
    print("   Preprocessing improved model by:")
    print("   - Utilizing more data (no rows dropped due to missing values)")
    print("   - Including more predictive features (Embarked, SibSp, Parch)")
    print("   - Better generalization through proper train-test split")
except FileNotFoundError:
    print("Titanic dataset not found. Please download from Kaggle.")

PREPROCESSING STEPS:
1. Initial missing values:
Age         177
Embarked      2
Cabin       687
dtype: int64

2. Filled Age with median, Embarked with mode
3. Encoded Sex and Embarked

4. RESULTS AFTER PREPROCESSING:
   Accuracy: 0.7933
   Precision: 0.8246
   Recall: 0.6351

5. DISCUSSION:
   Preprocessing improved model by:
   - Utilizing more data (no rows dropped due to missing values)
   - Including more predictive features (Embarked, SibSp, Parch)
   - Better generalization through proper train-test split


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfQ6['Age'].fillna(dfQ6['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfQ6['Embarked'].fillna(dfQ6['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 