In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

In [4]:
df = pd.read_csv(r"E:\Downloads\tested.csv")
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [5]:
print("Missing values before handling:\n", df.isnull().sum())

Missing values before handling:
 Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [6]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df.dropna(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


In [7]:
label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

In [8]:
df.drop_duplicates(inplace=True)
print(f"Number of duplicates: {df.duplicated().sum()}")

Number of duplicates: 0


In [9]:
scaler = MinMaxScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,0.452723,0,0,0.015282,1
1,1,3,0,0.617566,1,0,0.013663,2
2,0,2,1,0.815377,0,0,0.018909,1
3,0,3,1,0.353818,0,0,0.016908,2
4,1,3,0,0.287881,1,1,0.023984,2


In [10]:
X = df.drop('Survived', axis=1)
Y = df['Survived']

In [11]:
mutual_info = mutual_info_classif(X, Y)
mutual_info_series = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)
print("Mutual Information Scores:\n", mutual_info_series)

Mutual Information Scores:
 Sex         0.665359
Fare        0.036934
Parch       0.023139
Embarked    0.015684
Age         0.006377
Pclass      0.000000
SibSp       0.000000
dtype: float64


In [12]:
bestfeatures = SelectKBest(score_func=chi2, k=7)
fit = bestfeatures.fit(X, Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Feature', 'Score']
print("Best features based on Chi-square:\n", featureScores.nlargest(3, 'Score'))

Best features based on Chi-square:
   Feature       Score
1     Sex  144.000000
4   Parch   21.302206
3   SibSp    5.252814


In [13]:
XX = df[['Sex', 'Fare', 'Pclass', 'Embarked']]

In [14]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 15, 20, 50, 100],
}

In [15]:
model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, param_grid, cv=10)
grid_search.fit(XX, Y)
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

Best hyperparameters: {'criterion': 'gini', 'max_depth': None}


In [16]:
model = DecisionTreeClassifier(**best_params)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(XX, Y, test_size=0.2, random_state=42)

In [18]:
model.fit(X_train, y_train)

In [19]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Accuracy: {accuracy}")

Mean Squared Error: 0.0
Accuracy: 1.0


In [20]:
f1_scores = cross_val_score(model, XX, Y, cv=10, scoring='f1')
print("F1 Scores:", f1_scores)
print("Mean F1 Score:", f1_scores.mean())

F1 Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean F1 Score: 1.0
