In [43]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import chi2_contingency

# Step 2: Load the Dataset
# Load the Titanic dataset from a CSV file
url = 'https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv'
data = pd.read_csv(url)
data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
data = data.drop(['PassengerId'], axis= 1)
display(data.head())

# Step 3: Preprocess the Data
# Handle missing values
data.fillna(method='bfill', inplace=True)

# Convert categorical columns to numeric
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

# Define features and target
X = data.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)  # Dropping non-numeric features
y = data['Survived']

# Step 4: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


  data.fillna(method='bfill', inplace=True)


In [45]:

# Step 5: Chi-Square Test for feature evaluation
alpha = 0.05  # Significance level
selected_features = []  # List to hold features that pass the Chi-Square test

print("\nChi-Square Test Results:")
for column in X.columns:
    # Create a contingency table
    contingency_table = pd.crosstab(data[column], data['Survived'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f'Feature: {column}, Chi-Squared: {chi2:.4f}, p-value: {p:.4f}')
    
    # Check if the p-value is less than the significance level
    if p < alpha:
        selected_features.append(column)
print(f"\nSelected Features (p < {alpha}): {selected_features}")


# Filter the dataset to include only the selected features
X_selected = X[selected_features]

# Step 6: Split the Data with Selected Features
X_train_selected, X_test_selected, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)



Chi-Square Test Results:
Feature: Pclass, Chi-Squared: 102.8890, p-value: 0.0000
Feature: Age, Chi-Squared: 102.0247, p-value: 0.1294
Feature: SibSp, Chi-Squared: 37.2718, p-value: 0.0000
Feature: Parch, Chi-Squared: 27.9258, p-value: 0.0001
Feature: Fare, Chi-Squared: 425.8084, p-value: 0.0000
Feature: Sex_male, Chi-Squared: 260.7170, p-value: 0.0000
Feature: Embarked_Q, Chi-Squared: 0.0000, p-value: 1.0000
Feature: Embarked_S, Chi-Squared: 20.0725, p-value: 0.0000

Selected Features (p < 0.05): ['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_S']


In [46]:

# Step 7: Train a Decision Tree with Selected Features
clf_chi2 = DecisionTreeClassifier(random_state=42)
clf_chi2.fit(X_train_selected, y_train)

# Step 8: Predictions and Evaluation with Selected Features
y_pred_chi2 = clf_chi2.predict(X_test_selected)
print("\nDecision Tree Performance with Selected Features:")
print(classification_report(y_test, y_pred_chi2))
print(confusion_matrix(y_test, y_pred_chi2))



Decision Tree Performance with Selected Features:
              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

[[91 14]
 [22 52]]


In [47]:

# Step 9: Train a Decision Tree without considering the Chi-Square Test
clf_no_chi2 = DecisionTreeClassifier(random_state=42)
clf_no_chi2.fit(X_train, y_train)

# Step 10: Predictions and Evaluation without Chi-Square Test
y_pred_no_chi2 = clf_no_chi2.predict(X_test)
print("\nDecision Tree Performance without Chi-Square Test:")
print(classification_report(y_test, y_pred_no_chi2))
print(confusion_matrix(y_test, y_pred_no_chi2))


Decision Tree Performance without Chi-Square Test:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82       105
           1       0.73      0.81      0.77        74

    accuracy                           0.80       179
   macro avg       0.79      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179

[[83 22]
 [14 60]]
