# Import Libraries and Load Data

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = pd.read_csv(r'C:\Users\asmaj\Downloads\DATA SCIENCE\Titanic-Dataset.csv')

# Data Exploration
print("Dataset Info:\n", data.info())
print("\nFirst few rows:\n", data.head())

# Check for missing values
print("\nMissing Values:\n", data.isnull().sum())

# Feature Engineering
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace('Mlle', 'Miss')
data['Title'] = data['Title'].replace('Ms', 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')
data['CabinDeck'] = data['Cabin'].str[0]
data['FareBin'] = pd.qcut(data['Fare'], 4, labels=['Low', 'Med', 'High', 'Very_High'])

# Define the features and target
X = data.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'])
y = data['Survived']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Dataset Info:
 None

First few rows:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0  

In [16]:
# Load the dataset
data = pd.read_csv(r'C:\Users\asmaj\Downloads\DATA SCIENCE\Titanic-Dataset.csv')

# Data Exploration
print("Dataset Info:\n", data.info())
print("\nFirst few rows:\n", data.head())

# Check for missing values
print("\nMissing Values:\n", data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Dataset Info:
 None

First few rows:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0  

# Define Preprocessing Pipelines

In [23]:
# Define columns for preprocessing
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'CabinDeck', 'FareBin']

# Preprocessing Pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Preprocess and Handle Class Imbalance

In [24]:
# Convert preprocessed NumPy array back to a DataFrame
def to_dataframe(array, columns):
    return pd.DataFrame(array, columns=columns)

# Apply the preprocessor to X and handle missing values
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed_df = to_dataframe(X_preprocessed, preprocessor.get_feature_names_out())

# Handle class imbalance using SMOTE on the preprocessed DataFrame
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_preprocessed_df, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


# Model Definition and Hyperparameter Tuning

In [25]:
# Initialize the XGBoost classifier without reapplying the preprocessor
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1]
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Output the best parameters and score from GridSearchCV
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


190 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\asmaj\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\asmaj\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "C:\Users\asmaj\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache

Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best CV Score: 0.8280194805194805


Parameters: { "use_label_encoder" } are not used.



# Model Evaluation

In [26]:
# Make predictions on the test set using the best model
y_pred = grid_search.best_estimator_.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy on Test Set:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy on Test Set: 0.8818181818181818

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       103
           1       0.89      0.89      0.89       117

    accuracy                           0.88       220
   macro avg       0.88      0.88      0.88       220
weighted avg       0.88      0.88      0.88       220


Confusion Matrix:
 [[ 90  13]
 [ 13 104]]


## Model Performance 

### Accuracy
- **Accuracy**: 88.18%  
  This is generally considered good for the Titanic dataset, where most models fall in the range of 75-85%. However, accuracy alone may not fully capture the model's performance, especially if the dataset has class imbalance.

### Precision and Recall

#### Class 0 (Did Not Survive)
- **Precision**: 87%  
  Of all predicted non-survivors, 87% were actual non-survivors.
- **Recall**: 87%  
  The model correctly identifies 87% of all actual non-survivors.

#### Class 1 (Survived)
- **Precision**: 89%  
  Of all predicted survivors, 89% were actual survivors.
- **Recall**: 89%  
  The model correctly identifies 89% of all actual survivors.

### F1-Score
- Both classes have an **F1-score** of around 0.88, which represents a balance between precision and recall. These balanced F1-scores indicate that the model performs well in distinguishing between survivors and non-survivors.

### Confusion Matrix
- Out of **220 samples**, only **26 misclassifications** occurred (13 false positives and 13 false negatives). This shows the model's consistency in correctly identifying both classes.
