In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline

# Step 1: Read the Dataset
train_data = pd.read_csv('D:/Mudar/AIU/23-2/LAB/ML/Datasets/train.csv')
test_data = pd.read_csv('D:/Mudar/AIU/23-2/LAB/ML/Datasets/test.csv')

# Step 2: Preprocess the Dataset
# Handle Missing Values
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)

test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)

# Feature Scaling
numerical_features = ['Age', 'Fare']
scaler = StandardScaler()
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
test_data[numerical_features] = scaler.transform(test_data[numerical_features])

# Remove Duplicate Data
train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)

# Categorical Data
label_encoder = LabelEncoder()
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])
test_data['Sex'] = label_encoder.transform(test_data['Sex'])

one_hot_encoder = OneHotEncoder(sparse_output=False)
train_data_encoded = one_hot_encoder.fit_transform(train_data[['Embarked']])
test_data_encoded = one_hot_encoder.transform(test_data[['Embarked']])

train_data = pd.concat([train_data, pd.DataFrame(train_data_encoded, columns=['Embarked_' + str(col) for col in one_hot_encoder.categories_[0]])], axis=1)
test_data = pd.concat([test_data, pd.DataFrame(test_data_encoded, columns=['Embarked_' + str(col) for col in one_hot_encoder.categories_[0]])], axis=1)

train_data.drop(['Embarked', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['Embarked', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Split the dataset into features (X) and target (y)
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']
# X_test should not contain 'Survived' as it doesn't exist in the test data
X_test = test_data

# Ensure no remaining missing values
print(X_train.isnull().sum())  # Check which columns contain NaN values
print(X_test.isnull().sum())
print(y_train.isnull().sum())

# Handle any remaining NaNs (if any)
X_train.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

# Check again to ensure no remaining NaNs
assert X_train.isnull().sum().sum() == 0, "X_train contains NaN values"
assert X_test.isnull().sum().sum() == 0, "X_test contains NaN values"
assert y_train.isnull().sum() == 0, "y_train contains NaN values"

# Step 3: Model Experimentation
# Select two different machine learning models
models = [
    Pipeline([('clf', LogisticRegression(random_state=42, max_iter=200))]),  # Increased max_iter
    Pipeline([('clf', RandomForestClassifier(random_state=42))])
]

# Evaluate the models using precision and recall metrics
for i, model in enumerate(models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model {i+1}:")
    # Print metrics only if y_test is available (if the test dataset includes labels)
    if 'Survived' in test_data.columns:
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
        print(f"Precision: {precision_score(y_test, y_pred):.3f}")
        print(f"Recall: {recall_score(y_test, y_pred):.3f}")
        print(f"F1-score: {f1_score(y_test, y_pred):.3f}")
        print(f"AUC-ROC: {roc_auc_score(y_test, y_pred):.3f}")
    print()

# Step 4: Model Evaluation
# Choose the best model according to precision and recall metrics
# Bonus: Advanced Techniques (Optional)
# Cross-Validation
logreg_pipeline = Pipeline([('clf', LogisticRegression(random_state=42, max_iter=200))])  # Increased max_iter
rfc_pipeline = Pipeline([('clf', RandomForestClassifier(random_state=42))])

logreg_scores = cross_val_score(logreg_pipeline, X_train, y_train, cv=5, scoring='f1_macro')
rfc_scores = cross_val_score(rfc_pipeline, X_train, y_train, cv=5, scoring='f1_macro')

print("Cross-Validation Results:")
print(f"Logistic Regression: {logreg_scores.mean():.3f} +/- {logreg_scores.std():.3f}")
print(f"Random Forest: {rfc_scores.mean():.3f} +/- {rfc_scores.std():.3f}")

# Hyperparameter Tuning
param_grid = {'clf__C': [0.1, 1, 10], 'clf__penalty': ['l2']}
grid_search = GridSearchCV(logreg_pipeline, param_grid, cv=5, scoring='f1_macro', error_score='raise')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Final Result:
# The final result is the classification report of the best model, which is the Random Forest Classifier in this case.
# The classification report includes precision, recall, and F1-score for each class.
# Only do this if y_test is available
if 'Survived' in test_data.columns:
    print("Final Result:")
    print(classification_report(y_test, models[1].predict(X_test)))


PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64
0
Model 1:



  X_train.fillna(method='ffill', inplace=True)
  X_test.fillna(method='ffill', inplace=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model 2:



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Results:
Logistic Regression: 0.777 +/- 0.019
Random Forest: 0.774 +/- 0.069


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'clf__C': 1, 'clf__penalty': 'l2'}
Best Score: 0.7774182709366906


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
