In [None]:
import numpy as np 
import pandas as pd 
import os

# Read the Kaggle Titanic training and test splits into DataFrames.

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)


In [None]:
# Clean both datasets: impute the columns that have gaps and drop Cabin.

# Fill missing Age with each frame's own median age.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

# Fill missing Fare the same way, so that the fare binning and the models
# further down never receive NaN in this column.
train_df['Fare'] = train_df['Fare'].fillna(train_df['Fare'].median())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Fill missing Embarked with each frame's most frequent value.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])

# Drop the Cabin column because it has too many missing values.
# errors='ignore' keeps the cell re-runnable once Cabin is already gone.
train_df = train_df.drop(columns=['Cabin'], errors='ignore')
test_df = test_df.drop(columns=['Cabin'], errors='ignore')

In [None]:
# Turn the categorical columns into integer codes before any model is trained.
# Values that are not keys of a mapping become NaN (Series.map semantics).

embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
sex_codes = {'male': 0, 'female': 1}

for frame in (train_df, test_df):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [None]:
# Extra feature work: one added column, a few removed ones.

# FamilySize = siblings/spouses aboard (SibSp) + parents/children aboard (Parch).
for frame in (train_df, test_df):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])

# Name and Ticket are removed from both frames; PassengerId is removed from the
# training frame only (the test frame keeps it).
for frame, unused_cols in (
    (train_df, ['Name', 'Ticket', 'PassengerId']),
    (test_df, ['Name', 'Ticket']),
):
    frame.drop(columns=unused_cols, inplace=True)

In [None]:
#Random Forest model

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline random forest on an 80/20 hold-out split of the training frame.

# Target is the Survived column; every other column is used as a feature.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the predictions made for the validation rows.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the X_train / y_train split currently in scope
# and scored on the matching X_val / y_val rows.
logreg_model = LogisticRegression(max_iter=200, random_state=42).fit(X_train, y_train)

y_pred_logreg = logreg_model.predict(X_val)
accuracy_logreg = accuracy_score(y_val, y_pred_logreg)
print(f'Logistic Regression Accuracy: {accuracy_logreg:.4f}')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Feature Engineering
def feature_engineering(df):
    """Return a copy of ``df`` with binned and family-based features added.

    Expects the columns ``Age``, ``Fare``, ``SibSp`` and ``Parch``.

    Added columns:
        Age_Group   -- integer code (0-5) of the age bracket ``Age`` falls in,
                       using right-inclusive edges 0/12/18/30/50/80/100;
                       -1 when ``Age`` is missing or outside (0, 100].
        Fare_Group  -- integer code (0-3) of the quartile of ``Fare`` computed
                       within this frame; -1 when ``Fare`` is missing.
        Family_Size -- ``SibSp + Parch``.
        Is_Alone    -- 1 when ``Family_Size`` is 0, otherwise 0.

    The input frame is not modified; use the returned frame.
    """
    # Work on a copy so the caller's DataFrame is not changed as a side effect.
    df = df.copy()

    # Age bracket. pd.cut with labels already returns a categorical, so the
    # integer codes can be read from it directly.
    age_edges = [0, 12, 18, 30, 50, 80, 100]
    age_labels = ['Child', 'Teen', 'Young_Adult', 'Adult', 'Middle_Aged', 'Senior']
    df['Age_Group'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels).cat.codes

    # Fare quartile (bin edges are derived from the frame passed in).
    fare_labels = ['Low', 'Medium', 'High', 'Very_High']
    df['Fare_Group'] = pd.qcut(df['Fare'], 4, labels=fare_labels).cat.codes

    # Family size and a vectorized "travelling alone" flag.
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Is_Alone'] = (df['Family_Size'] == 0).astype(int)

    return df

# Run both splits through the feature-engineering step.
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# Rebuild the feature matrix and target now that the new columns exist.
# NOTE(review): if the earlier FamilySize cell has run, X holds both FamilySize
# and Family_Size, which are the same quantity - confirm whether that is intended.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Construct and train the forest in one step (fit() returns the estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the validation predictions.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy after feature engineering: {accuracy:.4f}')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with the tuned hyperparameters.
# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where passing
# it raises an error.
best_rf_model = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

# Train on the X_train / y_train split currently in scope.
best_rf_model.fit(X_train, y_train)

# Predict the validation rows.
y_pred_best_rf = best_rf_model.predict(X_val)

# Evaluate accuracy
best_rf_accuracy = accuracy_score(y_val, y_pred_best_rf)
print(f'Random Forest Accuracy with Best Parameters: {best_rf_accuracy:.4f}')

# Make the tuned forest the current `model`.
model = best_rf_model

In [None]:
# (trying a different appraoch with xgboost model from professor rec)
!pip install shap xgboost scikit-learn pandas numpy matplotlib seaborn

In [None]:

import numpy as np
import pandas as pd
import shap
import os
import xgboost as xgb
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Reload the raw CSVs so this pipeline starts from the files on disk rather
# than from frames modified by earlier cells.
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Preserve PassengerId for final predictions
test_passenger_ids = test_df['PassengerId']

# Feature Engineering - extract the title: the run of letters that follows a
# space and ends with a period in Name.
# Raw string so "\." reaches the regex engine without an invalid-escape warning.
title_pattern = r' ([A-Za-z]+)\.'
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)
test_df['Title'] = test_df['Name'].str.extract(title_pattern, expand=False)

# Normalize rare titles into a single 'Rare' bucket
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
train_df['Title'] = train_df['Title'].replace(rare_titles, 'Rare')
test_df['Title'] = test_df['Title'].replace(rare_titles, 'Rare')

# Impute Embarked BEFORE encoding: astype(str) below would otherwise turn a
# missing value into the literal label 'nan' and give it a class of its own.
embarked_mode = train_df['Embarked'].mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

# Encode categorical features. Each encoder is fitted on the training column
# and reused on the test column; transform() raises ValueError if the test
# column holds a label the training column did not.
for col in ['Sex', 'Embarked', 'Title']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))




In [None]:
# Fill missing values.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained form triggers a warning in pandas 2.x and,
# under Copy-on-Write, updates a temporary and leaves the frame unchanged.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
train_df['Fare'] = train_df['Fare'].fillna(train_df['Fare'].median())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
# Embarked was already label-encoded through astype(str) in the previous cell,
# so it normally holds no NaN at this point; the fill is kept as a safeguard.
test_df['Embarked'] = test_df['Embarked'].fillna(train_df['Embarked'].mode()[0])


# Create new features
# FamilySize counts the passenger too, hence the + 1.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# A family of one means the passenger travels alone.
train_df['IsAlone'] = (train_df['FamilySize'] == 1).astype(int)
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)

# Interaction term between age and passenger class.
train_df['Age*Pclass'] = train_df['Age'] * train_df['Pclass']
test_df['Age*Pclass'] = test_df['Age'] * test_df['Pclass']


# Drop unnecessary columns
train_df = train_df.drop(columns=['Cabin', 'Name', 'Ticket'])
test_df = test_df.drop(columns=['Cabin', 'Name', 'Ticket'])


In [None]:
# Feature selection using SHAP
# PassengerId is a row identifier rather than a passenger attribute, so it is
# kept out of the feature matrix along with the target.
X = train_df.drop(columns=['Survived', 'PassengerId'])
y = train_df['Survived']

# Fit a throwaway XGBoost model whose only job is to rank the features.
model = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
model.fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Rank features by mean absolute SHAP value. argsort is ascending, so the last
# eight positions are the eight largest.
shap_importance = np.abs(shap_values).mean(axis=0)
selected_features = X.columns[np.argsort(shap_importance)[-8:]]  # Keep top 8 features

# Restrict both the training features and the test frame to the selected columns.
X = X[selected_features]
test_df = test_df[selected_features]


In [None]:
# Final model with manually selected hyperparameters
model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    min_child_weight=2,
    random_state=42
)

# Hold out 20% BEFORE fitting, so the reported accuracy is measured on rows the
# model has not been trained on.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Final Model Accuracy: {accuracy:.4f}')

# Refit on every labelled row so that later predictions use all available data.
model.fit(X, y)


In [None]:
# 5-fold cross-validated accuracy for whatever `model`, `X` and `y` currently hold.

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)

# Mean of the per-fold accuracies.
average_accuracy = cv_scores.mean()
print(f'Average cross-validated accuracy: {average_accuracy:.4f}')

In [None]:
# Predict survival for every row of the test frame.
test_predictions = model.predict(test_df)

# Pair the passenger ids saved earlier with the predicted Survived values.
submission = pd.DataFrame({'PassengerId': test_passenger_ids}).assign(
    Survived=test_predictions
)

# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)

# Preview the first rows of the submission.
submission.head()


In [None]:
# Visualize the results and work so far
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many training rows fall in each Survived class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Survived', data=train_df, ax=ax)
ax.set(
    title='Survival Distribution',
    xlabel='Survived (0 = No, 1 = Yes)',
    ylabel='Count',
)
plt.show()


In [None]:
# Histogram (30 bins, with KDE overlay) of the non-missing Age values.
fig, ax = plt.subplots(figsize=(6, 4))
ages = train_df['Age'].dropna()
sns.histplot(ages, bins=30, kde=True, ax=ax)
ax.set(title='Age Distribution', xlabel='Age', ylabel='Frequency')
plt.show()


In [None]:
# Histogram (30 bins, with KDE overlay) of the Fare column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(train_df['Fare'], bins=30, kde=True, ax=ax)
ax.set(title='Fare Distribution', xlabel='Fare', ylabel='Frequency')
plt.show()


In [None]:
# Survival counts split by Embarked value.
# NOTE(review): when the cells run in order, Embarked holds label-encoded
# integers here, so the x axis shows codes rather than port letters.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Embarked', hue='Survived', data=train_df, ax=ax)
ax.set(title='Survival by Embarked', xlabel='Embarked', ylabel='Count')
plt.show()



In [None]:
#Reflection:

"""
For Lab one, being able to visualize data and use machine learning models was an exciting experience. I got to use
pandas and seaborn to create graphs, and use a Random Forest model to predict survivors. I previously took ds 201, so
this wasn't too new to me, but it was still fun to practice feature engineering to improve my accuracy. In the future,
I would use a different ML model (logistic regression, since it is used for classification). I would also like to use
other techniques, like one-hot encoding and hyperparameter tuning, in the future. I would also like to improve my
cross-validation to check accuracy across all models. These are the improvements I would make in the next potential
lab we do.
"""