## Install all the necessary Libraries

In [None]:
# Data handling. xlrd is presumably needed so pandas can read the legacy
# .xls workbook used by this notebook — TODO confirm against the pandas version in use.
%pip install pandas
%pip install xlrd
# Plotting libraries
%pip install matplotlib
%pip install seaborn
# Numerics and machine learning
%pip install numpy
%pip install scikit-learn
# Supplies the `imblearn` module that SMOTE is imported from.
# NOTE(review): the PyPI distribution is normally installed as "imbalanced-learn" — confirm.
%pip install imblearn

## Import The Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

## Tasks:

### Task 1: Data Loading and Initial Exploration

Lecture material: Lecture 3, slides 4–8, 10, and 11.

- Load the dataset into a Pandas DataFrame.
- Perform basic exploratory data analysis (EDA) to comprehend the structure and characteristics of the data.

Note: Your analysis should include appropriate exploratory statistics and visualizations.

In [None]:
# Load the Titanic dataset from the Excel workbook in the working directory
df = pd.read_excel('titanic3.xls')

# Display the first few rows of the dataset
print(df.head())

# Summarize column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly: wrapping it in print() emits a stray "None" line.
df.info()

# Get descriptive statistics for the numerical values
print(df.describe())


### Task 2: Managing Missing Values

Lecture Material: Lecture 3, slides 22–24.

- Identify the columns containing missing values.
- Develop a strategy to address them.


In [None]:
# Count the missing entries per column and report only the columns that have any
missing_data = df.isnull().sum()
missing_data = missing_data.loc[missing_data > 0]
print(missing_data)

## Filling missing values
## Strategy: keep every row and impute the gaps instead of deleting them.
# Numerical columns are filled with their median value
for column in ('age', 'fare', 'body'):
    df[column] = df[column].fillna(df[column].median())

# Categorical columns are filled with their most frequent value (the mode)
for column in ('embarked', 'cabin', 'boat', 'home.dest'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)


### Task 3: Encoding Categorical Variables

Lecture material: Lecture 4, slides 10–15, 21.

- Identify the categorical variables in the dataset.
- Utilize OneHotEncoder to encode them.
- Observe the transformation and discuss its impact on machine learning models.


In [None]:
# Define the categorical columns for encoding
categorical_columns = ['sex',  'embarked', 'survived', 'pclass']

# Create a ColumnTransformer that applies OneHotEncoder to the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'  
)

# Apply the encoding to the dataset using a pipeline and convert them to a DataFrame
df_encoded = preprocessor.fit_transform(df)
encoded_columns = preprocessor.transformers_[0][1].get_feature_names_out(categorical_columns)
df_encoded = pd.DataFrame(df_encoded, columns=np.append(encoded_columns, df.columns[len(categorical_columns):]))

# Check the result
print(df_encoded.head())

### Task 4: Feature Scaling

Lecture material: Lecture 5, slides 14–20.

- Standardize the numerical variables using StandardScaler.
- Normalize the numerical variables using MinMaxScaler.
- Discuss the differences between standardization and normalization, along with their importance.



In [None]:
# Numerical columns that are rescaled in this cell
numerical_columns = ['age', 'fare', 'sibsp', 'parch', 'body']

# Standardization: learn each column's mean and standard deviation, then
# overwrite the columns in df with the standardized values
scaler = StandardScaler()
scaler.fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

# Check the result
print(df[numerical_columns].head())

# Normalization: the MinMaxScaler is fitted on the columns as they now stand
# in df (the standardized values written just above) and overwrites them again
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df[numerical_columns])
df[numerical_columns] = min_max_scaler.transform(df[numerical_columns])

# Check the result
print(df[numerical_columns].head())


### Task 5: Data Splitting

Lecture material: Lecture 2, slides 4–7.

- Split the dataset into training, validation, and test sets.
- Ensure that the split reflects the original distribution of the target variable using stratification.

**Note**: a good strategy is to first split the dataset into ‘training’ and ‘others’, and then split ‘others’ into equally sized ‘validation’ and ‘test’ sets. When splitting, consider the `stratify` argument of the `train_test_split` function.


In [None]:
# Separate the target variable (survival) from the feature columns
y = df['survived']
X = df.drop(columns=['survived'])

# First cut: 60% training, 40% held out as 'others', stratified on the target
X_train, X_others, y_train, y_others = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Second cut: halve 'others' into validation and test sets, again stratified
X_val, X_test, y_val, y_test = train_test_split(
    X_others, y_others, test_size=0.5, stratify=y_others, random_state=42
)

# Report the class proportions of the target in each split
for heading, target in (
    ("Training set target distribution:", y_train),
    ("\nValidation set target distribution:", y_val),
    ("\nTest set target distribution:", y_test),
):
    print(heading)
    print(target.value_counts(normalize=True))


### Task 6: Addressing Class Imbalance

Lecture material: Lecture 3, slides 25–27; Lecture 4, slides 4–5.

- Apply a method to address class imbalance (e.g., Oversampling Technique (SMOTE), Adaptive Synthetic
Sampling Method (ADASYN)).

**Note**: You can load a SMOTE and/or ADASYN implementation from the Python module imblearn.


In [None]:
## Prepare the dataset
# One-hot encode every non-numeric column so that SMOTE receives a purely
# numeric feature matrix
df_encoded = pd.get_dummies(df, drop_first=True)

# The class being balanced is the survival outcome, the same target used for
# the stratified split and for the model; 'pclass' stays among the features.
X = df_encoded.drop(columns=['survived'])
y = df_encoded['survived']

# Split data into train and test sets, stratified so both keep the original
# class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale the features; the scaler is fitted on the training set only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check the class distribution before applying SMOTE
print("Original class distribution in the training set:")
print(y_train.value_counts(normalize=True))

# Safety net: SMOTE cannot handle NaN, so fill any remaining gaps with the
# training-set column mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_test_imputed = imputer.transform(X_test_scaled)

## Apply SMOTE to balance the classes
# Only the training set is resampled; the test set keeps its real distribution
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)

# Check class distribution after SMOTE
print("\nClass distribution after applying SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))


### Task 7: Feature Selection

Lecture material: Lecture 5, slides 10–14, 19.

- Eliminate low variance and highly correlated features.
- Why do we carry out tasks 6 and 7 after splitting the dataset into training, validation, and test sets? Could
we have conducted them on the entire dataset instead? Please elaborate on your answer.


In [None]:
# Step 1: Keep only the numeric columns of X_train
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])

# Step 2: Remove low variance features (variance below the threshold)
variance_threshold = VarianceThreshold(threshold=0.01)
X_train_no_low_variance = variance_threshold.fit_transform(X_train_numeric)

# Step 3: Get the feature names of the remaining features after variance reduction
remaining_features = X_train_numeric.columns[variance_threshold.get_support()]
print(f"Remaining features after variance thresholding: {remaining_features}")

# Create a new DataFrame with only the selected features.
# The original row index is kept so the rows stay aligned with y_train.
X_train_no_low_variance_df = pd.DataFrame(
    X_train_no_low_variance,
    columns=remaining_features,
    index=X_train_numeric.index,
)

# Step 4: Remove highly correlated features.
# Only the upper triangle of the absolute correlation matrix is inspected, so
# each pair is looked at once and the first feature of a pair is the one kept.
correlation_threshold = 0.9
corr_matrix = X_train_no_low_variance_df.corr().abs()
upper_triangle = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
)
highly_correlated = [
    col for col in upper_triangle.columns
    if (upper_triangle[col] > correlation_threshold).any()
]
print(f"Highly correlated features removed: {highly_correlated}")

# Final training features after both selection steps
X_train_selected = X_train_no_low_variance_df.drop(columns=highly_correlated)



### Task 8: Training a Logistic Regression Model

Lecture material: Lecture 6, slides 5–9.

- Train a Logistic Regression Model to predict whether a passenger survives.

**Note**: Use the method predict from the class LogisticRegression with the validation set. Have fun finding
a visually appealing way to display the results of the predictions on the validation set. An analysis of model
performance is not required and will not affect your final grade for the assignment. However, I won’t stop you from
including it. 

In [None]:
# Reload the Titanic dataset so this cell starts from the raw, unscaled data
df = pd.read_excel('titanic3.xls')

# Handle missing values in the columns the model uses
# (cabin and boat are not model features, so they are left untouched)
df['age'] = df['age'].fillna(df['age'].median())
df['fare'] = df['fare'].fillna(df['fare'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
df = df.dropna(subset=['survived', 'sex'])

# One-hot encode the categorical model features; drop_first removes the
# redundant reference category of each variable
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)

# Select the model features and the target
X = df[['age', 'fare', 'sibsp', 'parch', 'sex_male', 'embarked_Q', 'embarked_S']]
y = df['survived']

# Split into training (60%), validation (20%) and test (20%) sets, stratified
# on the target so every split keeps the original survival ratio
X_train, X_others, y_train, y_others = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_others, y_others, test_size=0.5, stratify=y_others, random_state=42
)

# Feature scaling: fit on the training set only, then apply to the other sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_scaled, y_train)

# Predict on the validation set; the test set stays untouched for a final check
y_pred = logreg.predict(X_val_scaled)

# Evaluate the model on the validation set
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

# Displaying results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
