# Lab 2 Project (Wine Dataset)
Jason Ballard
19 March 2025

Import the external Python libraries used (e.g., pandas, numpy, matplotlib, seaborn, sklearn and more).

In [None]:
# Data Handling & Visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Machine Learning & Model Evaluation
from sklearn.datasets import load_wine
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Dimensionality Reduction
from sklearn.decomposition import PCA

In [None]:
# Fetch the wine dataset object from scikit-learn; its data, target and
# feature_names attributes are read below.
wine_data = load_wine()

# Build the feature table first, then attach the labels as a `class` column.
feature_columns = wine_data.feature_names
wine = pd.DataFrame(data=wine_data.data, columns=feature_columns)
wine['class'] = wine_data.target

# Section 1. Import and Inspect the Data

In [None]:
# The load cell above already built `wine` as a DataFrame with the labels
# stored under `class`, the column name every later cell uses. Rebuilding the
# frame here and storing the labels under `target` would drop `class` and,
# once later cells re-add it, leave two identical label columns in the table.
# The existing frame is therefore inspected as-is.

# Display first few rows
print(wine.head())


In [None]:
# Count the missing entries in each column; the bare name on the last line
# keeps the result as the cell's displayed value.
missing_per_column = wine.isnull().sum()
missing_per_column


In [None]:
# Summary statistics of the table's columns
summary_stats = wine.describe()
print(summary_stats)


In [None]:
# Pairwise correlation matrix, restricted to numeric columns
correlation_matrix = wine.corr(numeric_only=True)
print(correlation_matrix)

### Reflection 1:
1) How many data instances are there?
2) How many features are there?
3) What are the names?
4) Are there any missing values?
5) Are there any non-numeric features?

6) Are the data instances sorted on any of the attributes?
7) What are the units of age?
8) What are the minimum, median and max age?
9) What two different features have the highest correlation?

10) Are there any categorical features that might be useful for prediction?

# Section 2. Data Exploration and Preparation

## 2.1 Explore Data Patterns and Distributions

In [None]:
# Columns to compare against each other in the scatter-matrix grid
attributes = [
    'color_intensity',
    'alcohol',
    'malic_acid',
]
selected_columns = wine[attributes]
scatter_matrix(selected_columns, figsize=(10, 10))

In [None]:
# Alcohol (x) against color intensity (y); each point is shaded by its
# malic acid value through the viridis colormap.
alcohol_values = wine['alcohol']
color_intensity_values = wine['color_intensity']
malic_acid_values = wine['malic_acid']

plt.scatter(
    alcohol_values,
    color_intensity_values,
    c=malic_acid_values,
    cmap='viridis',
    alpha=0.7,  # slight transparency so overlapping points stay visible
)

# Title and axis labels
plt.title('Alcohol vs Color Intensity (Colored by Malic Acid)')
plt.xlabel('Alcohol')
plt.ylabel('Color Intensity')

# Color scale for the malic acid shading, then render
plt.colorbar(label='Malic Acid')
plt.show()

In [None]:
# Distribution of alcohol content: histogram with a KDE curve overlaid
sns.histplot(data=wine, x='alcohol', kde=True)

# Axis labels and title (these replace seaborn's default axis labels)
plt.xlabel('Alcohol')
plt.ylabel('Frequency')
plt.title('Alcohol Content Distribution')

plt.show()

In [None]:
# Make sure the labels are present under `class` before plotting
class_labels = wine_data.target
wine['class'] = class_labels

# One bar per wine class, showing how many samples fall in each.
# NOTE(review): recent seaborn releases are reported to warn when `palette`
# is passed without `hue` -- confirm against the installed seaborn version.
sns.countplot(data=wine, x='class', palette='Set2')

# Axis labels and title
plt.xlabel('Wine Class')
plt.ylabel('Count')
plt.title('Wine Class Distribution')

plt.show()

### Reflection 2.1:

1. What patterns or anomalies do you notice?
2. Do any features stand out as potential predictors?
3. Are there any visible class imbalances?

## 2.2 Handle Missing Values and Clean Data

In [None]:
# Report missing values per column before any imputation
missing_before = wine.isnull().sum()
print("Missing values before imputation:\n", missing_before)

# Replace any missing entry with the median of its own column (in place)
column_medians = wine.median()
wine.fillna(column_medians, inplace=True)

# Report again to confirm the result of the imputation
missing_after = wine.isnull().sum()
print("\nMissing values after imputation:\n", missing_after)



## 2.3 Feature Engineering

In [None]:
# Engineered feature: total phenols and flavanoids added together row by row
total_phenols = wine['total_phenols']
flavanoids = wine['flavanoids']
wine['phenolic_contribution'] = total_phenols + flavanoids

# Show the two source columns next to the new one as a sanity check
preview_columns = ['total_phenols', 'flavanoids', 'phenolic_contribution']
print(wine[preview_columns].head())

### Reflection 2.3

1. Why might phenolic contribution (total phenols + flavanoids) be a useful feature for predicting wine class?
2. Why convert categorical data to numeric?

# Section 3. Feature Selection and Justification
- Select two or more input features (numerical for regression, numerical and/or categorical for classification)
- Select a target variable (as applicable)
- Classification: Categorical target variable (e.g., gender, species).
- Justify your selection with reasoning.

## 3.1 Choose features and target

In [None]:
# Labels (wine class: 0, 1, 2) stored under `class`
wine['class'] = wine_data.target

# Engineered feature: sum of total_phenols and flavanoids
wine['phenolic_contribution'] = wine['total_phenols'] + wine['flavanoids']

# Inputs and label chosen for the classification task
target = 'class'
features = ['alcohol', 'malic_acid', 'phenolic_contribution', 'color_intensity']

# Keep only the chosen columns and discard any row that has a missing value
selected_columns = features + [target]
wine_classification = wine[selected_columns].dropna()

# Preview the processed dataset
print(wine_classification.head())

## 3.2 Define X and y

In [None]:
# Model inputs (X) and labels (y).
# NOTE(review): this column list uses total_phenols and flavanoids separately,
# whereas the `features` list in section 3.1 uses phenolic_contribution --
# confirm which set is intended.
input_columns = [
    'alcohol',
    'malic_acid',
    'color_intensity',
    'total_phenols',
    'flavanoids',
]
X = wine[input_columns]
y = wine['class']

### Reflection 3:

1. Why are these features selected?
2. Are there any features that are likely to be highly predictive of wine class?

# Section 4. Splitting

## 4.1 Basic Train/Test split 

In [None]:
# Input features (X) and target labels (y)
X = wine[[
    'alcohol',
    'malic_acid',
    'color_intensity',
    'total_phenols',
    'flavanoids',
]]
y = wine['class']

# Plain random split: 20% of the rows held out for testing, fixed seed
basic_split = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = basic_split

# Report how many rows landed in each part
print('Train size:', len(X_train))
print('Test size:', len(X_test))

## 4.2 Stratified Train/Test split

In [None]:
# Stratified 80/20 split with a fixed seed. The splitter is configured for a
# single split (n_splits=1), so the first pair of index arrays is taken
# directly instead of looping.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
train_indices, test_indices = next(splitter.split(X, y))

# Positional indices -> rows of X and y (these names replace the basic split's)
X_train = X.iloc[train_indices]
X_test = X.iloc[test_indices]
y_train = y.iloc[train_indices]
y_test = y.iloc[test_indices]

## 4.3 Compare Results


In [None]:
# Compare class distributions before and after splitting.
#
# The stratified cell (4.2) reassigns X_train/X_test/y_train/y_test, so when
# the notebook is run top to bottom those names hold only the stratified split
# by this point. The basic split's labels are rebuilt here with the same
# test_size and random_state used in 4.1, so that both split methods can be
# compared against the original distribution.
y_train_basic, y_test_basic = train_test_split(y, test_size=0.2, random_state=123)

print("Original Class Distribution:\n", y.value_counts(normalize=True))

# Basic (non-stratified) split
print("Basic Split Train Set Class Distribution:\n", y_train_basic.value_counts(normalize=True))
print("Basic Split Test Set Class Distribution:\n", y_test_basic.value_counts(normalize=True))

# Current y_train / y_test (the stratified split when 4.2 has been run)
print("Train Set Class Distribution:\n", y_train.value_counts(normalize=True))
print("Test Set Class Distribution:\n", y_test.value_counts(normalize=True))

# Initialize and train the Random Forest model (100 trees, fixed seed)
rf_model = RandomForestClassifier(n_estimators=100, random_state=123)
rf_model.fit(X_train, y_train)

# Predict the class of each held-out sample
y_pred = rf_model.predict(X_test)

# Evaluate the model: overall accuracy plus the per-class report
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

### Reflection 4:

1. Why might stratification improve model performance?
    Stratification ensures that the class proportions in the training and test sets closely match the original dataset distribution. This is especially important for imbalanced datasets, where one class might be underrepresented. Without stratification, the model might learn biased patterns and perform poorly on minority classes.
2. How close are the training and test distributions to the original dataset?
    Stratified sampling helps maintain similar distributions between the training and test sets. The closer these distributions are to the original dataset, the more representative the model's evaluation will be.
3. Which split method produced better class balance?
    Stratified Shuffle Split is generally better for class balance, ensuring the model sees all classes proportionally during training and evaluation.


In [None]:
# Hyperparameter values to try; every combination is evaluated
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base Random Forest with a fixed seed
rf = RandomForestClassifier(random_state=123)

# Grid search scored by accuracy with 5-fold cross-validation on the
# training data; n_jobs=-1 is passed through to scikit-learn.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameter combination and its cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

In [None]:
# Take the best Random Forest found by the grid search.
# GridSearchCV was created without a `refit` argument, so its default
# (refit=True) applies: best_estimator_ has already been refit on the full
# data passed to grid_search.fit. Fitting it again on the same data would
# only repeat that work, so no second fit is done here.
best_rf = grid_search.best_estimator_

# Importance score of each input column, as reported by the fitted forest
importances = best_rf.feature_importances_
feature_names = X_train.columns

# Column positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Bar chart of the importances in descending order
plt.figure(figsize=(8, 6))
plt.title("Feature Importance in Random Forest")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), feature_names[indices], rotation=45)
plt.xlabel("Feature")
plt.ylabel("Importance Score")
plt.show()

In [None]:
# Project the inputs onto 2 principal components. The projection is fitted on
# the training rows only and then applied unchanged to the test rows.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Random Forest trained on the 2-D projection.
# NOTE(review): rf_pca and X_test_pca are not used again in this cell.
rf_pca = RandomForestClassifier(n_estimators=100, random_state=123)
rf_pca.fit(X_train_pca, y_train)

# Scatter of the projected training rows, colored by their class label
first_component = X_train_pca[:, 0]
second_component = X_train_pca[:, 1]

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=y_train,
    palette='viridis',
    alpha=0.7,
)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA-Transformed Training Data (Colored by Class)")
plt.legend(title="Class")
plt.show()