# Vector Project - Centralized Unsupervised Classifier

## Installs and Import

In [5]:
#pip install tabulate

Looking in indexes: https://artifactory.fg.rbc.com/artifactory/api/pypi/pypi-remote/simple, https://artifactory.fg.rbc.com/artifactory/api/pypi/pypi/simple
Collecting tabulate
  Downloading https://artifactory.fg.rbc.com/artifactory/api/pypi/pypi/packages/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
#pip install sickit-learn

Looking in indexes: https://artifactory.fg.rbc.com/artifactory/api/pypi/pypi-remote/simple, https://artifactory.fg.rbc.com/artifactory/api/pypi/pypi/simple
[31mERROR: Could not find a version that satisfies the requirement sickit-learn (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sickit-learn[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, minmax_scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
# Read the CSV files into Pandas DataFrames
df_v3 = pd.read_csv("/home/jovyan/vector_fl_bootcamp_2025/data/20250314_synthetic_flight_data_v3_50000 (1).csv")
df_v4 = pd.read_csv("/home/jovyan/vector_fl_bootcamp_2025/data/20250314_synthetic_flight_data_v4_50000.csv")
df_v5 = pd.read_csv("/home/jovyan/vector_fl_bootcamp_2025/data/20250314_synthetic_flight_data_v5_50000.csv")

# Display the first 5 rows of each DataFrame
print("First 5 rows of v3 DataFrame:")
print(df_v3.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nFirst 5 rows of v4 DataFrame:")
print(df_v4.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nFirst 5 rows of v5 DataFrame:")
print(df_v5.head().to_markdown(index=False, numalign="left", stralign="left"))

First 5 rows of v3 DataFrame:
| Date       | sales_channel   | trip_type   | purchase_lead   | length_of_stay   | flight_hour   | flight_day   | origin    | destination   | wants_extra_baggage   | wants_preferred_seat   | wants_in_flight_meals   | flight_duration   | booking_complete   | airline_carrier   | age   | gender   | credit_score   | income   | social_group   | life_stage              | ticket_type   | client_segment   | flight_cost   |
|:-----------|:----------------|:------------|:----------------|:-----------------|:--------------|:-------------|:----------|:--------------|:----------------------|:-----------------------|:------------------------|:------------------|:-------------------|:------------------|:------|:---------|:---------------|:---------|:---------------|:------------------------|:--------------|:-----------------|:--------------|
| 2025-03-06 | Online          | Round Trip  | 172             | 10               | 2             | Sunday       | Halifax   | Lon

## Data Exploration

In [3]:
# Combine the three DataFrames into one
combined_df = pd.concat([df_v3, df_v4, df_v5], ignore_index=True)
# 1. Data Exploration
print("Data Exploration:\n")
print("Shape of the combined DataFrame:", combined_df.shape)
print("\nData types of each column:\n", combined_df.dtypes)
print("\nDescriptive statistics of the combined DataFrame:\n", combined_df.describe().to_markdown(numalign="left", stralign="left"))

Data Exploration:

Shape of the combined DataFrame: (150000, 24)

Data types of each column:
 Date                      object
sales_channel             object
trip_type                 object
purchase_lead              int64
length_of_stay             int64
flight_hour                int64
flight_day                object
origin                    object
destination               object
wants_extra_baggage        int64
wants_preferred_seat       int64
wants_in_flight_meals      int64
flight_duration          float64
booking_complete           int64
airline_carrier           object
age                        int64
gender                    object
credit_score               int64
income                     int64
social_group              object
life_stage                object
ticket_type               object
client_segment            object
flight_cost               object
dtype: object

Descriptive statistics of the combined DataFrame:
 |       | purchase_lead   | length_of_stay   | f

## Feature Engineering

In [4]:
# Feature Engineering
print("\nFeature Engineering:\n")

# Convert flight_cost to numeric categories 1-4
def convert_flight_cost(cost_str):
    if cost_str == '$':
        return 1
    elif cost_str == '$$':
        return 2
    elif cost_str == '$$$':
        return 3
    elif cost_str == '$$$$':
        return 4
    else:
        return None  # Handle potential errors or unknown values

combined_df['flight_cost_num'] = combined_df['flight_cost'].apply(convert_flight_cost)
combined_df['flight_cost_num'] = combined_df['flight_cost_num'].fillna(combined_df['flight_cost_num'].mode()[0])

# Create dummy variables for flight_cost_num - not needed for ordinal data
#combined_df = pd.get_dummies(combined_df, columns=['flight_cost_num'], prefix='flight_cost', drop_first=True)

# Date to Day of Year (Cyclical Representation)
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
days_in_year = combined_df['Date'].dt.isocalendar().day.max()

#Date to week of year
combined_df['week_num'] = combined_df['Date'].dt.isocalendar().week

#combined_df['Day_of_Year_sin'] = np.sin(2 * np.pi * combined_df['Date'].dt.isocalendar().day / days_in_year) #date sin
#combined_df['Day_of_Year_cos'] = np.cos(2 * np.pi * combined_df['Date'].dt.isocalendar().day / days_in_year) #date cos

# Flight Day to Weekday/Weekend
combined_df['Day_Type'] = combined_df['flight_day'].apply(lambda x: 'Weekend' if x in ['Saturday', 'Sunday'] else 'Weekday')

# Origin/Destination to Hub/Not Hub (Example hub list - this can be expanded)
#major_hubs = ['London', 'Chicago', 'Los Angeles', 'New York', 'San Francisco', 'Toronto']

#def is_hub(city):
#    return 'Hub' if city in major_hubs else 'Not_Hub'
#
#combined_df['Origin_Hub'] = combined_df['origin'].apply(is_hub)
#combined_df['Destination_Hub'] = combined_df['destination'].apply(is_hub)

# Airline Carrier to Premium/Discount (Example mapping - this can be expanded)
#airline_mapping = {
#    'United Airlines': 'Premium',
#    'Westjet': 'Discount',
#    'Air India': 'Discount',
#    'Emirates': 'Premium',
#    'British Airways': 'Premium',
#    'Lufthansa': 'Premium',
#    'Southwest': 'Discount'
#}

#combined_df['Airline_Type'] = combined_df['airline_carrier'].map(airline_mapping).fillna('Other')  # Handle airlines not in the mapping

# Drop original columns
#combined_df = combined_df.drop(columns=['Date', 'flight_day', 'origin', 'destination', 'airline_carrier', 'flight_cost'])
combined_df = combined_df.drop(columns=['Date', 'flight_day','flight_cost']) #revised

# Identify categorical columns for one-hot encoding (after transformations)
categorical_cols = [col for col in combined_df.columns if combined_df[col].dtype == 'object']

# Perform one-hot encoding
combined_df = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)

# Display the shape of the transformed DataFrame
print("\nShape of the transformed DataFrame:", combined_df.shape)


Feature Engineering:


Shape of the transformed DataFrame: (150000, 49)


In [5]:
# Display the first 5 rows of the transformed DataFrame
combined_df.head()

Unnamed: 0,purchase_lead,length_of_stay,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,age,credit_score,...,airline_carrier_Westjet,gender_Male,social_group_Urban,life_stage_Parent,life_stage_Retired,life_stage_Single,life_stage_Young Professional,ticket_type_economy,client_segment_premium,Day_Type_Weekend
0,172,10,2,1,1,1,6.5,0,22,661,...,0,1,1,1,0,0,0,1,0,1
1,43,3,2,0,1,1,6.5,0,46,586,...,1,1,0,0,0,0,0,0,0,0
2,31,0,4,1,0,0,5.6,1,23,674,...,0,0,1,0,0,0,1,0,0,1
3,249,2,17,0,0,1,12.7,0,56,679,...,0,1,1,0,0,0,0,1,1,0
4,182,0,0,0,0,0,11.5,0,55,799,...,0,0,0,0,0,1,0,1,0,1


## PCA

In [6]:
# 4. PCA for Feature Selection
print("\nPCA for Feature Selection:\n")

# Separate features (X) from the target variable (if any)
X = combined_df  # Assuming all columns are features

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Applying PCA
pca = PCA()
pca.fit(X_scaled)

# Explained variance ratio
explained_variance = pca.explained_variance_ratio_

# Get feature names
feature_names = X.columns

# Create a DataFrame to hold explained variance ratios
explained_variance_df = pd.DataFrame({'Component': range(1, len(explained_variance) + 1),
                                      'Explained Variance': explained_variance})

# Assign feature names to components
explained_variance_df['Feature'] = feature_names[:len(explained_variance_df)]

print("\nExplained variance ratio with feature names:\n")
print(explained_variance_df.head(20).to_markdown(index=False, numalign="left", stralign="left"))

# Cumulative explained variance
cumulative_explained_variance = np.cumsum(explained_variance)
print("\nCumulative explained variance:\n", cumulative_explained_variance)

# Determine the number of components that explain 95% of the variance
n_components_85 = np.argmax(cumulative_explained_variance >= 0.85) + 1
print(f"\nNumber of components explaining 85% of variance: {n_components_85}")


PCA for Feature Selection:


Explained variance ratio with feature names:

| Component   | Explained Variance   | Feature               |
|:------------|:---------------------|:----------------------|
| 1           | 0.0384935            | purchase_lead         |
| 2           | 0.0306222            | length_of_stay        |
| 3           | 0.0255827            | flight_hour           |
| 4           | 0.0255481            | wants_extra_baggage   |
| 5           | 0.0255235            | wants_preferred_seat  |
| 6           | 0.0251893            | wants_in_flight_meals |
| 7           | 0.0240997            | flight_duration       |
| 8           | 0.023926             | booking_complete      |
| 9           | 0.0238853            | age                   |
| 10          | 0.0238393            | credit_score          |
| 11          | 0.0238198            | income                |
| 12          | 0.0237941            | flight_cost_num       |
| 13          | 0.0232496            | wee

## Pre-Processing

In [7]:
#Filter to features accounting for 85% of columative variance
relevant_col_list = explained_variance_df['Feature'].values.tolist()[:n_components_85]
pca_filtered_df = combined_df.filter(items =  relevant_col_list)

In [8]:
#Normalize values to be between 0 and 1
columns_to_scale = [col for col in pca_filtered_df.columns if pca_filtered_df[col].max() > 1]
pca_filtered_df[columns_to_scale] = minmax_scale(pca_filtered_df[columns_to_scale])

## Autoencoder

In [9]:
# Define the Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, latent_dim)  # Compressed Representation
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 32), nn.ReLU(),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, input_dim)  # Output should match input_dim
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

# Define dimensions
input_dim = pca_filtered_df.shape[1]  # Should match PCA components
latent_dim = 5  # Choose small enough for clustering
model = Autoencoder(input_dim=input_dim, latent_dim=latent_dim)


# Convert PCA-transformed data to tensor
X_tensor = torch.FloatTensor(pca_filtered_df.values)

# Define optimizer and loss function
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    reconstructed, latent_features = model(X_tensor)
    loss = criterion(reconstructed, X_tensor)  # Reconstruction Loss
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}/{epochs}, Loss: {loss.item():.4f}")


Epoch 0/100, Loss: 0.2338
Epoch 10/100, Loss: 0.2003
Epoch 20/100, Loss: 0.1667
Epoch 30/100, Loss: 0.1258
Epoch 40/100, Loss: 0.1161
Epoch 50/100, Loss: 0.1150
Epoch 60/100, Loss: 0.1140
Epoch 70/100, Loss: 0.1134
Epoch 80/100, Loss: 0.1130
Epoch 90/100, Loss: 0.1126


## K-Means Clustering

In [10]:
# Extract decoded uputput as K-means input
X = reconstructed.detach().numpy()

In [11]:
#Evaluate how many clusters based on shiloutte score

# candidate values for our number of cluster
parameters = list(range(2,11))

# instantiating ParameterGrid, pass number of clusters as input
parameter_grid = ParameterGrid({'n_clusters': parameters})
best_score = -1
kmeans_model = KMeans()     # instantiating KMeans model
silhouette_scores = []

# evaluation based on silhouette_score
for p in parameter_grid:
    kmeans_model.set_params(**p)    # set current hyper parameter
    kmeans_model.fit(X)          # fit model on data, this will find clusters based on parameter p
    ss = silhouette_score(X, kmeans_model.labels_)   # calculate silhouette_score
    silhouette_scores += [ss]       # store all the scores
    print('Parameter:', p, 'Score', ss)
    # check p which has the best score
    if ss > best_score:
        best_score = ss
        best_grid = p
        
# plotting silhouette score
plt.bar(range(len(silhouette_scores)), list(silhouette_scores), align='center', color='#722f59', width=0.5)
plt.xticks(range(len(silhouette_scores)), list(parameters))
plt.title('Silhouette Score', fontweight='bold')
plt.xlabel('Number of Clusters')
plt.show()

Parameter: {'n_clusters': 2} Score 0.5411161
Parameter: {'n_clusters': 3} Score 0.49826562
Parameter: {'n_clusters': 4} Score 0.47284678
Parameter: {'n_clusters': 5} Score 0.45496875
Parameter: {'n_clusters': 6} Score 0.4351118
Parameter: {'n_clusters': 7} Score 0.42000082
Parameter: {'n_clusters': 8} Score 0.40172637
Parameter: {'n_clusters': 9} Score 0.3861573
Parameter: {'n_clusters': 10} Score 0.3704249


NameError: name 'plt' is not defined

In [14]:
# Apply K-Means
km = KMeans(n_clusters=3)
y_km = km.fit_predict(X)