Project by:
- Jack Chen 4427737
- Joost Litjes 4540700
- Felicia Hung 7568479

In [2]:
import numpy as np
import pandas as pd

import os

import sklearn

from scipy import stats

import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [3]:
px.defaults.width = 600
px.defaults.height = 600

Task 1

In [4]:
db = pd.read_csv("online_shoppers_intention 1.csv")
# db = pd.read_csv("online_shoppers_intention 2.csv")

In [5]:
db.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [6]:
def exportImage(plot, name):
    pio.write_html(plot, os.path.join("plots", name + '.html'))
    
    # Change if you want to print plots !!
    # fig.show()

In [7]:
numeric_features = [
    "Administrative",
    "Informational",
    "ProductRelated",
    "Administrative_Duration",
    "Informational_Duration",
    "ProductRelated_Duration",
    "BounceRates",
    "PageValues",
    "SpecialDay",
]

categorical_features = [
    "TrafficType",
    "VisitorType",
    "OperatingSystems",
    "Browser",
    "Region",
    "Month",
    "Weekend",
    "Revenue",
]

In [8]:
db = db.astype({col: str for col in db.columns if col in categorical_features})
browser_13_df = db[db['Browser'] == "13"]
other_browsers_df = db[db['Browser'] != "13"]

In [9]:
fig = make_subplots(rows=len(numeric_features), cols=2,
                    subplot_titles=('Browser 13', 'Other Browsers'))

colors = ['blue', 'red']  # Define colors for the two databases

for j, feature_to_plot in enumerate(numeric_features):
    for i, database in enumerate([browser_13_df, other_browsers_df]):
        data = database[feature_to_plot]
        # Only show the feature name once in the subplot titles
        title = feature_to_plot if i == 0 else ''
        box_trace = go.Box(x=data, name=title, marker_color=colors[i], showlegend=False)
        fig.add_trace(box_trace, row=j+1, col=i+1)

fig.update_layout(height=len(numeric_features)*100, width=800, title_text="Comparing trends between Browser 13 and others for Numeric Features")
exportImage(fig, "Comparing trends between Browser 13 and others for Numeric Features")


In [10]:
fig = make_subplots(rows=len(categorical_features), cols=2,
                    subplot_titles=('Browser 13', 'Other Browsers'))

colors = ['blue', 'red']

for i, database in enumerate([browser_13_df, other_browsers_df]):
    for j, feature_to_plot in enumerate(categorical_features):
        data = database[feature_to_plot]
        unique_values = data.unique()
        box_trace = go.Bar(x=data.value_counts(), y=unique_values, text="", marker_color=colors[i], showlegend=False, orientation='h')
        fig.add_trace(box_trace, row=j+1, col=i+1)

for j, feature_to_plot in enumerate(categorical_features):
    fig.update_yaxes(title_text=feature_to_plot, row=j+1, col=1)

fig.update_layout(height=len(numeric_features)*150, width=1000, title_text="Comparing trends between Browser 13 and others for Categorical Features")
exportImage(fig, "Comparing trends between Browser 13 and others for Categorical Features")

Task 2

In [11]:
# Manual normalization function
def normalize_column(column):
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

for column in numeric_features:
    db[column] = normalize_column(db[column])


Task 3

In [12]:
# Convert categorical features to numerical using one-hot encoding
data_encoded = pd.get_dummies(db, columns=categorical_features, drop_first=True)

# Calculate the correlation matrix for dummified categorical features
correlation_matrix_categorical = list(data_encoded[
    data_encoded.columns.difference(numeric_features)
].columns)

correlation_matrix = data_encoded[numeric_features + correlation_matrix_categorical].corr()

numeric_features_indexes = [correlation_matrix.columns.get_loc(col) for col in numeric_features]
categorical_features_indexes = [correlation_matrix.columns.get_loc(col) for col in correlation_matrix_categorical]

data = correlation_matrix.iloc[numeric_features_indexes, numeric_features_indexes]
fig = px.imshow(
    data,
    labels=dict(x="Numeric Features", y="Numeric Features", color="Correlation"),
    title="Correlation Heatmap of Numerical Features",
)
fig.update_layout(height=1000, width=1000)
exportImage(fig, "Correlation Heatmap of Numerical Features")

data = correlation_matrix.iloc[categorical_features_indexes, categorical_features_indexes]
fig = px.imshow(
    data,
    labels=dict(x="Categorical Features", y="Categorical Features", color="Correlation"),
    title="Correlation Heatmap of Categorical Features",
)
fig.update_layout(height=1000, width=1000)
exportImage(fig, "Correlation Heatmap of Categorical Features")

data = correlation_matrix.iloc[numeric_features_indexes, categorical_features_indexes]
fig = px.imshow(
    data,
    labels=dict(x="Numeric Features", y="Categorical Features", color="Correlation"),
    title="Correlation Heatmap of Numerical vs Categorical Features",
)
fig.update_layout(height=1000, width=1000)
exportImage(fig, "Correlation Heatmap of Numerical vs Categorical Features")


In [13]:
fig = make_subplots(rows=len(numeric_features), cols=len(numeric_features))

for i, feature_to_plot_y in enumerate(numeric_features):
    for j, feature_to_plot_x in enumerate(numeric_features):
        trace = go.Scatter(x=db[feature_to_plot_x], y=db[feature_to_plot_y], text="", mode='markers', showlegend=True)
        fig.add_trace(trace, row=j+1, col=i+1)

# Add x and y labels to the subplots
for i, feature in enumerate(numeric_features):
    fig.update_xaxes(title_text=feature, row=len(numeric_features), col=i+1)
    fig.update_yaxes(title_text=feature, row=i+1, col=1)

fig.update_layout(height=len(numeric_features)*150, width=len(numeric_features)*150, title_text="Comparing trends between Browser 13 and others for Categorical Features")
exportImage(fig, "Scatter Plots")

In [14]:
from sklearn.decomposition import PCA
data = data_encoded

pca = PCA(n_components=2)
components = pca.fit_transform(data)
components_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
fig = px.scatter(components_df, x='PC1', y='PC2')
fig.show()

exportImage(fig, "PCA")


In [15]:
from sklearn.cluster import DBSCAN, Birch, AffinityPropagation

def perform_clustering(data, clustering_algo, cluster_col_name, title, export_name, **kwargs):
    clusterer = clustering_algo(**kwargs)
    data[cluster_col_name] = clusterer.fit_predict(data)
    fig = px.scatter(data, x="PC1", y="PC2", color=cluster_col_name, title=title)
    fig.show()
    exportImage(fig, export_name)
    return data

In [16]:
# Example usage for DBSCAN clustering
components_df = perform_clustering(components_df, DBSCAN, "DBSCAN_Cluster", "DBSCAN Clustering", "DBSCAN Clustering", eps=0.1, min_samples=5)

In [17]:
# Example usage for Birch clustering
components_df = perform_clustering(components_df, Birch, "Birch_Cluster", "Birch Clustering", "Birch Clustering", threshold=0.5, branching_factor=100, n_clusters=4)

In [18]:
# Example usage for Affinity Propagation clustering
components_df = perform_clustering(components_df, AffinityPropagation, "AP_Cluster", "Affinity Propagation Clustering", "Affinity Propagation Clustering", damping=0.95, max_iter=50, convergence_iter=5)

Task 4

In [20]:
import numpy as np
from sklearn.metrics import pairwise_distances

def silhouette_score(data, labels):
    distances = pairwise_distances(data)
    n = len(data)
    silhouette_values = np.zeros(n)

    for i in range(n):
        cluster_i = labels[i]
        a_i = np.mean(distances[i][labels == cluster_i])

        b_i = np.min([np.mean(distances[i][labels == cluster_j]) for cluster_j in np.unique(labels) if cluster_j != cluster_i])

        silhouette_values[i] = (b_i - a_i) / max(a_i, b_i)

    silhouette_avg = np.mean(silhouette_values)
    return silhouette_avg

from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

In [26]:
x = len(data)
data = components_df.iloc[:x, :2].to_numpy()

# Create a DataFrame and assign scores
results = pd.DataFrame({
    'Model': ['AP', 'DBSCAN', 'Birch'],
    'Silhouette Score': [
        silhouette_score(data, components_df[:x]['AP_Cluster'].to_numpy()),
        silhouette_score(data, components_df[:x]['DBSCAN_Cluster'].to_numpy()),
        silhouette_score(data, components_df[:x]['Birch_Cluster'].to_numpy())
    ],
    'Davies-Bouldin Score': [
        davies_bouldin_score(data, components_df[:x]['AP_Cluster'].to_numpy()),
        davies_bouldin_score(data, components_df[:x]['DBSCAN_Cluster'].to_numpy()),
        davies_bouldin_score(data, components_df[:x]['Birch_Cluster'].to_numpy())
    ],
    'Calinski-Harabasz Score': [
        calinski_harabasz_score(data, components_df[:x]['AP_Cluster'].to_numpy()),
        calinski_harabasz_score(data, components_df[:x]['DBSCAN_Cluster'].to_numpy()),
        calinski_harabasz_score(data, components_df[:x]['Birch_Cluster'].to_numpy())
    ]
})

# Display the results in a neat tabular format
print(results)


    Model  Silhouette Score  Davies-Bouldin Score  Calinski-Harabasz Score
0      AP          0.483332              0.575331              1128.672656
1  DBSCAN          0.445668              0.568138              1112.113017
2   Birch          0.152017              0.853131               876.654713


In [None]:

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))
    
def manhattan_distance(x1, x2):
    # Calculate the Manhattan distance between two points
    distance = 0
    for i in range(len(x1)):
        distance += abs(x1[i] - x2[i])
    return distance

def cosine_similarity_distance(x1, x2):
    dot_product = np.dot(x1, x2)
    norm_x1 = np.linalg.norm(x1)
    norm_x2 = np.linalg.norm(x2)
    
    if norm_x1 == 0 or norm_x2 == 0:
        return 1.0  # Handle division by zero
    
    return 1 - (dot_product / (norm_x1 * norm_x2))

In [27]:
components_df = perform_clustering(components_df, DBSCAN, "DBSCAN_Cluster_Custom_Euclidean_Distance", "Custom_Euclidean_Distance Clustering", "Custom_Euclidean_Distance Clustering", eps=0.1, min_samples=5, metric=euclidean_distance)
components_df = perform_clustering(components_df, DBSCAN, "DBSCAN_Cluster_Custom_Manhattan_Distance", "Custom_Manhattan_Distance Clustering", "Custom_Manhattan_Distance Clustering", eps=0.1, min_samples=5, metric=manhattan_distance)
components_df = perform_clustering(components_df, DBSCAN, "DBSCAN_Cluster_Custom_Cosine_Similarity_Distance", "Custom_Cosine_Similarity_Distance Clustering", "Custom_Cosine_Similarity_Distance Clustering", eps=0.1, min_samples=5, metric=cosine_similarity_distance)

x = len(data)
data = components_df.iloc[:x, :2].to_numpy()

results = pd.DataFrame({
    'Model': ['euclidean', 'manhattan_distance', 'cosine_similarity_distance'],
    'Silhouette Score': [
        silhouette_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Euclidean_Distance'].to_numpy()),
        silhouette_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Manhattan_Distance'].to_numpy()),
        silhouette_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Cosine_Similarity_Distance'].to_numpy())
    ],
    'Davies-Bouldin Score': [
        davies_bouldin_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Euclidean_Distance'].to_numpy()),
        davies_bouldin_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Manhattan_Distance'].to_numpy()),
        davies_bouldin_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Cosine_Similarity_Distance'].to_numpy())
    ],
    'Calinski-Harabasz Score': [
        calinski_harabasz_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Euclidean_Distance'].to_numpy()),
        calinski_harabasz_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Manhattan_Distance'].to_numpy()),
        calinski_harabasz_score(data, components_df[:x]['DBSCAN_Cluster_Custom_Cosine_Similarity_Distance'].to_numpy())
    ]
})

# Display the results in a neat tabular format
print(results)

ValueError: Metric 'cosine_similarity' not valid. Use sorted(sklearn.neighbors.VALID_METRICS['brute']) to get valid options. Metric can also be a callable function.