Project by:
- Jack Chen 4427737
- Joost Litjes 4540700
- Felicia Hung 7568479

In [1]:
import numpy as np
import pandas as pd

import os

import sklearn

from scipy import stats

import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
# Default canvas for every Plotly Express figure: 600 x 600 pixels.
px.defaults.width = px.defaults.height = 600

Task 1

In [3]:
# Load the shopper-session dataset from the working directory.
# To analyse the second variant, swap the filename for
# "online_shoppers_intention 2.csv".
db = pd.read_csv(filepath_or_buffer="online_shoppers_intention 1.csv")

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
db.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0
mean,1.787278,66.413076,0.40727,28.974482,19.938455,814.960064,0.025455,0.048373,3.888318,0.048658,2.073523,2.292441,2.999587,3.304007
std,2.776484,161.804541,1.192336,116.740632,23.71276,1162.622651,0.056032,0.054975,13.813926,0.176078,0.767876,1.423671,2.324768,3.109054
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000176,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,5.0,127.0,0.0,0.013636,0.0,0.0,2.0,2.0,1.0,1.0
50%,0.0,0.0,0.0,0.0,13.0,424.0,0.0,0.026389,0.0,0.0,2.0,2.0,2.0,2.0
75%,3.0,68.0,0.0,0.0,27.0,1071.611111,0.016667,0.055119,0.0,0.0,2.0,2.0,4.0,3.0
max,24.0,2047.234848,16.0,1636.0,328.0,13158.66667,0.2,0.2,261.491286,1.0,7.0,10.0,9.0,18.0


In [5]:
def exportImage(plot, name):
    """Save a Plotly figure as a standalone HTML file under ``plots/``.

    Parameters
    ----------
    plot
        The figure to export; it is passed unchanged to ``pio.write_html``.
    name : str
        File name without extension. The figure is written to
        ``plots/<name>.html``.
    """
    # write_html does not create missing folders, so make sure the output
    # directory exists (a fresh checkout may not have "plots/" yet).
    os.makedirs("plots", exist_ok=True)
    pio.write_html(plot, os.path.join("plots", name + '.html'))

    # Uncomment the next line to also display the figure inline.
    # plot.show()

In [6]:
# Continuous / count-valued columns: these are min-max normalized and used
# for the box plots, scatter plots and the numeric correlation heatmap.
numeric_features = [
    "Administrative",
    "Informational",
    "ProductRelated",
    "Administrative_Duration",
    "Informational_Duration",
    "ProductRelated_Duration",
    "BounceRates",
    # ExitRates is a numeric column of the dataset; leaving it out of this
    # list would make downstream cells skip its normalization and treat it
    # as a categorical column.
    "ExitRates",
    "PageValues",
    "SpecialDay",
]

# Label-valued columns: these are cast to str and one-hot encoded.
categorical_features = [
    "TrafficType",
    "VisitorType",
    "OperatingSystems",
    "Browser",
    "Region",
    "Month",
    "Weekend",
    "Revenue",
]

In [7]:
# Cast every categorical column to str so its codes are handled as labels.
categorical_dtypes = {col: str for col in db.columns if col in categorical_features}
db = db.astype(categorical_dtypes)

# Partition the sessions into Browser "13" and all remaining browsers.
# NOTE(review): the describe() output for file 1 reports Browser max = 10,
# so the Browser-13 subset is presumably empty for that file - confirm which
# CSV this comparison is meant for.
is_browser_13 = db['Browser'] == "13"
browser_13_df = db[is_browser_13]
other_browsers_df = db[~is_browser_13]

In [8]:
# One row per numeric feature; left column = Browser 13, right column = others.
plot_title = "Comparing trends between Browser 13 and others for Numeric Features"
n_rows = len(numeric_features)

fig = make_subplots(
    rows=n_rows,
    cols=2,
    subplot_titles=('Browser 13', 'Other Browsers'),
)

colors = ['blue', 'red']  # one color per browser group
browser_groups = [browser_13_df, other_browsers_df]

for row_number, feature in enumerate(numeric_features, start=1):
    for col_number, (group_df, group_color) in enumerate(zip(browser_groups, colors), start=1):
        # The feature name is attached to the left-hand trace only.
        trace_name = feature if col_number == 1 else ''
        fig.add_trace(
            go.Box(
                x=group_df[feature],
                name=trace_name,
                marker_color=group_color,
                showlegend=False,
            ),
            row=row_number,
            col=col_number,
        )

fig.update_layout(height=n_rows * 100, width=800, title_text=plot_title)
exportImage(fig, plot_title)


In [9]:
# One row per categorical feature; left column = Browser 13, right column = others.
fig = make_subplots(rows=len(categorical_features), cols=2,
                    subplot_titles=('Browser 13', 'Other Browsers'))

colors = ['blue', 'red']

for i, database in enumerate([browser_13_df, other_browsers_df]):
    for j, feature_to_plot in enumerate(categorical_features):
        # Take labels and counts from the same value_counts() result so each
        # bar is paired with its own category. value_counts() is ordered by
        # frequency while unique() is ordered by first appearance, so mixing
        # the two would attach counts to the wrong labels.
        counts = database[feature_to_plot].value_counts()
        bar_trace = go.Bar(x=counts.tolist(), y=counts.index.tolist(), text="",
                           marker_color=colors[i], showlegend=False, orientation='h')
        fig.add_trace(bar_trace, row=j+1, col=i+1)

# Label each row with the feature it shows (left-hand column only).
for j, feature_to_plot in enumerate(categorical_features):
    fig.update_yaxes(title_text=feature_to_plot, row=j+1, col=1)

# Height scales with the number of rows actually drawn (categorical features).
fig.update_layout(height=len(categorical_features)*150, width=1000, title_text="Comparing trends between Browser 13 and others for Categorical Features")
exportImage(fig, "Comparing trends between Browser 13 and others for Categorical Features")

Task 2

In [10]:
# Manual normalization function
def normalize_column(column):
    """Min-max scale ``column`` to the range [0, 1].

    Parameters
    ----------
    column
        A pandas Series (or NumPy array) of numeric values.

    Returns
    -------
    The rescaled values, ``(column - min) / (max - min)``. A constant column
    (``max == min``) is returned as all zeros instead of NaN.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Avoid 0/0: every value equals the minimum, so map them all to 0.0.
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Rescale every numeric feature of `db` column by column; the results replace
# the original columns.
db[numeric_features] = db[numeric_features].apply(normalize_column)


Task 3

In [11]:
# Convert categorical features to numerical using one-hot encoding
data_encoded = pd.get_dummies(db, columns=categorical_features, drop_first=True)

# Every encoded column that is not listed in numeric_features is treated as a
# (dummified) categorical column.
correlation_matrix_categorical = list(data_encoded.columns.difference(numeric_features))

# Single correlation matrix: numeric columns first, then the categorical dummies.
correlation_matrix = data_encoded[numeric_features + correlation_matrix_categorical].corr()

numeric_features_indexes = [correlation_matrix.columns.get_loc(col) for col in numeric_features]
categorical_features_indexes = [correlation_matrix.columns.get_loc(col) for col in correlation_matrix_categorical]

# (row positions, column positions, y-axis label, x-axis label, title).
# px.imshow draws DataFrame rows along y and columns along x, so the y label
# describes the row selection and the x label the column selection.
heatmap_specs = [
    (numeric_features_indexes, numeric_features_indexes,
     "Numeric Features", "Numeric Features",
     "Correlation Heatmap of Numerical Features"),
    (categorical_features_indexes, categorical_features_indexes,
     "Categorical Features", "Categorical Features",
     "Correlation Heatmap of Categorical Features"),
    (numeric_features_indexes, categorical_features_indexes,
     "Numeric Features", "Categorical Features",
     "Correlation Heatmap of Numerical vs Categorical Features"),
]

for row_indexes, col_indexes, y_label, x_label, heatmap_title in heatmap_specs:
    data = correlation_matrix.iloc[row_indexes, col_indexes]
    fig = px.imshow(
        data,
        labels=dict(x=x_label, y=y_label, color="Correlation"),
        title=heatmap_title,
    )
    fig.update_layout(height=1000, width=1000)
    exportImage(fig, heatmap_title)


In [12]:
# Pairwise scatter plots of the numeric features:
# row i shows feature i on the y-axis, column j shows feature j on the x-axis.
fig = make_subplots(rows=len(numeric_features), cols=len(numeric_features))

for i, feature_to_plot_y in enumerate(numeric_features):
    for j, feature_to_plot_x in enumerate(numeric_features):
        # The legend is disabled: the traces are unnamed, so it would only
        # list one generic entry per subplot.
        trace = go.Scatter(x=db[feature_to_plot_x], y=db[feature_to_plot_y], text="", mode='markers', showlegend=False)
        # The y feature selects the row and the x feature the column, so the
        # data in each cell matches the axis titles set below.
        fig.add_trace(trace, row=i+1, col=j+1)

# Add x and y labels to the subplots (bottom row and left-hand column only)
for i, feature in enumerate(numeric_features):
    fig.update_xaxes(title_text=feature, row=len(numeric_features), col=i+1)
    fig.update_yaxes(title_text=feature, row=i+1, col=1)

fig.update_layout(height=len(numeric_features)*150, width=len(numeric_features)*150, title_text="Scatter Plots of Numeric Features")
exportImage(fig, "Scatter Plots")

In [13]:
from sklearn.decomposition import PCA

# Project the one-hot encoded frame onto its first two principal components.
data = data_encoded
pca = PCA(n_components=2)
components = pca.fit_transform(data)

# Wrap the projection in a frame so the components can be plotted by name.
components_df = pd.DataFrame(data=components, columns=['PC1', 'PC2'])

fig = px.scatter(data_frame=components_df, x='PC1', y='PC2')
fig.show()
exportImage(fig, "PCA")


**TODO:** Everything below this point does not work yet.

In [14]:
from sklearn.cluster import DBSCAN

# Apply DBSCAN clustering on the two principal components only. Selecting the
# PC columns explicitly keeps label columns stored in components_df out of the
# feature matrix, so the cell gives the same result however often it is re-run.
dbscan = DBSCAN(eps=0.1, min_samples=5)
components_df['DBSCAN_Cluster'] = dbscan.fit_predict(components_df[['PC1', 'PC2']])
fig = px.scatter(components_df, x="PC1", y="PC2", color="DBSCAN_Cluster",
                        title="DBSCAN Clustering")
fig.show()
exportImage(fig, "DBSCAN Clustering")

In [15]:
from sklearn.cluster import Birch

# Apply Birch clustering on the two principal components only. Fitting on the
# whole components_df would feed cluster-label columns stored in that frame
# into the model as if they were features.
birch = Birch(threshold=0.5, branching_factor=100, n_clusters=4)
components_df['Birch_Cluster'] = birch.fit_predict(components_df[['PC1', 'PC2']])
fig = px.scatter(components_df, x="PC1", y="PC2", color="Birch_Cluster",
                       title="Birch Clustering")
fig.show()
exportImage(fig, "Birch Clustering")

In [23]:
from sklearn.cluster import AffinityPropagation

# Apply Affinity Propagation clustering on the two principal components only.
# Fitting on the whole components_df would feed cluster-label columns stored
# in that frame into the model as if they were features.
# random_state is fixed so repeated runs produce the same clustering.
affinity_propagation = AffinityPropagation(damping=0.85, max_iter=50, convergence_iter=5,
                                           random_state=0)
components_df['AP_Cluster'] = affinity_propagation.fit_predict(components_df[['PC1', 'PC2']])
fig = px.scatter(components_df, x="PC1", y="PC2", color="AP_Cluster",
                    title="Affinity Propagation Clustering")
fig.show()
exportImage(fig, "Affinity Propagation Clustering")

Task 4