Project by:
- Jack Chen 4427737
- Joost Litjes 4540700
- Felicia Hung 7568479

In [49]:
import numpy as np
import pandas as pd

import os

import sklearn

from scipy import stats

import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [50]:
px.defaults.width = 600
px.defaults.height = 600

Task 1

In [51]:
db = pd.read_csv("online_shoppers_intention 1.csv")

In [52]:
db.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [53]:
def exportImage(plot, name):
    pio.write_html(plot, os.path.join("plots", name + '.html'))
    
    # Change if you want to print plots !!
    # fig.show()

In [54]:
numeric_features = [
    "Administrative",
    "Informational",
    "ProductRelated",
    "Administrative_Duration",
    "Informational_Duration",
    "ProductRelated_Duration",
    "BounceRates",
    "PageValues",
    "SpecialDay",
]

categorical_features = [
    "TrafficType",
    "VisitorType",
    "OperatingSystems",
    "Browser",
    "Region",
    "Month",
    "Weekend",
    "Revenue",
]

In [55]:
db = db.astype({col: str for col in db.columns if col in categorical_features})
browser_13_df = db[db['Browser'] == "13"]
other_browsers_df = db[db['Browser'] != "13"]

In [56]:
fig = make_subplots(rows=len(numeric_features), cols=2,
                    subplot_titles=('Browser 13', 'Other Browsers'))

colors = ['blue', 'red']  # Define colors for the two databases

for j, feature_to_plot in enumerate(numeric_features):
    for i, database in enumerate([browser_13_df, other_browsers_df]):
        data = database[feature_to_plot]
        # Only show the feature name once in the subplot titles
        title = feature_to_plot if i == 0 else ''
        box_trace = go.Box(x=data, name=title, marker_color=colors[i], showlegend=False)
        fig.add_trace(box_trace, row=j+1, col=i+1)

fig.update_layout(height=len(numeric_features)*100, width=800, title_text="Comparing trends between Browser 13 and others for Numeric Features")
exportImage(fig, "Comparing trends between Browser 13 and others for Numeric Features")


In [57]:
fig = make_subplots(rows=len(categorical_features), cols=2,
                    subplot_titles=('Browser 13', 'Other Browsers'))

colors = ['blue', 'red']

for i, database in enumerate([browser_13_df, other_browsers_df]):
    for j, feature_to_plot in enumerate(categorical_features):
        data = database[feature_to_plot]
        unique_values = data.unique()
        box_trace = go.Bar(x=data.value_counts(), y=unique_values, text="", marker_color=colors[i], showlegend=False, orientation='h')
        fig.add_trace(box_trace, row=j+1, col=i+1)

for j, feature_to_plot in enumerate(categorical_features):
    fig.update_yaxes(title_text=feature_to_plot, row=j+1, col=1)

fig.update_layout(height=len(numeric_features)*150, width=1000, title_text="Comparing trends between Browser 13 and others for Categorical Features")
exportImage(fig, "Comparing trends between Browser 13 and others for Categorical Features")

Task 2

In [58]:
# Manual normalization function
def normalize_column(column):
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

for column in numeric_features:
    db[column] = normalize_column(db[column])


Task 3

In [59]:
# Convert categorical features to numerical using one-hot encoding
data_encoded = pd.get_dummies(db, columns=categorical_features, drop_first=True)

# Calculate the correlation matrix for dummified categorical features
correlation_matrix_categorical = list(data_encoded[
    data_encoded.columns.difference(numeric_features)
].columns)

correlation_matrix = data_encoded[numeric_features + correlation_matrix_categorical].corr()

numeric_features_indexes = [correlation_matrix.columns.get_loc(col) for col in numeric_features]
categorical_features_indexes = [correlation_matrix.columns.get_loc(col) for col in correlation_matrix_categorical]

data = correlation_matrix.iloc[numeric_features_indexes, numeric_features_indexes]
fig = px.imshow(
    data,
    labels=dict(x="Numeric Features", y="Numeric Features", color="Correlation"),
    title="Correlation Heatmap of Numerical Features",
)
fig.update_layout(height=1000, width=1000)
exportImage(fig, "Correlation Heatmap of Numerical Features")

data = correlation_matrix.iloc[categorical_features_indexes, categorical_features_indexes]
fig = px.imshow(
    data,
    labels=dict(x="Categorical Features", y="Categorical Features", color="Correlation"),
    title="Correlation Heatmap of Categorical Features",
)
fig.update_layout(height=1000, width=1000)
exportImage(fig, "Correlation Heatmap of Categorical Features")

data = correlation_matrix.iloc[numeric_features_indexes, categorical_features_indexes]
fig = px.imshow(
    data,
    labels=dict(x="Numeric Features", y="Categorical Features", color="Correlation"),
    title="Correlation Heatmap of Numerical vs Categorical Features",
)
fig.update_layout(height=1000, width=1000)
exportImage(fig, "Correlation Heatmap of Numerical vs Categorical Features")


In [60]:
fig = make_subplots(rows=len(numeric_features), cols=len(numeric_features))

for i, feature_to_plot_y in enumerate(numeric_features):
    for j, feature_to_plot_x in enumerate(numeric_features):
        trace = go.Scatter(x=db[feature_to_plot_x], y=db[feature_to_plot_y], text="", mode='markers', showlegend=True)
        fig.add_trace(trace, row=j+1, col=i+1)

# Add x and y labels to the subplots
for i, feature in enumerate(numeric_features):
    fig.update_xaxes(title_text=feature, row=len(numeric_features), col=i+1)
    fig.update_yaxes(title_text=feature, row=i+1, col=1)

fig.update_layout(height=len(numeric_features)*150, width=len(numeric_features)*150, title_text="Comparing trends between Browser 13 and others for Categorical Features")
exportImage(fig, "Scatter Plots")

In [61]:
from sklearn.decomposition import PCA
data = data_encoded

pca = PCA(n_components=2)
components = pca.fit_transform(data)
components_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
fig = px.scatter(components_df, x='PC1', y='PC2')
fig.show()

exportImage(fig, "PCA")


EVERYTHING BELOW DOES NOT WORK YET

In [67]:
from sklearn.cluster import DBSCAN

# Apply DBSCAN clustering
dbscan = DBSCAN()
components_df['DBSCAN_Cluster'] = dbscan.fit_predict(components_df).astype(str)
fig = px.scatter(components_df, x="PC1", y="PC2", color="DBSCAN_Cluster",
                        title="DBSCAN Clustering")
fig.show()
exportImage(fig, "DBSCAN Clustering")

print(components_df)

            PC1       PC2 DBSCAN_Cluster Birch_Cluster
0      0.580498  0.000467              0             0
1     -0.746700  0.106194              1             1
2      0.762210 -0.154231              0             0
3      0.828345 -0.087503              0             0
4      1.121296  0.201166              0             0
...         ...       ...            ...           ...
12325  0.582406  0.362459              0             0
12326  0.759976  0.712991              0             0
12327  0.852729  0.609251              0             0
12328 -0.502736  0.082688              1             1
12329  0.368428  1.300671              0             0

[12330 rows x 4 columns]


In [63]:
from sklearn.cluster import Birch

# Apply Birch clustering
birch = Birch()
components_df['Birch_Cluster'] = birch.fit_predict(components_df).astype(str)
fig = px.scatter(components_df, x="PC1", y="PC2", color="Birch_Cluster",
                       title="Birch Clustering")
fig.show()
exportImage(fig, "Birch Clustering")

In [64]:
break()

SyntaxError: invalid syntax (759819533.py, line 1)

In [None]:
from sklearn.cluster import AffinityPropagation

# Apply Affinity Propagation clustering
affinity_propagation = AffinityPropagation()
components_df['AP_Cluster'] = affinity_propagation.fit_predict(components_df).astype(str)
fig = px.scatter(components_df, x="PC1", y="PC2", color="AP_Cluster",
                    title="Affinity Propagation Clustering")
fig.show()
exportImage(fig, "Affinity Propagation Clustering")

: 

Task 4