In [7]:
import pandas as pd
import plotly as pl
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.cluster import AffinityPropagation, DBSCAN, Birch
import numpy as np
# Load the CSV once; the cells below read it through the notebook-level name DATA.
DATA = pd.read_csv(filepath_or_buffer="online_shoppers_intention.csv")

Task 1: Get data on screen
Goal: Make visualizations that help understand the data

In [None]:
# New- vs. Returning customers pie chart
# No `values` column is given, so each slice is sized by how many rows carry that 'VisitorType'.
fig = px.pie(data_frame=DATA, names='VisitorType', title='Types of Customers')
fig.show()



# DURATION BOX PLOTTED
# Original note: product-related durations contain a couple (<10) extreme outliers.
# Rows where ANY of the three duration columns exceeds the cutoff are left out of
# this plot only; DATA itself is not modified.
threshold = 15000
duration_columns = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']

# True for rows whose three durations are all at or below the cutoff
within_threshold = DATA[duration_columns].le(threshold).all(axis=1)
filtered_data = DATA[within_threshold]

fig = px.box(filtered_data, y=duration_columns)
fig.show()


# BOUNCERATES VS EXITRATES scatter
# One marker per row of DATA: bounce rate on x, exit rate on y.
fig = px.scatter(data_frame=DATA, x="BounceRates", y="ExitRates")
fig.show()


## PAGEVALUES PER MONTH
# groupby() returns the month labels in alphabetical order (Aug, Dec, Feb, ...),
# and a line chart joins points in row order, so the line would jump around the
# calendar. Rank the labels by calendar position and sort before plotting.
# NOTE(review): both 'Jun' and 'June' are listed because the spelling used in the
# CSV is not visible here -- confirm with DATA['Month'].unique().
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'June',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_rank = {label: position for position, label in enumerate(month_labels)}

monthly_average = (
    DATA.groupby('Month')['PageValues'].mean().reset_index()
    # labels missing from month_rank map to NaN and therefore sort last
    .sort_values('Month', key=lambda months: months.map(month_rank))
    .reset_index(drop=True)
)
# Create the line chart
fig = px.line(monthly_average, y="PageValues", x="Month", title="Average Page Values by Month")
fig.show()


# EXITRATES PER TRAFFICTYPE
# Plotting the raw rows draws one bar segment per row and stacks them, so each
# bar's height is the SUM of exit rates -- driven mostly by how many rows a
# traffic type has. Aggregate to the mean exit rate per traffic type instead.
exit_rate_by_traffic = DATA.groupby('TrafficType', as_index=False)['ExitRates'].mean()
# TrafficType is treated as a category elsewhere in this notebook (it is one-hot
# encoded), so cast it to str to get discrete colours instead of a colour scale.
exit_rate_by_traffic['TrafficType'] = exit_rate_by_traffic['TrafficType'].astype(str)

fig = px.bar(exit_rate_by_traffic, x="TrafficType", y="ExitRates",
             color='TrafficType',
             title='Average Exit Rate by Traffic Type',
             height=400)
# Numeric-looking strings would otherwise be auto-detected as a linear axis
fig.update_xaxes(type='category')
fig.show()

# PURCHASES PER MONTH
# Count only the rows where 'Revenue' is True, grouped by month. groupby() yields
# the month labels alphabetically, so they are re-sorted into calendar order to
# make the bars read left-to-right through the year.
# NOTE(review): both 'Jun' and 'June' are listed because the spelling used in the
# CSV is not visible here -- confirm with DATA['Month'].unique().
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'June',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_rank = {label: position for position, label in enumerate(month_labels)}

purchase_counts = (
    DATA[DATA['Revenue']].groupby('Month').size().reset_index(name='Count')
    # labels missing from month_rank map to NaN and therefore sort last
    .sort_values('Month', key=lambda months: months.map(month_rank))
    .reset_index(drop=True)
)

fig = px.bar(purchase_counts, x='Month', y='Count', title='Count of Purchases by Month')
fig.show()

# PURCHASES CLOSE TO IMPORTANT DATES
# Keep only the rows where 'Revenue' is True, then count them per 'SpecialDay' value.
SD_purchase_data = DATA.loc[DATA['Revenue']]

SD_purchase_counts = (
    SD_purchase_data['SpecialDay']
    .value_counts()
    .rename_axis('SpecialDay')      # name the index so it becomes the 'SpecialDay' column
    .reset_index(name='Count')
    .sort_values('SpecialDay')      # ascending x so the line is drawn in order
)

fig = px.line(
    SD_purchase_counts,
    x='SpecialDay',
    y='Count',
    title='Count of Purchases by Days to Special Day',
)
# Flip the x axis so larger 'SpecialDay' values appear on the left
fig.update_xaxes(autorange="reversed")
fig.show()

In [9]:

# cleaning
# Drop rows with any missing value and report the outcome either way.
cleaned = DATA.dropna()
dropped_rows = len(DATA) - len(cleaned)
if dropped_rows == 0:
    print("There are no missing values in the dataset")
else:
    print(f"Dropped {dropped_rows} rows containing missing values")

# candidates for one-hot-encoding (OHE)
featuresOHN = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType']
transformer = make_column_transformer(
    (OneHotEncoder(categories='auto', sparse_output=False), featuresOHN),
    remainder='passthrough',  # every other column is carried through unchanged
)
# Fit on the cleaned frame (not the raw DATA) so rows dropped above never reach
# the encoder or the clustering cells that consume dataOHN.
transformed = transformer.fit_transform(cleaned)
dataOHN = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# clean column names for readability: strip the leading '<transformer>__' prefix
# ('onehotencoder__' / 'remainder__') that get_feature_names_out() adds
flist = dataOHN.columns.to_list()
newflist = [col.split('__', 1)[-1] for col in flist]
dataOHN.columns = newflist

# data normalization
featuresMinMax = ['BounceRates', 'ExitRates']
featuresZscore = ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'PageValues']

# Min-max scale to [0, 1]
for f in featuresMinMax:
    # cast first, as is done for the log1p columns below -- presumably the
    # passthrough columns come back as object dtype; TODO confirm
    column = dataOHN[f].astype(float)
    low = column.min()
    value_range = column.max() - low
    # a constant column has zero range; map it to 0.0 instead of dividing by zero
    dataOHN[f] = (column - low) / value_range if value_range > 0 else 0.0

# NOTE(review): despite the list name, no z-score is applied -- these columns only
# get log(1 + x). Confirm whether standardisation was also intended.
for f in featuresZscore:
    dataOHN[f] = np.log1p(dataOHN[f].astype(float))




There are no missing values in the dataset


Task 3: Clustering Algorithms
Goal: Successfully use clustering algorithms

In [None]:
# Affinity Propagation, different blocks to test each clustering algorithm separately
# fit() returns the fitted estimator, so construction and fitting are chained;
# the bare name on the last line keeps the cell's estimator display.
affinity_propagation = AffinityPropagation(random_state=42).fit(dataOHN)
affinity_propagation


In [15]:
# DBSCAN, different blocks to test each clustering algorithm separately
# fit() returns the fitted estimator, so construction and fitting are chained;
# the bare name on the last line keeps the cell's estimator display.
dbscan = DBSCAN(eps=0.75, min_samples=10).fit(dataOHN)
dbscan


In [None]:
# BIRCH, different blocks to test each clustering algorithm separately
# n_clusters=None is passed through unchanged from the original cell.
# fit() returns the fitted estimator, so construction and fitting are chained;
# the bare name on the last line keeps the cell's estimator display.
birch = Birch(threshold=0.75, n_clusters=None).fit(dataOHN)
birch