In [3]:
import pandas as pd
import plotly as pl
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.cluster import AffinityPropagation, DBSCAN, Birch
import numpy as np
import math
from collections import Counter
import seaborn as sns
from sklearn.decomposition import PCA
DATA = pd.read_csv("online_shoppers_intention.csv")


Task 1: Get data on screen
Goal: Make visualizations that help understand the data

In [None]:
# New- vs. Returning customers pie chart
fig = px.pie(DATA, names='VisitorType', title='Types of Customers')
fig.show()



# DURATION BOX PLOTTED
# Product related has a couple (<10) extreme outliers, had to filter those out
threshold = 15000 

filtered_data = DATA[
    (DATA['Administrative_Duration'] <= threshold) &
    (DATA['Informational_Duration'] <= threshold) &
    (DATA['ProductRelated_Duration'] <= threshold)
]

fig = px.box(filtered_data, y=['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration'])
fig.show()


# BOUNCERATES VS EXITRATES scatter
fig = px.scatter(DATA, x="BounceRates", y="ExitRates")
fig.show()


## PAGEVALUES PER MONTH
monthly_average = DATA.groupby('Month')['PageValues'].mean().reset_index()
# Create the line chart
fig = px.line(monthly_average, y="PageValues", x="Month", title="Average Page Values by Month")
fig.show()


# EXITRATES PER TRAFFICTYPE
fig = px.bar(DATA, x="TrafficType", y="ExitRates",
             color='TrafficType', barmode='stack',
             height=400)
fig.show()

purchase_counts = DATA[DATA['Revenue']].groupby('Month').size().reset_index(name='Count')

fig = px.bar(purchase_counts, x='Month', y='Count', title='Count of Purchases by Month')
fig.show()

# PURCHASES CLOSE TO IMPORTANT DATES
SD_purchase_data = DATA[DATA['Revenue']]

SD_purchase_counts = SD_purchase_data['SpecialDay'].value_counts().reset_index()
SD_purchase_counts.columns = ['SpecialDay', 'Count']

SD_purchase_counts = SD_purchase_counts.sort_values('SpecialDay')

fig = px.line(SD_purchase_counts, x='SpecialDay', y='Count', title='Count of Purchases by Days to Special Day')

fig.update_layout(xaxis=dict( autorange="reversed"))

fig.show()

In [5]:

# cleaning
cleaned = DATA.dropna()
if len(DATA)-len(cleaned) == 0:
    print("There are no missing values in the dataset")

# candidates for one-hot-encoding (OHE)
featuresOHN = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType']
#data[featuresOHN] = data[featuresOHN].astype(str)
transformer = make_column_transformer((OneHotEncoder(categories='auto', sparse_output=False), featuresOHN), remainder='passthrough')
transformed = transformer.fit_transform(DATA)
dataOHN = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# clean column names for readability
flist = dataOHN.columns.to_list()
newflist = [col.replace('remainder__', '') if col.startswith('remainder__') else col.replace('onehotencoder__', '') for col in flist]
dataOHN.columns = newflist

# data normalization
featuresMinMax = ['BounceRates', 'ExitRates']
featuresZscore = ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'PageValues']

for f in featuresMinMax:
    dataOHN[f] = (dataOHN[f] - dataOHN[f].min()) /(dataOHN[f].max() - dataOHN[f].min())

for f in featuresZscore:
    dataOHN[f] = dataOHN[f].astype(float)
    dataOHN[f] = np.log1p(dataOHN[f])




There are no missing values in the dataset


Task 3: Clustering Algorithms
Goal: Succesfully use clustering algorithms

In [5]:
# Affinity Propagation, different blocks to test each clustering algorithm separately
affinity_propagation = AffinityPropagation(random_state=27)
affinity_propagation.fit(dataOHN)

# Affinity Propagation information
print(affinity_propagation.cluster_centers_)
print(affinity_propagation.labels_)
print(affinity_propagation.n_iter_)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[ 39  32 322 ... 146 338 277]
125


In [17]:
#DBSCAN, different blocks to test each clustering algorithm separately
dbscan = DBSCAN(eps=0.75, min_samples=10)
dbscan.fit(dataOHN)



In [6]:
# BIRCH, different blocks to test each clustering algorithm separately
birch = Birch(threshold=0.75, n_clusters=None)
birch.fit(dataOHN)

Task 4: Evaluation of clustering methods
Goal: Be able to evaluate clustering methods using different evaluation measures and reason on their performance

In [None]:
#############################
# Task 4.1: Silhoutte score #
#############################

# a(i) = 1 / |Ci| - 1 * sum d(i,j), 1 divided by the amount in the cluster times the sum of all distances
# between i and all other values in the cluster

#affinity_labels = affinity_propagation.labels_

birch_labels = birch.labels_


dbscan_labels = dbscan.labels_
print(birch_labels)





# Function to calculate distances to the current cluster
def distances_to_cluster(point_index, data, all_labels, current_label):
    # Get the indices of all points in the same cluster
    cluster_indices = np.where(all_labels == current_label)[0]
    
    point = data.iloc[point_index].values
    
    cluster_data = data.iloc[cluster_indices].values

    distances = np.linalg.norm(cluster_data - point, axis=1)
    
    return distances


# Silhouette score calculation
def calc_silhoutte_score(labels):
    silhouette_scores = []
    
    for id, point in dataOHN.iterrows():
        
        # Calculating a(i) - mean intra-cluster distance
        current_label = labels[id]
        cluster_indices = np.where(labels == current_label)[0]
        cluster_size = len(cluster_indices)
        
        
        if cluster_size < 2:
            silhouette_scores.append(0)
            continue
        
        # Sum of distances within the same cluster
        sum_of_distances = distances_to_cluster(id, dataOHN, labels, current_label)
        current_ai = (1 / (cluster_size - 1)) * np.sum(sum_of_distances)
        
        list_of_bi = []
        
        # Calculating b(i) - minimum average distance to other clusters
        for other_label in np.unique(labels):
            if other_label != current_label:
                other_cluster_indices = np.where(labels == other_label)[0]
                other_cluster_size = len(other_cluster_indices)
                
                if other_cluster_size == 0:
                    continue
                
                # Distance to points in other clusters
                other_cluster_dists = distances_to_cluster(id, dataOHN, labels, other_label)
                current_bi = np.mean(other_cluster_dists)
                list_of_bi.append(current_bi)
        
        if list_of_bi:
            bi = min(list_of_bi)
        else:
            bi = float('inf')  
        
        # Calculating the silhouette score for the current point
        if max(current_ai, bi) > 0:
            silhouette_score = (bi - current_ai) / max(current_ai, bi)
        else:
            silhouette_score = 0
        
        silhouette_scores.append(silhouette_score)
    
    # Return the mean silhouette score, ignore NaNs if present
    return np.nanmean(silhouette_scores)


# affin_sil_score = calc_silhoutte_score(affinity_labels)
birch_sil_score = calc_silhoutte_score(birch_labels)
#print(birch_sil_score)
#dbscan_sil_score = calc_silhoutte_score(dbscan_labels)
#print(dbscan_sil_score)





        







In [25]:
# SILLHOUETTE SCORE

birch_labels = birch.labels_

first_1000_rows = dataOHN[:10]


def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))


def silhouette_score(labels):

    unique_clusters = np.unique(labels)
    num_clusters = len(unique_clusters)

    for index, row in first_1000_rows.iterrows():
        print("ploink")
        current_cluster = birch_labels[index]
        Ci = list(labels).count(current_cluster)

        own_clust_dist = 0

        for index2, row2 in first_1000_rows.iterrows():
            if birch_labels[index2] == current_cluster and index2 != index:
                dist = euclidean_distance(row,row2)
                own_clust_dist += dist
        
        if Ci > 1:
            Ai = (1 / (Ci - 1)) * own_clust_dist
        else:
            Ai = 0

        min_distance = float('inf')  # Initialize minimum distance as infinity
        
        for j in range(num_clusters - 1):
            other_clust = unique_clusters[j]
            if other_clust != current_cluster:
                distance_sum = 0
                for index3,row3 in first_1000_rows.iterrows():
                    if labels[index3] == other_clust:
                        distance_sum = euclidean_distance(row, row3)
                average_dist = distance_sum / list(labels).count(other_clust)
                if average_dist < min_distance:
                    min_distance = average_dist
                    neighboring_cluster = unique_clusters[j]
        Bi = min_distance
        
        if Ci > 1 and Bi != 0 and Ai != 0:
            Si = (Bi - Ai) / max(Bi,Ai)
        else:
            Si = 0
        
    return Si
         

birch_sil_score = silhouette_score(birch_labels)
print(birch_sil_score)




ploink
ploink
ploink
ploink
ploink
ploink
ploink
ploink
ploink
ploink
0


In [None]:
# 5.1: Manual Euclidian Distance
def manual_euclidean_distance(p1, p2):

    distance = 0.0
    
    for i in range(len(p1)):
        distance += (p1[i] - p2[i]) ** 2
    
    return distance ** 0.5


# 5.2: Manual Manhatten Distance
def manual_manhattan_distance(p1, p2):
    distance = 0.0
    
    for i in range(len(p1)):
        distance += abs(p1[i] - p2[i])
    
    return distance


# 5.3: Manual 
def cosine_similarity_distance(p1, p2):
    dot_product = sum(a * b for a, b in zip(p1, p2))
    magnitude_p1 = math.sqrt(sum(a**2 for a in p1))
    magnitude_p2 = math.sqrt(sum(b**2 for b in p2))
    
    if magnitude_p1 == 0 or magnitude_p1 == 0:
        return 0  
    
    cos_similarity = dot_product / (magnitude_p1 * magnitude_p2)

    return 1 - cos_similarity

