In [20]:
import pandas as pd
from sklearn.cluster import KMeans, Birch, AgglomerativeClustering
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.model_selection import train_test_split

In [21]:
df = pd.read_csv(r'C:\Users\marku\Desktop\ML\MLGit\datasets\airline.csv')

# EDA in Oblig1 Clustering notebook

In [22]:
df = df.drop(['Unnamed: 0', 'id', 'Flight Distance', 'Departure Delay in Minutes'], axis=1)
df['Gender'] = df['Gender'].replace(['Female', 'Male'], [0,1])
df['Type of Travel'] = df['Type of Travel'].replace(['Personal Travel', 'Business travel'], [0,1])
df['Class'] = df['Class'].replace(['Eco Plus', 'Business', 'Eco'], [0,1, 2])
df['Customer Type'] = df['Customer Type'].replace(['disloyal Customer', 'Loyal Customer'], [0,1])
df['satisfaction'] = df['satisfaction'].replace(['neutral or dissatisfied', 'satisfied'], [0,1])

def handle_null_median(df):
    # Need to set inplace=True, so it doesn't create a copy of the dataframe. Tried without and this led to null-values not being removed
    df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median(), inplace=True)

    return  df
df_unlabeled = df.drop(['satisfaction', 'Gate location', 'Departure/Arrival time convenient'], axis=1)
df_unlabeled = handle_null_median(df_unlabeled)
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Arrival Delay in Minutes,satisfaction
0,1,1,13,0,0,3,4,3,1,5,...,5,5,4,3,4,4,5,5,18.0,0
1,1,0,25,1,1,3,2,3,3,1,...,1,1,1,5,3,1,4,1,6.0,0
2,0,1,26,1,1,2,2,2,2,5,...,5,5,4,3,4,4,4,5,0.0,1
3,0,1,25,1,1,2,5,5,5,2,...,2,2,2,5,3,1,4,2,9.0,0
4,1,1,61,1,1,3,3,3,3,4,...,5,3,3,4,4,3,3,3,0.0,1


In [24]:
train_X, test_X, train_y, test_y = train_test_split(df_unlabeled, df['satisfaction'], random_state=42, test_size=0.25)

# Kmeans

In [37]:
kmeans = KMeans(n_clusters=3)
kmeans_cluster = kmeans.fit(train_X)
prediction = kmeans_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.6407188307005522

In [38]:
predict = kmeans.predict(test_X)
accuracy_score(test_y, predict)

0.526793963658762

# KMEANS TUNING

In [32]:
kmeans = KMeans(n_clusters=4)
kmeans_cluster = kmeans.fit(train_X)
prediction = kmeans_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.40577437634271696

Moving up from 3 clusters resulted in extremely poor accuracy. Will therefor test out 2

In [33]:
kmeans = KMeans(n_clusters=2)
kmeans_cluster = kmeans.fit(train_X)
prediction = kmeans_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.776568653602399

Since this is originally a classification dataset, I know 2 clusters will be the best. From this point on, I will start each model with 2 clusters.

In [34]:
predict = kmeans.predict(test_X)
accuracy_score(test_y, predict)

0.5512781028641823

# BIRCH

In [40]:
birch = Birch(n_clusters=2)
birch_cluster = birch.fit(train_X)
prediction = birch_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.7999767833907544

In [42]:
predict = birch.predict(test_X)
accuracy_score(test_y, predict)

0.5570526639975362

# BIRCH TUNING

In [73]:
birch = Birch(n_clusters=2, branching_factor=60)
birch_cluster = birch.fit(train_X)
prediction = birch_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.734814752055152

In [74]:
birch = Birch(n_clusters=2, branching_factor=100)
birch_cluster = birch.fit(train_X)
prediction = birch_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.7691711173890805

In [75]:
birch = Birch(n_clusters=2, branching_factor=150)
birch_cluster = birch.fit(train_X)
prediction = birch_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.7821382998018902

In [43]:
birch = Birch(n_clusters=2, branching_factor=200)
birch_cluster = birch.fit(train_X)
prediction = birch_cluster.predict(test_X)
silhouette_score(test_X, prediction)

0.796192176614458

In [44]:
predict = birch.predict(test_X)
accuracy_score(test_y, predict)

0.5564367108099785

# Agglomerative

In [81]:
agglomerative = AgglomerativeClustering(n_clusters=2)
agglomerative_cluster = agglomerative.fit(df_unlabeled)
silhouette_score(df_unlabeled, agglomerative.labels_)

0.8093158154120401

I could not find any parameters worth tuning