In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc

In [None]:
# Read the combined VLE dataset from disk into a DataFrame.
csv_path = 'data/combineVle.csv'
data = pd.read_csv(csv_path)

# Show (n_rows, n_columns) as the cell output.
data.shape

In [None]:
# Give the weighted-score column the shorter name 'total_score' that the
# following cells use.
data = data.rename(columns={'total_score*weight': 'total_score'})

# Preview the first rows to confirm the new column name.
data.head()

In [None]:
# Encode the categorical string columns as integers so they can be used as
# numeric features. Any label that is absent from its table becomes NaN
# after .map().

# age_band -> 1..3
age_band_codes = {"0-35": 1, "35-55": 2, "55<=": 3}

# highest_education -> 0..4
highest_education_codes = {
    "No Formal quals": 0,
    "Lower Than A Level": 1,
    "A Level or Equivalent": 2,
    "HE Qualification": 3,
    "Post Graduate Qualification": 4,
}

# imd_band -> 0..9, one code per 10% band
imd_band_codes = {
    "0-10%": 0, "10-20%": 1, "20-30%": 2, "30-40%": 3, "40-50%": 4,
    "50-60%": 5, "60-70%": 6, "70-80%": 7, "80-90%": 8, "90-100%": 9,
}

# module_presentation -> 1..22 (the name `change` is kept from the original cell)
change = {
    "AAA_2013J": 1, "AAA_2014J": 2,
    "BBB_2013B": 3, "BBB_2013J": 4, "BBB_2014B": 5, "BBB_2014J": 6,
    "CCC_2014B": 7, "CCC_2014J": 8,
    "DDD_2013B": 9, "DDD_2013J": 10, "DDD_2014B": 11, "DDD_2014J": 12,
    "EEE_2013J": 13, "EEE_2014B": 14, "EEE_2014J": 15,
    "FFF_2013B": 16, "FFF_2013J": 17, "FFF_2014B": 18, "FFF_2014J": 19,
    "GGG_2013J": 20, "GGG_2014B": 21, "GGG_2014J": 22,
}

column_encodings = {
    "age_band": age_band_codes,
    "highest_education": highest_education_codes,
    "imd_band": imd_band_codes,
    "module_presentation": change,
}

for column, codes in column_encodings.items():
    # Only encode columns that are not numeric yet. Re-running this cell on an
    # already-encoded column would look the integer codes up in the label
    # table and replace every value with NaN.
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(codes)

# final_result is intentionally left with its original labels:
# data["final_result"] = data["final_result"].map({"Withdrawn":0,  "Fail":0,  "Pass":1, "Distinction":1})

# Factors are ['imd_band', 'age_band', 'highest_education', 'module_presentation', 'total_click'].
# The target is 'total_score'. Per the original author's note, a total score
# greater than 4000 indicates that 'final_result' is Pass or Distinction.
data.shape

In [None]:
# Prepare the data: total_score on the x-axis, the five factor columns on the y-axis.
factor_columns = ['imd_band', 'age_band', 'highest_education', 'module_presentation', 'total_click']
x = data['total_score']
y = data[factor_columns]

# plt.scatter(x, y) requires x and y to have the same number of elements, so a
# single call with the whole five-column frame raises ValueError. Draw one
# panel per factor instead; separate panels also give each factor its own
# y-axis scale.
fig, axes = plt.subplots(
    1, len(factor_columns),
    figsize=(4 * len(factor_columns), 4),
    sharex=True,
)
for ax, column in zip(axes, factor_columns):
    ax.scatter(x, y[column], s=5, alpha=0.5)
    ax.set_xlabel('total_score')
    ax.set_ylabel(column)

# No clustering has been run at this point, so title the figure by what it shows.
fig.suptitle('Factors vs. total score')
fig.tight_layout()

# Show the plot
plt.show()


In [None]:
# K-Means clustering
# NOTE(review): `extracted_data` was not defined by any earlier cell, so this
# cell raised NameError on a fresh kernel. It is assumed here to be the five
# factor columns -- confirm this matches the intended feature set.
feature_columns = ['imd_band', 'age_band', 'highest_education', 'module_presentation', 'total_click']

# Rescale every feature to [0, 1]. K-Means is distance-based, so a column with
# a wider numeric range (presumably total_click -- verify) would otherwise
# dominate the small integer codes of the other columns.
# NOTE(review): KMeans rejects NaN; any label left unmapped during encoding
# must be imputed or dropped before this cell runs.
extracted_data = pd.DataFrame(
    MinMaxScaler().fit_transform(data[feature_columns]),
    columns=feature_columns,
    index=data.index,
)

# random_state makes the cluster assignment reproducible across runs; n_init
# is set explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_predicted = km.fit_predict(extracted_data)
y_predicted