Features used for clustering: imd_band, age_band, edu, adjusted_mark

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler


# Read the combined dataset into the DataFrame used by every later cell.
csv_path = 'data/combined-data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (use data.shape to check the dimensions).
data.head(n=5)

In [None]:
# Pre-processing: encode the categorical columns as ordinal numbers.
# Series.map() turns every value that is absent from a mapping into NaN, so
# unexpected categories are removed by the dropna() cell further down.

# age_band -> 1..3
age_band_codes = {"0-35": 1, "35-55": 2, "55<=": 3}
data["age_band"] = data["age_band"].map(age_band_codes)

# edu -> 0/1
edu_codes = {"Level or lower": 0, "HE or higher": 1}
data["edu"] = data["edu"].map(edu_codes)

# imd_band -> 0..9 (one code per 10% band)
imd_band_codes = {
    # "01/10/2020" is not a valid band. It must become NaN, not '': an empty
    # string is not treated as missing by dropna(), makes the column object
    # dtype, and breaks the numeric steps (KMeans, MinMaxScaler) later on.
    # NOTE(review): this value looks like a "10-20" band that a spreadsheet
    # auto-converted to a date; if so it should map to 1 instead of being
    # discarded -- confirm against the raw data.
    "01/10/2020": np.nan,
    "0-10%": 0,
    "10-20%": 1,
    "20-30%": 2,
    "30-40%": 3,
    "40-50%": 4,
    "50-60%": 5,
    "60-70%": 6,
    "70-80%": 7,
    "80-90%": 8,
    "90-100%": 9,
}
data["imd_band"] = data["imd_band"].map(imd_band_codes)

In [None]:
# Snapshot the encoded table (row index included) for inspection.
data.to_csv('processed-data.csv')

In [None]:
# Drop every row that still contains a missing value (NaN) in any column,
# modifying `data` in place; this includes rows whose category was not
# covered by the mappings applied in the pre-processing cell.
data.dropna(inplace=True)
# Show the (rows, columns) shape that remains after the drop.
data.shape

In [None]:
# Snapshot the table after rows with missing values were removed.
data.to_csv('dropnull-data.csv')

In [None]:
# Extract the four clustering features by column name instead of by position,
# so the selection keeps working if columns are added or reordered in the CSV.
# (Assumes positions 5, 9, 10 and 15 were exactly these columns, as the
# original comment stated.)
feature_columns = ['imd_band', 'age_band', 'edu', 'adjusted_mark']
X = data[feature_columns]
# X.shape
X.head()

In [None]:
# Elbow method: fit k-means for k = 1..10 and plot the inertia (within-cluster
# sum of squared distances) against k to choose the number of clusters.
# NOTE(review): X is extracted before the min-max scaling cell runs, while the
# final clustering is fitted on the scaled columns -- consider scaling first.
see = []
for i in range(1, 11):
    # A fixed seed and an explicit n_init keep the curve identical between
    # runs and across scikit-learn versions (the n_init default changed in 1.4).
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(X)
    see.append(kmeans.inertia_)
plt.plot(range(1, 11), see)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared error')
# plt.show()
# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('image', exist_ok=True)
plt.savefig('image/elbow.png')


In [None]:
# Use k = 3 clusters: build the (not yet fitted) estimator and show its settings.
n_clusters = 3
km = KMeans(n_clusters=n_clusters)
print(km)

In [None]:
# Rescale each clustering feature with min-max scaling, one column at a time.
# The single scaler instance is re-fitted for every column, so after the loop
# it holds the parameters of the last column ('adjusted_mark') only.
scaler = MinMaxScaler()

for column in ('imd_band', 'age_band', 'edu', 'adjusted_mark'):
    single_column = data[[column]]
    data[column] = scaler.fit(single_column).transform(single_column)

In [None]:
# Snapshot the table after min-max scaling of the feature columns.
data.to_csv('minmaxscaled-data.csv')

In [None]:
# Overlay three scatter series on one axes: each feature column plotted
# against the adjusted mark, distinguished by colour and legend entry.
fig, ax = plt.subplots()
series_specs = [
    ('imd_band', 'blue', 'the Index of Multiple Deprivation'),
    ('age_band', 'red', 'Age'),
    ('edu', 'yellow', 'Previous Education'),
]
for feature, colour, legend_text in series_specs:
    ax.scatter(data[feature], data['adjusted_mark'], color=colour, label=legend_text)
ax.legend()
ax.set_xlabel('X')
ax.set_ylabel('final mark')
plt.show()

In [None]:
# Cluster the rows on the four features (min-max scaled by the earlier cell).
# Columns are selected by name rather than by position so the selection does
# not break if the CSV layout changes. (Assumes positions 5, 9, 10 and 15
# were exactly these columns, as the original comment stated.)
feature_columns = ['imd_band', 'age_band', 'edu', 'adjusted_mark']
predict_data = data[feature_columns]
# A fixed seed and an explicit n_init make the cluster labels written to disk
# reproducible between runs and across scikit-learn versions.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
y_predicted = km.fit_predict(predict_data)
y_predicted

In [None]:
# Attach each row's cluster label as a new column and preview the result.
data["cluster"] = y_predicted
data.head(n=5)

In [None]:
# Write the final table, including the cluster column, to disk.
data.to_csv('clustered-data.csv')