In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [2]:
# Dataset loading in pandas dataframe -> scaled & encoded data
dataset = pd.read_csv("lab_courses_2015.csv", sep=';')

In [3]:
# Checking data shape
row, col = dataset.shape
print(f'There are {row} rows and {col} columns') 
print(dataset.head(10))    

There are 216006 rows and 6 columns
   component  action  target  contextid  contextlevel  contextinstanceid
0          5      10       5     105728            50               1089
1         17      10       6     105844            70              67635
2          5      10       5     105728            50               1089
3          9      10      12     105734            70              67525
4          5      10       5     106176            50               1094
5          5      10       5     106176            50               1094
6          5      10       5     106176            50               1094
7         18      10       6     110766            70              70458
8          5      10       5     105728            50               1089
9         17      10       6     105844            70              67635


In [4]:
# To work on copy of the data
dataset_scaled = dataset.copy()

In [5]:
# Scaling the data to keep the different attributes in same range.
dataset_scaled[dataset_scaled.columns] = StandardScaler().fit_transform(dataset_scaled)
print(dataset_scaled.describe())

          component        action        target     contextid  contextlevel  \
count  2.160060e+05  2.160060e+05  2.160060e+05  2.160060e+05  2.160060e+05   
mean   4.825240e-13 -8.526593e-14 -4.700615e-14 -2.279710e-13  4.432454e-13   
std    1.000002e+00  1.000002e+00  1.000002e+00  1.000002e+00  1.000002e+00   
min   -1.634216e+00 -8.719580e+00 -1.488072e+00 -1.180162e+00 -1.083591e+00   
25%   -8.036008e-01  1.932715e-01 -3.624239e-01 -1.180162e+00 -1.083591e+00   
50%    2.701443e-02  1.932715e-01 -3.624239e-01  2.389723e-01  9.228571e-01   
75%    2.701443e-02  1.932715e-01  2.004002e-01  1.235776e+00  9.228571e-01   
max    2.103553e+00  1.932715e-01  2.733109e+00  1.442257e+00  9.228571e-01   

       contextinstanceid  
count       2.160060e+05  
mean       -2.905845e-13  
std         1.000002e+00  
min        -9.725989e-01  
25%        -9.725989e-01  
50%        -1.993837e-01  
75%         7.680869e-01  
max         1.670854e+00  


In [6]:
#1. Loading dataset
dataset_scaled.head(10)

Unnamed: 0,component,action,target,contextid,contextlevel,contextinstanceid
0,-0.803601,0.193271,-0.362424,1.235776,-1.083591,-0.933161
1,1.688245,0.193271,-0.081012,1.238426,0.922857,1.479027
2,-0.803601,0.193271,-0.362424,1.235776,-1.083591,-0.933161
3,0.027014,0.193271,1.607461,1.235913,0.922857,1.47504
4,-0.803601,0.193271,-0.362424,1.246013,-1.083591,-0.932979
5,-0.803601,0.193271,-0.362424,1.246013,-1.083591,-0.932979
6,-0.803601,0.193271,-0.362424,1.246013,-1.083591,-0.932979
7,1.895899,0.193271,-0.081012,1.350899,0.922857,1.581357
8,-0.803601,0.193271,-0.362424,1.235776,-1.083591,-0.933161
9,1.688245,0.193271,-0.081012,1.238426,0.922857,1.479027


In [7]:
# Reduce the dimensions of dataset with use of Principal Component Analysis (PCA).
pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit_transform(dataset_scaled)
print('Explained variation per principal component: {}'.format(pca_2.explained_variance_ratio_))
print('Cumulative variance explained by 2 principal components: {:.2%}'.format(
np.sum(pca_2.explained_variance_ratio_)))

Explained variation per principal component: [0.54952458 0.20794695]
Cumulative variance explained by 2 principal components: 75.75%


In [8]:
# Results from pca.components_
dataset_pca = pd.DataFrame(abs(pca_2.components_), columns=dataset_scaled.columns, index=['PC_1', 'PC_2'])
print('\n\n', dataset_pca)
    
print("\n*************** Most important features *************************")
print('As per PC 1:\n', (dataset_pca[dataset_pca > 0.3].iloc[0]).dropna())
print('\n\nAs per PC 2:\n', (dataset_pca[dataset_pca > 0.3].iloc[1]).dropna())
print("\n******************************************************************")

#return pca_2_result, pca_2



       component    action   target  contextid  contextlevel  contextinstanceid
PC_1   0.472440  0.093180  0.04890   0.470343      0.509519           0.533756
PC_2   0.317474  0.535157  0.75652   0.056370      0.192524           0.015838

*************** Most important features *************************
As per PC 1:
 component            0.472440
contextid            0.470343
contextlevel         0.509519
contextinstanceid    0.533756
Name: PC_1, dtype: float64


As per PC 2:
 component    0.317474
action       0.535157
target       0.756520
Name: PC_2, dtype: float64

******************************************************************


In [9]:
print(pca_2_result)

[[-0.88433664 -0.38598677]
 [ 2.61776072 -0.61625546]
 [-0.88433664 -0.38598677]
 ...
 [-2.04170584 -0.24917628]
 [ 0.5647941  -0.11993718]
 [ 1.40339115  6.57040537]]


In [10]:
print(pca_2)

PCA(n_components=2)


In [11]:
# Fitting KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(dataset_scaled)
cluster_map = pd.DataFrame()
cluster_map['data_index'] = dataset_scaled.index.values
cluster_map['cluster'] = kmeans.labels_
centroids = kmeans.cluster_centers_
centroids_pca = pca_2.transform(centroids)



In [12]:
cluster_map[cluster_map.cluster == 0]

Unnamed: 0,data_index,cluster
0,0,0
2,2,0
4,4,0
5,5,0
6,6,0
...,...,...
215997,215997,0
215998,215998,0
215999,215999,0
216002,216002,0


In [13]:
cluster_map[cluster_map.cluster == 0]

Unnamed: 0,data_index,cluster
0,0,0
2,2,0
4,4,0
5,5,0
6,6,0
...,...,...
215997,215997,0
215998,215998,0
215999,215999,0
216002,216002,0


In [None]:
cluster_map[cluster_map.cluster == 2]

In [None]:
cluster_map[cluster_map.cluster == 3]

In [None]:
# Visualizing the clusters
 # ------------------ Using Matplotlib for plotting-----------------------
x = pca_2_result[:, 0]
y = pca_2_result[:, 1] 

label = kmeans.labels_
        
plt.figure(figsize=(14,7), dpi=1500)
plt.scatter(x, y, c=label, alpha=0.6, s=200, edgecolors="black")  # plot different colors per cluster
plt.title('Student clusters')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')

plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='X', s=200, linewidths=1.5,color='red', edgecolors="black", lw=1.5)

plt.show()
plt.savefig('clusters.png')

In [None]:
# Nice Pythonic way to get the indices of the points for each corresponding cluster
mydict = {i: np.where(kmeans.labels_ == i)[0] for i in range(kmeans.n_clusters)}

# Transform this dictionary into list (if you need a list as result)
dictlist = []
for key, value in mydict.items():
    temp = [key,value]
    dictlist.append(temp)
    
print (*dictlist[0], sep="\n")

In [None]:
import pandas
pd = pandas.DataFrame(dictlist[0])
pd.to_csv("mylist1.csv")

In [None]:
print(*dictlist[0])