## Standardize the item response data

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
# https://scikit-learn.org/stable/auto_examples/classification/plot_lda_qda.html#sphx-glr-auto-examples-classification-plot-lda-qda-py

from sklearn.metrics import confusion_matrix

from funcs import *
from clustering import *
from matplotlib import pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Relabel clusters so that their order follows the clusters' mean value in the data.
def sort_labels(labels, data):
    """Relabel clusters according to their average value in `data`.

    For every cluster id in ``0 .. max(labels)``, the rows of `data` carrying
    that id are averaged along axis 0 and the result is averaged again to a
    single score.  Cluster ids are then ranked by that score from highest to
    lowest, and the ranking is handed to ``sort_categories_by_mean`` together
    with the original labels.

    Parameters
    ----------
    labels : array of int
        Cluster assignment for each row of `data`.
    data : array
        Values used to score the clusters; indexed with a boolean row mask.

    Returns
    -------
    The result of ``sort_categories_by_mean(labels, ranking)`` -- presumably
    the relabelled assignments; that helper is defined outside this notebook,
    so confirm its contract there.
    """
    cluster_count = np.max(labels) + 1

    cluster_scores = []
    for cluster_id in range(cluster_count):
        members = data[labels == cluster_id]
        per_column_mean = np.mean(members, axis=0)
        cluster_scores.append(np.mean(per_column_mean))

    # argsort is ascending; reverse it so the highest-scoring cluster comes first.
    descending_order = np.argsort(cluster_scores)[::-1]

    return sort_categories_by_mean(labels, descending_order)

In [3]:
# Read the pre-test spreadsheet, using its '번호' column as the row index.
data = pd.read_excel(
    'data/pretest_data.xlsx',
    index_col='번호',
)

In [4]:
# Drop the '성적/10' column and convert the remaining columns to a NumPy array.
response_data = np.array(
    data.drop(columns=['성적/10'])
)

In [5]:
# Z-score the responses along axis 0: subtract each column's mean and divide
# by its (population, ddof=0) standard deviation.
column_mean = np.mean(response_data, axis=0)
column_std = np.std(response_data, axis=0)

# A column with no spread (every row identical) has std == 0, which would turn
# the whole column into NaN (0/0) and break the PCA / LDA steps that consume
# `data_standardized`.  Dividing by 1 instead leaves such a column at all zeros.
column_std[column_std == 0] = 1.0

data_standardized = (response_data - column_mean) / column_std

In [6]:
# Mapping from component name to a list of integers -- presumably the item
# (question) numbers belonging to that component; confirm against the test sheet.
# NOTE(review): the key 'Conditional_Probablity' is misspelled, but it is a
# runtime key that other code may look up, so it is kept exactly as written.
items_for_each_component = {
    'Repetition': [1],
    'Probability': [2, 3],
    'Conditional_Probablity': [4, 5],
    'Random_Variable': [6, 7],
    'Probability_Distribution': [8, 9],
    'Statistical_Estimation': [10],
}

In [7]:
# plt.imshow(response_data, cmap='gray')
#data_standardized.shape

## Initialize using PCA & k-means

In [8]:
# Number of clusters
num_clusters = 4

# Iteration counter and label history; the labels produced by this cell are
# stored as the first entry of `all_labels`.
itr = 0
all_labels = []

# Reduce the standardized responses to 7 principal components.
fit_pca = PCA(n_components=7, random_state=0).fit_transform(data_standardized)
print(fit_pca.shape)

# Initial clustering: k-means on the PCA scores.
# n_init is given explicitly because scikit-learn changed its default
# (10 -> 'auto'); leaving it implicit makes the clustering depend on the
# installed version and triggers a FutureWarning on some releases.
km_pca = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(fit_pca)

# Pass the raw k-means ids through sort_labels, which ranks clusters by their
# mean value in the standardized data (highest first).
labels = sort_labels(km_pca.labels_, data_standardized)
print(labels.shape)

all_labels.append(labels)


(95, 7)
(95,)


## Update by iterating LDA and k-means

In [9]:
# Number of LDA -> k-means refinement rounds.
max_itr = 5

lda = LinearDiscriminantAnalysis()
# n_init is given explicitly because scikit-learn changed its default
# (10 -> 'auto'); leaving it implicit makes the clustering depend on the
# installed version and triggers a FutureWarning on some releases.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)

# NOTE(review): `all_labels` is not extended inside this loop -- confirm
# whether the per-iteration labels were meant to be appended to it.
for itr in range(1, max_itr+1):
    # 1. Fit LDA on the original (high-dimensional) data with the current labels.
    lda.fit(data_standardized, labels)

    # 2. Project the data onto the LDA coefficient vectors, after scaling each
    #    row of the coefficient matrix to unit Euclidean length.
    U = lda.coef_
    norm = np.sqrt(np.sum(U ** 2, axis=1, keepdims=True))
    U = U / norm

    projected = np.dot(data_standardized, U.T)

    # 3. Re-cluster in the projected space and relabel the clusters with
    #    sort_labels so ids are assigned the same way in every round.
    km_lda = km.fit(projected)

    labels_lda_km = sort_labels(km_lda.labels_, data_standardized)

    # Compare assignments: rows are the labels going into this round,
    # columns are the labels coming out of it.
    cm = confusion_matrix(labels, labels_lda_km)
    print(cm)

    labels = labels_lda_km

[[26  0  1  0]
 [ 1 16  0  0]
 [ 0  0 19  1]
 [ 0  2  1 28]]
[[24  0  1  2]
 [ 0 17  0  1]
 [ 0  0 21  0]
 [ 0  0  0 29]]
[[ 6 18  0  0]
 [ 5  0 12  0]
 [ 0 22  0  0]
 [ 0  0  0 32]]
[[11  0  0  0]
 [ 0 40  0  0]
 [ 0  0 12  0]
 [ 0  0  0 32]]
[[11  0  0  0]
 [ 0 40  0  0]
 [ 0  0 12  0]
 [ 0  0  0 32]]


## [TODO] Visualize with `labels`