## Import Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Show up to 50 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 50)

In [None]:
import os

# Print the path of every file found beneath the current working directory.
for root, _subdirs, files in os.walk("."):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# Load the evaluation CSV from the working directory and preview the first rows.
csv_path = "turkiye-student-evaluation_generic.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summary statistics for the DataFrame's columns (pandas `describe`).
df.describe()

In [None]:
# Column dtypes and non-null counts (pandas `info`).
df.info()

## Preprocessing the Dataset

In [None]:
# Count the null values in each column.
df.isnull().sum()

## Exploratory Data Analysis

In [None]:
# Per-column averages of the question block, then the mean of those averages.
# NOTE(review): presumably columns 5..32 hold the question scores — confirm
# against the CSV header.
question_cols = slice(5, 33)
x_questions = df.iloc[:, question_cols]
q_mean = x_questions.mean(axis="index")
total_mean = q_mean.mean()

In [None]:
# Turn the per-question Series into a frame with columns 'index' and 'mean'
# so it can be passed to the bar plot.
q_mean = q_mean.to_frame(name='mean').reset_index()
q_mean.head()

In [None]:
# Display the mean of the per-question means.
total_mean

In [None]:
# Bar chart of the mean score per question column.
plt.figure(figsize=(14, 7))
sns.barplot(data=q_mean, x='index', y='mean')

## Correlation Matrix

In [None]:
# Pairwise correlation of the DataFrame's columns, drawn as an annotated heatmap.
plt.figure(figsize=(18, 18))
corr = df.corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm')

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Same column slice as the question-mean computation in the EDA section.
X = df.iloc[:, 5:33]

# Reduce the selected columns to two principal components.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

X_pca

In [None]:
# Share of the total variance captured by the first two principal components.
variance_ratio = pca.explained_variance_ratio_
variance_ratio[0] + variance_ratio[1]

## Model Training

In [None]:
from sklearn.cluster import KMeans

# elbow method: record the inertia of a KMeans fit for each k in 1..5
cluster_range = range(1, 6)
distortions = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(X_pca).inertia_
    for k in cluster_range
]

plt.plot(cluster_range, distortions, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Distortions")
plt.show()

In [None]:
# Fit KMeans with 3 clusters (presumably the value read off the elbow plot)
# and assign a cluster label to every row of the PCA projection.
model = KMeans(n_clusters=3, init='k-means++', random_state=42).fit(X_pca)
y = model.predict(X_pca)

In [None]:
# One scatter layer per cluster label, then the fitted centroids on top.
cluster_colors = ('red', 'yellow', 'green')
for cluster_id, color in enumerate(cluster_colors):
    members = X_pca[y == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=color,
                label=f'cluster {cluster_id + 1}')

centers = model.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=100, c='blue', label='centroids')
plt.title('Cluster of students')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.legend()
plt.show()

In [None]:
from collections import Counter

# Cluster sizes for the labels currently in `y` (the KMeans fit on the PCA
# projection when the cells are run in order). Printed explicitly: a bare
# expression in the middle of a cell is evaluated but never displayed.
print(Counter(y))

# Refit KMeans on the full matrix X (no PCA) so its cluster sizes can be
# compared with the ones printed above. This rebinds `model` and `y`.
model = KMeans(n_clusters=3, init='k-means++', random_state=42)
model.fit(X)
y = model.predict(X)

# Last expression of the cell: shown as the cell output.
Counter(y)

In [None]:
# dendrogram
import scipy.cluster.hierarchy as hier

# Ward linkage over the rows of the PCA projection. Each leaf of the tree is
# one row of X_pca (one observation), not a question column, so the x-axis
# is labelled to match the 'Cluster of students' scatter plots.
linkage_matrix = hier.linkage(X_pca, method='ward')
dendogram = hier.dendrogram(linkage_matrix)  # variable name kept unchanged (sic)
plt.title('Dendrogram')
plt.xlabel('Students')
plt.ylabel('Distance')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Two-cluster Ward agglomerative clustering on the PCA projection.
model = AgglomerativeClustering(linkage='ward', n_clusters=2)
y = model.fit_predict(X_pca)

# One scatter layer per cluster label.
for cluster_id, color in enumerate(('red', 'yellow')):
    members = X_pca[y == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=color,
                label=f'cluster {cluster_id + 1}')
plt.title('Cluster of students')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.legend()
plt.show()

In [None]:
# Cluster sizes for the labels currently in `y` (the agglomerative labels
# when the cells are run in order).
Counter(y)