# Prüfungsleitung Maschnielles Lernen: Gruppe 8

Isolet download: https://datahub.io/machine-learning/isolet

### Aufgabe 1

#### Imports

In [86]:
# Import isolet.csv File
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from skfeature.function.similarity_based import fisher_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [52]:
# import
df_raw_isolet = pd.read_csv("./raw_isolet.csv", delimiter=",")

# changing column name 'class' to 'letter' 'cause class is a reservated keyword in python 
df_raw_isolet.rename(columns={'class': 'letter'}, inplace=True)
# removing char ' from letter
df_raw_isolet['letter'] = df_raw_isolet['letter'].replace({'\'': ''}, regex=True)

df_raw_isolet_subset = df_raw_isolet.iloc[:2000]

X_subset = df_raw_isolet_subset.iloc[:, :617]
y_subset = df_raw_isolet_subset.iloc[:, 617]

X = df_raw_isolet.iloc[:, :617]
y = df_raw_isolet.iloc[:, 617]

##### Einfache Datenbetrachtung

In [53]:
print("Einträge in isolet.csv", len(df_raw_isolet))
print("Gruppierte Einträge:", df_raw_isolet.groupby(['letter']).size())

Einträge in isolet.csv 7797
Gruppierte Einträge: letter
1     300
10    300
11    300
12    300
13    299
14    300
15    300
16    300
17    300
18    300
19    300
2     300
20    300
21    300
22    300
23    300
24    300
25    300
26    300
3     300
4     300
5     300
6     298
7     300
8     300
9     300
dtype: int64


Bei der Betrachtung der Anzahl ist aufällig, dass in der Regel pro Buchstabe 300 Einträge aufgezeichnet wurden. Da aber len(df) nur 7797 Einträge beinhaltet, gibt es einige Ausnahmen. <br>
Ausnahmen: Buchstabe 13 mit 299 Einträgen und Buchstabe 6 mit 298 Einträgen <br><br>
Insgesamt bietet die sehr ausgewogene Verteilung der jeweiligen Buchstaben eine sehr gute Basis dies in dem Data Splitting zu betrachten. So wird für jeden Buchstaben ein Testsatz von je 75 Einträge bestimmt. Formel: ⎡Anzahl Einträge pro Buchstabe * 0,25⎤

##### Data Splitting

In [54]:
df_training = df_raw_isolet.copy()
df_test = pd.DataFrame(data=None, columns=df_raw_isolet.columns)
numberOfTestEntries = 75

for i in range(26):
    df_test = pd.concat([df_test, df_training[df_training.letter.str.match("^" + str(i+1) + "$")].head(numberOfTestEntries)])
df_training.drop(df_test.index, inplace=True)

# check
print("Number of rows in df_raw_isolet:", len(df_raw_isolet))
print("Number of rows in df_training:", len(df_training))
print("Number of rows in df_test", len(df_test))
if len(df_training) + len(df_test) == len(df_raw_isolet):
    print("OK: Check passed!")
else: 
    print("WARNING: The number of lines do not match! Diffrence:", len(df_training) + len(df_test) - len(df_raw_isolet))    


Number of rows in df_raw_isolet: 7797
Number of rows in df_training: 5847
Number of rows in df_test 1950
OK: Check passed!


In [55]:
# X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=.25, shuffle=False)

X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, random_state=42, test_size=.25, shuffle=False)

In [56]:
# heatmap can not be pushed to git. calculate it on your local machine

# plt.figure(figsize=(150, 100))
# fig = sns.heatmap(X_train.corr(), annot=True, cmap=sns.cm.rocket_r)

#### Merkmalstransformation - Principal Component Analysis

In [102]:
pca = PCA(n_components=100)
pca.fit(X_train)

#### Merkmalsselektion - Fisher Score

In [58]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_test = X_test.to_numpy()
y_test = y_test.to_numpy()


rank_idx = fisher_score.fisher_score(X_train, y_train, mode='rank')
rank_idx

# n_samples, n_features = np.shape(X_train)

# print(n_samples, n_features)

# # print(np.shape(X_train))
# print(np.shape(y_train))
# X_train.to_numpy()

array([199, 198, 200, 222, 201, 145, 512, 513, 481, 223, 482, 545, 221,
       511, 514,  33, 228, 480, 543, 544, 155, 227, 450, 546, 510, 322,
       229, 202, 479, 577, 321, 542, 449, 509, 451, 578, 483, 478, 576,
       224, 156, 575, 320, 226, 609, 607, 157, 515, 158, 508, 401, 608,
       159, 160, 164, 161, 402, 162, 163, 225, 203, 319, 448, 477, 541,
       442,  69, 547, 443, 610,  68, 444, 445, 507, 418, 441, 476, 433,
       102, 323, 474, 447, 579, 432, 101, 446, 146, 411, 606, 475, 400,
       574,  67, 611, 431, 412, 540, 506, 318, 434, 473, 413, 100, 419,
       417, 352, 220, 351, 399, 230, 410, 354, 353, 430, 440, 416, 539,
       403, 505,  99, 605, 386, 409,  70, 573, 383, 350, 355, 384, 317,
       133, 538, 191, 387, 537, 204, 385, 463, 414, 315, 314, 464, 147,
       465, 316, 462, 472, 205, 132, 572, 232, 398, 382, 313, 415, 233,
       255, 134, 536, 429, 439, 206, 570, 569, 504, 231, 131, 571, 604,
       148, 169, 435,  66,  26,  98, 466, 568, 192,  27, 103, 23

In [59]:
num_features = 100
selected_features_train = X_subset[:, rank_idx[:num_features]]
selected_features_test = X_test[:, rank_idx[:num_features]]

print(selected_features_train)

[[-0.381  -0.4842 -0.639  ...  0.1694  0.114  -0.9714]
 [-0.4308 -0.4414 -0.4628 ...  0.5732  0.4666 -1.    ]
 [-0.4552 -0.5484 -0.8028 ...  0.7552  0.6666 -0.9142]
 ...
 [ 0.4374  0.5838  0.133  ...  0.5468  0.5834 -0.6572]
 [-0.4526 -0.2548 -0.2884 ...  0.0748  0.1956 -1.    ]
 [-0.088  -0.3368 -0.1244 ...  0.803   0.34   -1.    ]]


#### K-Means Clustering

In [78]:
kmeans = KMeans(n_clusters=26, random_state=42).fit(X.to_numpy())
print(kmeans.labels_)
print(kmeans.cluster_centers_)

[ 7 25 20 ... 15 12 12]
[[-0.4086649   0.09462252  0.40381854 ...  0.16115894  0.05166093
  -0.3559404 ]
 [-0.3159136   0.22703867  0.42915227 ...  0.13672508  0.04475287
  -0.32242719]
 [-0.518315   -0.05362111  0.10804944 ...  0.72556611  0.63081444
   0.29487667]
 ...
 [-0.24440303  0.28253636  0.58490909 ...  0.19349242  0.03912273
  -0.34466061]
 [-0.08386934  0.47734745  0.46792847 ...  0.18338613  0.0646854
  -0.36468248]
 [-0.46768812  0.03074554  0.25454752 ...  0.09820693 -0.00815149
  -0.34132871]]


#### Random Forest Classifier

In [107]:
scores = []

score = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=3)
scores.append(score)

In [108]:
score = cross_val_score(RandomForestClassifier(), selected_features_train, y_train, cv=3)
scores.append(score)

In [109]:
score = cross_val_score(RandomForestClassifier(), pca.transform(X_train), y_train, cv=3)
scores.append(score)

In [110]:
scores

[array([0.878, 0.936, 0.942]),
 array([0.792, 0.81 , 0.874]),
 array([0.86 , 0.896, 0.912])]

#### 3) Implementierung der Klassifikatoren

#### 4) Evaluation