# Prüfungsleitung Maschnielles Lernen: Gruppe 8

Isolet download: https://datahub.io/machine-learning/isolet

### Aufgabe 1

#### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from skfeature.function.similarity_based import fisher_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
# import
df_raw_isolet = pd.read_csv("./raw_isolet.csv", delimiter=",")

# changing column name 'class' to 'letter' 'cause class is a reservated keyword in python 
df_raw_isolet.rename(columns={'class': 'letter'}, inplace=True)
# removing char ' from letter
df_raw_isolet['letter'] = df_raw_isolet['letter'].replace({'\'': ''}, regex=True)

X = df_raw_isolet.iloc[:, :617]
y = df_raw_isolet.iloc[:, 617]

##### Einfache Datenbetrachtung

In [3]:
print("Einträge in isolet.csv", len(df_raw_isolet))
print("Gruppierte Einträge:", df_raw_isolet.groupby(['letter']).size())

Einträge in isolet.csv 7797
Gruppierte Einträge: letter
1     300
10    300
11    300
12    300
13    299
14    300
15    300
16    300
17    300
18    300
19    300
2     300
20    300
21    300
22    300
23    300
24    300
25    300
26    300
3     300
4     300
5     300
6     298
7     300
8     300
9     300
dtype: int64


Bei der Betrachtung der Anzahl ist aufällig, dass in der Regel pro Buchstabe 300 Einträge aufgezeichnet wurden. Da aber len(df) nur 7797 Einträge beinhaltet, gibt es einige Ausnahmen. <br>
Ausnahmen: Buchstabe 13 mit 299 Einträgen und Buchstabe 6 mit 298 Einträgen <br><br>
Insgesamt bietet die sehr ausgewogene Verteilung der jeweiligen Buchstaben eine sehr gute Basis dies in dem Data Splitting zu betrachten. So wird für jeden Buchstaben ein Testsatz von je 75 Einträge bestimmt. Formel: ⎡Anzahl Einträge pro Buchstabe * 0,25⎤

##### Data Splitting

In [4]:
# df_training = df_raw_isolet.copy()
# df_test = pd.DataFrame(data=None, columns=df_raw_isolet.columns)
# numberOfTestEntries = 75

# for i in range(26):
#     df_test = pd.concat([df_test, df_training[df_training.letter.str.match("^" + str(i+1) + "$")].head(numberOfTestEntries)])
# df_training.drop(df_test.index, inplace=True)

# # check
# print("Number of rows in df_raw_isolet:", len(df_raw_isolet))
# print("Number of rows in df_training:", len(df_training))
# print("Number of rows in df_test", len(df_test))
# if len(df_training) + len(df_test) == len(df_raw_isolet):
#     print("OK: Check passed!")
# else: 
#     print("WARNING: The number of lines do not match! Diffrence:", len(df_training) + len(df_test) - len(df_raw_isolet))    


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.25, shuffle=False)

#### Merkmalstransformation - Principal Component Analysis

In [6]:
pca = PCA(n_components=100)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)

#### Merkmalsselektion - Fisher Score

In [7]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

rank_idx = fisher_score.fisher_score(X_train, y_train, mode='rank')

In [8]:
num_features = 100
selected_features_train = X_train[:, rank_idx[:num_features]]
selected_features_test = X_test[:, rank_idx[:num_features]]

#### Random Forest Classifier - Principal Component Analysis

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

#Create a Gaussian Classifier
rfc_pca = RandomForestClassifier()
rfc_pca.fit(X_train_pca, y_train)

y_pred_rfc_pca = rfc_pca.predict(pca.transform(X_test))

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rfc_pca))

Accuracy: 0.9061538461538462


#### Random Forest Classifier - Fisher Score

In [10]:
#Create a Gaussian Classifier
rfc_fs = RandomForestClassifier()
rfc_fs.fit(selected_features_train, y_train)

y_pred_rfc_fs = rfc_fs.predict(X_test[:, rank_idx[:num_features]])

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rfc_fs))

Accuracy: 0.8635897435897436


#### k-Means Clustering - Principal Component Analysis

In [11]:
kmeans_pca = KMeans(n_clusters=26, random_state=42)
kmeans_pca.fit(X_train_pca)
y_pred_kmeans_pca = kmeans_pca.predict(pca.transform(X_test))

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test.astype(np.int0), y_pred_kmeans_pca))

Accuracy: 0.04051282051282051


#### k-Means Clustering - Fisher Score

In [12]:
kmeans_fs = KMeans(n_clusters=26, random_state=42)
kmeans_fs.fit(selected_features_train)
y_pred_kmeans_fs = kmeans_pca.predict(X_test[:, rank_idx[:num_features]])

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test.astype(np.int0), y_pred_kmeans_fs))

Accuracy: 0.038974358974358976


#### 3-fold cv - Random Forest Classifier

In [13]:
scores = []

score = cross_val_score(RandomForestClassifier(), X_train_pca, y_train, cv=3)
scores.append(score)

In [14]:
score = cross_val_score(RandomForestClassifier(), selected_features_train, y_train, cv=3)
scores.append(score)

#### 3-fold cv - k-Means Clustering

In [15]:
score = cross_val_score(KMeans(n_clusters=26, random_state=42), X_train_pca, y_train, cv=3)
scores.append(score)

In [16]:
score = cross_val_score(KMeans(n_clusters=26, random_state=42), selected_features_train, y_train, cv=3)
scores.append(score)

In [17]:
print(f'Accuracy mean of Random Forest with Principal Component Analysis: {np.mean(scores[0])}')
print(f'Accuracy mean of Random Forest with Fisher Score: {np.mean(scores[1])}')
print(f'Accuracy mean of k-Means Clustering with Principal Component Analysis: {np.mean(scores[2])}')
print(f'Accuracy mean of k-Means Clustering with Fisher Score: {np.mean(scores[3])}')

Accuracy mean of Random Forest with Principal Component Analysis: 0.9047374722079699
Accuracy mean of Random Forest with Fisher Score: 0.86198050282196
Accuracy mean of k-Means Clustering with Principal Component Analysis: -93065.45257559053
Accuracy mean of k-Means Clustering with Fisher Score: -17828.58741055869


#### 3) Implementierung der Klassifikatoren

#### 4) Evaluation