In [1]:
from Features import FeatureColumns
from TrainTestData import loadData
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

import numpy as np
import pandas as pd
import multiprocessing

In [2]:
# Unpack the eight values returned by the project helper loadData():
# the train/test feature and target splits, the `data` frame, and the
# column metadata (feature columns, categorical features, target column).
(
    X_train,
    X_test,
    y_train,
    y_test,
    data,
    feature_columns,
    categorical_features,
    target_column,
) = loadData()

# Plain-text dump of `data` as a quick visual sanity check.
print(data)

                      Dates       Category  \
299931  2005-09-23 03:22:00  DRUG/NARCOTIC   
6964    2015-03-04 18:30:00  LARCENY/THEFT   
231301  2007-11-18 07:55:00  DRUG/NARCOTIC   
70930   2013-04-30 08:00:00  VEHICLE THEFT   
205172  2008-10-01 12:15:00        ASSAULT   
...                     ...            ...   
88696   2012-10-16 17:30:00        ASSAULT   
80620   2013-01-11 00:05:00  LARCENY/THEFT   
340215  2004-07-23 20:00:00  LARCENY/THEFT   
373466  2003-08-18 22:00:00  LARCENY/THEFT   
238195  2007-09-05 01:41:00  DRUG/NARCOTIC   

                                     Descript  DayOfWeek PdDistrict  \
299931        POSSESSION OF COCAINE FOR SALES     Friday  INGLESIDE   
6964             GRAND THEFT FROM LOCKED AUTO  Wednesday   SOUTHERN   
231301        POSSESSION OF BASE/ROCK COCAINE     Sunday    MISSION   
70930                       STOLEN AUTOMOBILE    Tuesday    MISSION   
205172                                BATTERY  Wednesday   NORTHERN   
...                  

In [3]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused unchanged for the test split, so no
# information from the test data enters the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Encode the target labels as integers.
# The label -> integer mapping is learned from the training targets only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
# Reuse the fitted mapping for the test targets. Calling fit_transform here
# would refit the encoder on y_test and could assign different integers to
# the same class (e.g. when a class is absent from the test split), which
# silently corrupts every metric computed against y_test_encoded.
# transform() raises a ValueError if y_test contains a label never seen in
# y_train, which is the desired loud failure in that situation.
y_test_encoded = label_encoder.transform(y_test)

In [5]:
# The input dimensionality equals the number of features (in our case, X features).
#
# Passing n_components=0.95 to sklearn.decomposition.PCA means the number of
# principal components is chosen so that at least 95% of the variance of the
# original data is retained. This is a form of dimensionality reduction in
# which no fixed number of components is specified; instead, the minimal number
# of components needed to capture most of the data's variance is used.
explained_variance_target = 0.95
pca = PCA(n_components=explained_variance_target)

# Learn the projection on the scaled training data, then apply the same
# projection to the scaled test data.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(X_test_pca)

[[-2.17811818  0.49998378 -1.82548336 ... -0.21499777  0.05369218
   0.26723836]
 [ 2.3615514   0.25952073 -0.53229809 ...  0.06943277  0.14961234
  -0.11593392]
 [ 0.12710737 -0.4799879  -1.68518251 ...  0.30584022  0.67390384
  -0.50114048]
 ...
 [ 1.30843094 -1.08181308  0.44564086 ...  0.56242773  0.02878945
   0.22085783]
 [ 2.41584946  0.04438995  0.59736235 ...  0.17658566 -0.20993315
   0.41625992]
 [-1.89651347  3.5051379   0.50090224 ... -0.4717195   0.11344314
   0.01869985]]


In [None]:
# Kernel: linear
# probability=True makes predict_proba available, which the 'roc_auc_ovr'
# scorer below relies on. Probability estimation runs an internal, shuffled
# cross-validation, so random_state is pinned to keep the scores reproducible
# across runs. gamma is omitted because it is a coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels only and is not used by a linear kernel.
svm_clf_linear = SVC(kernel='linear', probability=True, random_state=42)

# Number of cross-validation folds.
k = 4
# Parallelize across all available CPU cores.
n_jobs = multiprocessing.cpu_count()

# Metrics evaluated on every fold (display name -> scikit-learn scorer name).
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'roc_auc_ovr': 'roc_auc_ovr'
}

# Cross-validate on the scaled training data with several metrics at once.
cv_results = cross_validate(
    svm_clf_linear,
    X_train_scaled,
    y_train_encoded,
    cv=k,
    scoring=scoring,
    return_train_score=False,
    n_jobs=n_jobs,
)

# Report mean and standard deviation of each metric over the folds, in percent.
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"Durchschnittliche {metric}: {np.mean(fold_scores) * 100:.2f}%")
    print(f"Standardabweichung {metric}: {np.std(fold_scores) * 100:.2f}%")

In [None]:
# Train the linear-kernel SVM on the scaled training data and predict the
# held-out test split.
svm_clf_linear.fit(X_train_scaled, y_train_encoded)
y_pred_svm_linear = svm_clf_linear.predict(X_test_scaled)

# Accuracy needs no averaging; precision, recall and F1 are averaged over the
# classes with weights proportional to each class's support.
accuracy_svm_linear = accuracy_score(y_test_encoded, y_pred_svm_linear)
f1_svm_linear, precision_svm_linear, recall_svm_linear = (
    weighted_metric(y_test_encoded, y_pred_svm_linear, average='weighted')
    for weighted_metric in (f1_score, precision_score, recall_score)
)

print("Metrics for linear kernel:")
print(f"Accuracy: {accuracy_svm_linear:.4f}, F1-Score: {f1_svm_linear:.4f}, Precision: {precision_svm_linear:.4f}, Recall: {recall_svm_linear:.4f}\n")

In [None]:
# Kernel: poly
# Train and evaluate on the standardized features (X_train_scaled /
# X_test_scaled), the same inputs the linear-kernel model uses. SVMs are not
# scale-invariant, so fitting on the raw features would make this kernel's
# scores incomparable with the linear kernel's and typically worse.
svm_clf_poly = SVC(kernel='poly')
svm_clf_poly.fit(X_train_scaled, y_train)
y_pred_svm_poly = svm_clf_poly.predict(X_test_scaled)

# Accuracy plus support-weighted F1, precision and recall on the test split.
accuracy_svm_poly = accuracy_score(y_test, y_pred_svm_poly)
f1_svm_poly = f1_score(y_test, y_pred_svm_poly, average='weighted')
precision_svm_poly = precision_score(y_test, y_pred_svm_poly, average='weighted')
recall_svm_poly = recall_score(y_test, y_pred_svm_poly, average='weighted')

# Output
print("Metrics for poly kernel:")
print(f"Accuracy: {accuracy_svm_poly:.4f}, F1-Score: {f1_svm_poly:.4f}, Precision: {precision_svm_poly:.4f}, Recall: {recall_svm_poly:.4f}\n")

In [2]:
# Kernel: rbf
# Train and evaluate on the standardized features (X_train_scaled /
# X_test_scaled), the same inputs the linear-kernel model uses. The RBF kernel
# is distance-based, so unscaled features with large ranges would dominate
# the kernel value and make the comparison between kernels unfair.
svm_clf_rbf = SVC(kernel='rbf')
svm_clf_rbf.fit(X_train_scaled, y_train)
y_pred_svm_rbf = svm_clf_rbf.predict(X_test_scaled)

# Accuracy plus support-weighted F1, precision and recall on the test split.
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
f1_svm_rbf = f1_score(y_test, y_pred_svm_rbf, average='weighted')
precision_svm_rbf = precision_score(y_test, y_pred_svm_rbf, average='weighted')
recall_svm_rbf = recall_score(y_test, y_pred_svm_rbf, average='weighted')

# Output
print("Metrics for rbf kernel:")
print(f"Accuracy: {accuracy_svm_rbf:.4f}, F1-Score: {f1_svm_rbf:.4f}, Precision: {precision_svm_rbf:.4f}, Recall: {recall_svm_rbf:.4f}\n")

NameError: name 'SVC' is not defined

In [None]:
# Kernel: sigmoid
# Train and evaluate on the standardized features (X_train_scaled /
# X_test_scaled), the same inputs the linear-kernel model uses. SVMs are not
# scale-invariant, so fitting on the raw features would make this kernel's
# scores incomparable with the other kernels'.
svm_clf_sigmoid = SVC(kernel='sigmoid')
svm_clf_sigmoid.fit(X_train_scaled, y_train)
y_pred_svm_sigmoid = svm_clf_sigmoid.predict(X_test_scaled)

# Accuracy plus support-weighted F1, precision and recall on the test split.
accuracy_svm_sigmoid = accuracy_score(y_test, y_pred_svm_sigmoid)
f1_svm_sigmoid = f1_score(y_test, y_pred_svm_sigmoid, average='weighted')
precision_svm_sigmoid = precision_score(y_test, y_pred_svm_sigmoid, average='weighted')
recall_svm_sigmoid = recall_score(y_test, y_pred_svm_sigmoid, average='weighted')

# Output
print("Metrics for sigmoid kernel:")
print(f"Accuracy: {accuracy_svm_sigmoid:.4f}, F1-Score: {f1_svm_sigmoid:.4f}, Precision: {precision_svm_sigmoid:.4f}, Recall: {recall_svm_sigmoid:.4f}\n")