## Maksymilian Drzezdzon C15311966 Exam

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, learning_curve, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer, cohen_kappa_score
from statistics import mean

from sklearn.utils.multiclass import unique_labels

from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier 

# Load Data

In [None]:

# Read the comma-separated data set into a DataFrame.
train_data = pd.read_csv('../data.csv', sep=',')

# Separate the target from the features. The tuple is evaluated left to
# right, so 'Class' is popped out of `train_data` (in place) before the
# remaining feature columns are converted to an array.
labels, data = train_data.pop('Class').values, train_data.values

print('Data load complete')

# Data Preprocessing

In [None]:
# Hold out 30% of the rows for evaluation. The seed is fixed at 104, the same
# value the later evaluation cell uses when it re-creates this split, so the
# rows scored there are rows the grid search never trained on.
# (The original passed the undefined name `xxx`, which raises NameError.)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=104)

# Feature scaling is currently disabled. If re-enabled, the scaler is fitted
# on the training rows only and then applied unchanged to the test rows.
#sc = StandardScaler()

#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

# Param Grid Search

In [None]:
# Hyper-parameter grid for a polynomial-kernel SVC.
#
# `random_state` only seeds the data shuffling used for probability
# estimates and is ignored while `probability=False` (the SVC default, which
# is what is used here). Sweeping 18 seeds therefore repeated every fit 18
# times without changing any score. A single value keeps the key present in
# `best_params_` (ties were already resolved to the first seed, 0) and
# reduces the search to 3 * 4 * 10 = 120 candidates.
param_grid = [
    {
        'C': [1, 10, 100],
        'kernel': ['poly'],
        'gamma': [0.001, 0.0001, 'scale', 'auto'],
        'random_state': [0],
        'cache_size': [200],
        'coef0': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    }
]

# Exhaustive search over the grid, ranking candidates by accuracy.
clf = svm.SVC()
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy')
# `fit` returns the search object itself, so `svm_gs` and `grid_search`
# refer to the same fitted GridSearchCV.
svm_gs = grid_search.fit(X_train, y_train)

# Report the best mean cross-validated accuracy and the parameters behind it.
best_acc = grid_search.best_score_
best_params = grid_search.best_params_
print("best acc ", round(best_acc*100, 3))
print("best params ", best_params)

# Evaluation Metrics

In [None]:
####################
# Model Evaluation #
####################
# Score the tuned model on the hold-out rows first: every metric below reads
# `m_svm`, which the original cell only assigned on its very last line.
m_svm = svm_gs.predict(X_test)

print('Model Evaluation')
# Accuracy: fraction of hold-out samples whose predicted label is correct.
print("Model Accuracy:", round(metrics.accuracy_score(y_test, m_svm), 3))
# Precision: of the samples predicted positive, the fraction that truly are.
print("Model Precision:", round(metrics.precision_score(y_test, m_svm), 3))
# Recall: of the truly positive samples, the fraction predicted positive.
print("Model Recall:", round(metrics.recall_score(y_test, m_svm), 3))
# Macro F1: per-label F1 averaged without weighting, so label imbalance is
# not taken into account.
print('F1 Score: ', round(f1_score(y_test, m_svm, average="macro"), 3))

# Cohen's kappa: agreement between predictions and truth corrected for chance.
print('Cohens Kappa :', round(cohen_kappa_score(y_test , m_svm), 3))
# Per-class precision, recall, F1 and support in one table.
print("Classification Report :\n", classification_report(y_test,m_svm))

# 10-fold cross-validation over the full data set. `svm_gs` is the
# GridSearchCV object, so the whole grid search is re-run inside each fold
# (nested cross-validation); expect this to be slow.
cvs = cross_val_score(svm_gs, data, labels, cv=10)

print('K-fold Cross Validation scores:')
print('Max Score: ', round(max(cvs), 3))
print('Min Score: ', round(min(cvs), 3))
print('Mean Score :', round(mean(cvs), 3))

##############################
# Testing model on test data #
##############################
# The original also called train_test_split on `test_data_df` here and stored
# the two partitions in X_test / y_test. Nothing read them, and overwriting
# the hold-out split would leave the metrics above comparing mismatched
# arrays if this cell were run again, so that call has been dropped.
# NOTE(review): `test_data_df` is not created in any cell visible here; it
# must be loaded before this line runs. Confirm where that happens.
#scaled_test = sc.fit_transform(X_test)
#scal = StandardScaler().fit(test_data_df)
#test = scal.transform(test_data_df)

# Predictions for the separate test set; as before, they end up in `m_svm`.
m_svm = svm_gs.predict(test_data_df)

In [None]:
# Re-create the 70/30 hold-out split (seed 104) for the ROC analysis.
X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                    test_size=0.3,
                                                    random_state=104)
#X_test = sc.transform(X_test)

# Hard class predictions, kept so `m_svm` is still available afterwards.
m_svm = svm_gs.predict(X_test)

# A ROC curve is traced by sweeping a threshold over a continuous score.
# Feeding it the hard class predictions, as the original did, yields a single
# operating point joined to the corners by straight lines. The SVM's signed
# distance to the decision boundary gives the full curve and a meaningful AUC.
svm_scores = svm_gs.decision_function(X_test)

fpr, tpr, threshold = metrics.roc_curve(y_test, svm_scores)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Learning curve: accuracy as a function of training-set size, estimated
# with 5-fold cross-validation (cv=5). `svm_gs` is the GridSearchCV object,
# so the grid search is re-run for every size/fold combination.
train_sizes, train_scores, test_scores = learning_curve(estimator = svm_gs,
                                                        X = data,
                                                        y = labels,
                                                        cv=5,
                                                        scoring='accuracy',
                                                        n_jobs=-1)

# Average across folds (axis=1) to get one score per training-set size.
# The original drew only the training curve and left `test_scores` unused;
# the cross-validation curve is what shows how well the model generalises,
# so both are plotted.
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
plt.title("Learn Curve for SVM Model")
plt.xlabel("Experiance Gained")
plt.ylabel("Accuracy Score")
plt.legend(loc='lower right')
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                    test_size=0.3,
                                                    random_state=100) 

# Predict the response for test dataset
# sc = StandardScaler()
# x_train = sc.fit_transform(X_train)
# x_test = sc.transform(X_test)

clf=RandomForestClassifier(criterion='entropy')
clf.fit(X_train,y_train)

m_random_forest = clf.predict(X_test)

feature_imp = pd.Series(clf.feature_importances_, index=train_data.columns).sort_values(ascending=False)

# keep top 15 values
top_ten_features = feature_imp.nlargest(15, keep='all')

%matplotlib inline
# Creating a bar plot
sns.barplot(x=top_ten_features, y=top_ten_features.index)
# labels
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# the legend works but takes up space and isnt needed
# plt.legend(top_ten_features.keys())
plt.show()

##############################
# Testing model on test data #
##############################
top_ten_labels = list(train_data[top_ten_features.keys()].keys())
top_ten_data = train_data[top_ten_features.keys()].values
# extracted 15 features 
train_data_2 = train_data[top_ten_labels]
