# Import Modules

In [None]:
%load_ext autoreload
%autoreload
%pylab inline
import os
import glob
import json
import pandas as pd
import pickle

#from IPython import embed
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are taken from the public IPython.display module:
# importing `display` from IPython.core.display has been deprecated since
# IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Show wide/long frames without truncation. Every option uses its fully
# qualified key so the lookup does not depend on pandas' abbreviated-key
# pattern matching.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", 100)

In [None]:
from helper_functions import *

# Read and Process Excel Sheets (Organized by Fuel Type and Labeled as Burnable or Unburnable)

In [None]:
# Directory (relative to the working directory) holding the input workbook.
lowg_data_base = "raw_data"

### Read the processed Excel workbook with alerts and confusion matrix

In [None]:
# Open the ground-truth workbook once; individual sheets are read from this
# handle further down.
ground_truth_path = os.path.join(lowg_data_base, 'ground_truth_data.xlsx')
alert_excel_processed = pd.ExcelFile(ground_truth_path)

In [None]:
# Maps sheet name -> DataFrame read from that sheet; filled by the loading loop.
sheet_to_df_map_read = dict()

In [None]:
# Names of the seven feature columns ('feature01' .. 'feature07') kept from
# every data sheet.
features_to_use = ['feature{:02d}'.format(index) for index in range(1, 8)]

In [None]:
# Load every data sheet of the workbook into `sheet_to_df_map_read`, keeping
# only the columns used downstream. The 'Summary' sheet is skipped up front
# rather than stored and deleted afterwards, so a workbook without a
# 'Summary' sheet no longer fails with KeyError.
columns_to_keep = ['SheetName'] + features_to_use + ['Label']
for sheet in alert_excel_processed.sheet_names:
    if sheet == 'Summary':
        continue
    # read_excel already returns a DataFrame, so no extra wrapping is needed.
    df_read = pd.read_excel(alert_excel_processed, sheet_name=sheet)
    sheet_to_df_map_read[sheet] = df_read[columns_to_keep]
# Preserve the original postcondition: the map never contains 'Summary'.
sheet_to_df_map_read.pop('Summary', None)

In [None]:
# Display the names of the loaded sheets (the 'Summary' entry has been removed
# from the map by this point).
sheet_to_df_map_read.keys()
#sheet_to_df_map_read['Summary']

In [None]:
# Preview the first five rows of the 'Fuel01' sheet.
sheet_to_df_map_read['Fuel01'].head(5)

In [None]:
# Preview the first five rows of the 'Fuel06' sheet.
sheet_to_df_map_read['Fuel06'].head(5)

### Define features and labels

In [None]:
# The seven feature columns, 'feature01' .. 'feature07'.
features = ['feature{:02d}'.format(index) for index in range(1, 8)]
# Non-feature columns carried along with the features.
# NOTE(review): 'NewLabel' is not read from the workbook here; presumably it
# is added by re_label_data -- confirm in helper_functions.
labels = ['Label', 'NewLabel', 'SheetName']
# Full column order for the combined frame: label columns first, then features.
cols = list(labels)
cols.extend(features)

### Re-label the sheets, then combine and prune them

In [None]:
# re_label_data and combine_df_sheets come from helper_functions.
# NOTE(review): presumably they add the 'NewLabel' column and stack the sheets
# (visited in sorted sheet-name order) into one frame limited to `cols` --
# confirm against the helper module.
sheet_to_df_map = re_label_data(sheet_to_df_map_read)
df_combined = combine_df_sheets(sheet_to_df_map, sorted(sheet_to_df_map.keys()), cols)
# Keep an independent snapshot. A plain assignment would only alias
# df_combined, so any later in-place edit would silently change the
# "original" as well. Assumes combine_df_sheets returns a pandas DataFrame.
df_combined_orig = df_combined.copy()
# Cell output: size of the combined data.
len(df_combined)

In [None]:
# Display the size of the combined data (row count if df_combined is a
# DataFrame).
len(df_combined)
#df_combined.keys()

# SVM Stuff

### Scikit-Learn Imports

In [None]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, average_precision_score
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, classification_report
from time import time

### Work with Multiple Features of Interest

In [None]:
# Feature columns of interest for the SVM: 'feature01' .. 'feature07'.
features = ['feature{:02d}'.format(index) for index in range(1, 8)]

#### Get Features and Labels for Training and Testing

In [None]:
# Extract model inputs and targets from the combined data, then hold out 20%
# of the samples for testing. The fixed seed keeps the split reproducible.
X, y = get_features_labels(df_combined, features_to_use)
(features_train,
 features_test,
 labels_train,
 labels_test) = train_test_split(X, y, random_state=42, test_size=0.2)

# split_labels_sheets (from helper_functions) returns sheet identifiers next to
# the labels. NOTE(review): presumably `y` carries both the class label and
# the sheet name per sample -- confirm in the helper module.
(sheets_train,
 sheets_test,
 labels_train,
 labels_test) = split_labels_sheets(labels_train, labels_test)

# Cell output: sizes of the four splits.
tuple(len(part) for part in (features_train, features_test, labels_train, labels_test))

#### Create and Train the Model

In [None]:
########################## SVM #################################
# Linear-kernel SVM with class_weight="balanced" (scikit-learn then weights
# classes inversely to their frequency).
# Training takes significantly longer with large values of C.
svm_settings = {"kernel": "linear", "class_weight": "balanced"}
clf = SVC(**svm_settings)

### Fit the classifier on the training features and labels, timing the fit.
t0 = time()
clf.fit(features_train, labels_train)
training_seconds = round(time() - t0, 3)
print("Training Time:", training_seconds, "s")

#### Save the Model to a File

In [None]:
# Save the trained model to disk.
# File naming convention:
#   svm_N1205_T080_C1 --> svm_N(number of total data)_T(percentage for
#   training)_C(model parameter in model constructor, default 1)
svm_model_file = 'models_trained/svm_N1205_T080_C1'
# Create the target directory if it is missing so the dump does not fail.
os.makedirs(os.path.dirname(svm_model_file), exist_ok=True)
# The context manager flushes and closes the file even if pickling raises.
with open(svm_model_file, 'wb') as model_file:
    pickle.dump(clf, model_file)

#### Load the Model from a File (a template: load the specific model needed for each purpose)

In [None]:
# load the model from disk

## Predict on the Training Data Itself using SVM

In [None]:
# Reload the persisted model so the evaluation reflects what is stored on disk.
# NOTE: unpickling executes arbitrary code; only load model files you created.
with open('models_trained/svm_N1205_T080_C1', 'rb') as model_file:
    clf = pickle.load(model_file)

# Predict on the training features and report how long prediction takes.
t1 = time()
labels_svm_pred = clf.predict(features_train)
print("Prediction Time:", round(time() - t1, 3), "s")
#print ('labels_svm_pred: {}'.format(labels_svm_pred))

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy = accuracy_score(labels_train, labels_svm_pred)
print('Accuracy Score: {}'.format(accuracy))

# labels=[1, 0] puts the positive class in the first row/column.
conf_mat = confusion_matrix(labels_train, labels_svm_pred, labels=[1, 0])
print('Confusion Matrix: \n{}'.format(conf_mat))

print('Classification Report: \n')
print(classification_report(labels_train, labels_svm_pred, labels=[1, 0]))

# Average precision summarises the precision-recall curve over all thresholds,
# so it needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point; use the SVM decision values instead.
scores_svm = clf.decision_function(features_train)
average_precision = average_precision_score(labels_train, scores_svm)
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

## Predict on the Test Data using SVM

In [None]:
### Use the trained classifier to predict labels for the test features.
# Reload the persisted model so the evaluation reflects what is stored on disk.
# NOTE: unpickling executes arbitrary code; only load model files you created.
with open('models_trained/svm_N1205_T080_C1', 'rb') as model_file:
    clf = pickle.load(model_file)

# Predict on the held-out features and report how long prediction takes.
t1 = time()
labels_svm_pred = clf.predict(features_test)
print("Prediction Time:", round(time() - t1, 3), "s")
#print ('labels_svm_pred: {}'.format(labels_svm_pred))

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy = accuracy_score(labels_test, labels_svm_pred)
print('Accuracy Score: {}'.format(accuracy))

# labels=[1, 0] puts the positive class in the first row/column.
conf_mat = confusion_matrix(labels_test, labels_svm_pred, labels=[1, 0])
print('Confusion Matrix: \n{}'.format(conf_mat))

print('Classification Report: \n')
print(classification_report(labels_test, labels_svm_pred, labels=[1, 0]))

# Average precision summarises the precision-recall curve over all thresholds,
# so it needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point; use the SVM decision values instead.
scores_svm = clf.decision_function(features_test)
average_precision = average_precision_score(labels_test, scores_svm)
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

# Optional precision-recall plot, kept disabled. It was previously held in a
# bare string literal (echoed as cell output) and referred to an undefined
# `pred`; it now uses the decision scores computed above.
# precision, recall, thresholds = precision_recall_curve(labels_test, scores_svm)
# plt.plot(recall, precision)
# plt.xlabel('Recall')
# plt.ylabel('Precision')