In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset CSV stored on Drive can be read by the following cells.
# NOTE(review): google.colab is presumably only available inside Colab -- this
# cell is expected to fail in a local Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the dataset CSV from the mounted Google Drive into the DataFrame
# `windowsData`, print its (rows, columns) shape and display the first rows.
import pandas as pd
path = "/content/drive/MyDrive/Colab Notebooks/Thesis/windows10_dataset.csv"
windowsData = pd.read_csv(path)
print(windowsData.shape)
windowsData.head()

In [None]:
# Shape of the dataset as a (number of rows, number of columns) tuple
windowsData.shape

In [None]:
# Preview the first 10 rows to get a feel for the columns and their values
windowsData.head(10)

In [None]:
# Total number of missing (NaN) cells in the whole DataFrame:
# isna().sum() counts them per column, the second sum() adds those counts up.
# (This is a count of cells, not of empty columns.)
windowsData.isna().sum().sum()

In [None]:
# #specify that all columns should be shown
# pd.set_option('max_columns', None)

# #view DataFrame
# windowsData

In [None]:
# Allow up to 129 rows to be displayed so the per-column dtype listing below
# is not truncated (presumably chosen to match the number of columns -- TODO confirm).
pd.set_option('display.max_rows', 129)

# Data type of every column, before any cleaning
windowsData.dtypes

In [None]:
# Pairwise correlation between the columns at positions 1..126 of the raw data.
# numeric_only=True restricts the computation to numeric columns: the data has
# not been cleaned yet at this point, and pandas >= 2.0 raises on string/object
# columns instead of silently leaving them out.
windowsData.iloc[:, 1:127].corr(numeric_only=True)

In [None]:
# Imputer that replaces every NaN value with the constant -1
from sklearn.impute import SimpleImputer
impute = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)

In [None]:
# Apply the imputer currently bound to `impute` to the columns at positions
# 1..125 (column 0 is left untouched) and write the result back in place.
# Fitting and transforming are done in a single call.
windowsData.iloc[:, 1:126] = impute.fit_transform(windowsData.iloc[:, 1:126])

In [None]:
# Display the DataFrame to inspect the result of the NaN imputation
windowsData

In [None]:
# Rebind `impute` to a second imputer that treats a single-space string ' '
# as the missing-value marker and replaces it with the constant -1.
impute = SimpleImputer(missing_values= ' ', strategy='constant', fill_value = -1)

In [None]:
# Apply the imputer currently bound to `impute` to the columns at positions
# 1..125 (column 0 is left untouched) and write the result back in place.
# Fitting and transforming are done in a single call.
windowsData.iloc[:, 1:126] = impute.fit_transform(windowsData.iloc[:, 1:126])

In [None]:
# Display the DataFrame to inspect the result of the blank-string imputation
windowsData

In [None]:
# Convert the feature columns at positions 1..125 to float.
# The conversion goes through DataFrame.astype with a per-column mapping rather
# than `windowsData.iloc[:, 1:126] = ...astype(float)`: since pandas 2.0 an iloc
# assignment writes into the existing columns in place, so object-dtype columns
# keep their object dtype and the conversion silently has no effect.
# NOTE: assumes the column labels are unique.
windowsData = windowsData.astype({col: float for col in windowsData.columns[1:126]})

In [None]:
# Allow up to 129 rows to be displayed so the dtype listing is not truncated
pd.set_option('display.max_rows', 129)

# Data type of every column after the float conversion -- the converted
# columns are expected to show up as float64 here.
windowsData.dtypes

In [None]:
# Pairwise correlation between the columns at positions 1..126, recomputed on
# the cleaned data.
# NOTE(review): other cells slice 1:126 (imputation) or 0:126 (heatmap) --
# confirm which column range is intended.
windowsData.iloc[:,1:127].corr()

In [None]:

# Annotated heatmap of the correlation matrix of the columns at positions
# 0..125; each coefficient is printed as a whole-number percentage.
fig, ax = plt.subplots(figsize=(100, 100))
corr_matrix = windowsData.iloc[:, 0:126].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.0%',
    linewidth=10,
    ax=ax,
)

In [None]:
#sns.pairplot(windowsData.iloc[:,1:126], hue='label')

In [None]:
# n_samples = 100
# df_sample = windowsData.sample(n_samples , axis=1)
# sns.pairplot(df_sample, hue = 'label')

In [None]:
# from scipy.stats import spearmanr


# targetVar = windowsData['label']
# corr_threshold = 0.4

# corr = spearmanr(windowsData)
# corrSeries = pd.Series(corr[0][:,0], index=windowsData.columns) #Series with column names and their correlation coefficients
# corrSeries = corrSeries[(corrSeries.index != targetVar) & (corrSeries > corr_threshold)] #apply the threshold

# vars_to_keep = list(corrSeries.index.values) #list of variables to keep
# vars_to_keep.append(targetVar)  #add the target variable back in
# data2 = windowsData[vars_to_keep]

In [None]:
# Stratified 8-fold splitter used by the cross-validation loops below.
# Shuffling is enabled and random_state is fixed so the fold assignment is
# the same on every run.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=8, shuffle = True, random_state=3)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
# from sklearn.metrics import f1_score

In [None]:
# -------------------  Feature / target split  ------------------------------------------
# Target: the 'label' column.
# Features: every column except 'ts', 'type' and 'label'.
y = windowsData['label']
x = windowsData.drop(columns=['ts', 'type', 'label'])

# Disabled alternative: a single hold-out split (80% train / 20% test)
# x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, random_state=22)
# print("Training dataset shape : ",x_train.shape)
# print("Testing dataset shape : ",x_test.shape)

# Preview the feature matrix
x.head()

In [None]:
def get_accuracy_score(model, x_train, x_test, y_train, y_test):
    """Fit `model` on the training split and return its accuracy on the test split.

    `model` is fitted in place via model.fit(x_train, y_train); its predictions
    for x_test are then compared against y_test with accuracy_score.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    # Predictions are passed first and the ground truth second; accuracy is
    # symmetric in its two arguments, so the result does not depend on the order.
    return accuracy_score(predictions, y_test)

In [None]:
def f1_score(y_true, y_pred):
    """Binary F1 score for labels encoded as 0 (negative) and 1 (positive).

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Ground-truth and predicted labels. Anything np.asarray accepts works
        (lists, NumPy arrays, pandas Series); the two inputs are compared
        element by element, by position.

    Returns
    -------
    float
        2*TP / (2*TP + FP + FN), which equals the harmonic mean of precision
        and recall. When there are no true positives, false positives or false
        negatives at all the score is undefined; 0.0 is returned in that case
        instead of dividing by zero.

    Note: this helper has the same name as sklearn.metrics.f1_score and would
    shadow it if that function were imported unqualified.
    """
    # Work on plain arrays so lists are accepted and pandas index alignment
    # cannot change which elements are compared.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.count_nonzero((y_true == 1) & (y_pred == 1))
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))

    # Single-division form of 2 * precision * recall / (precision + recall);
    # it has only one denominator that can be zero.
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator

In [None]:
def get_f1_score(model, x_train, x_test, y_train, y_test):
    """Fit `model` on the training split and return its F1 score on the test split.

    `model` is fitted in place via model.fit(x_train, y_train). The score is
    computed by this notebook's own f1_score(y_true, y_pred) helper.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # f1_score is declared as f1_score(y_true, y_pred): pass the ground truth
    # first and the predictions second, matching its signature.
    return f1_score(y_test, y_pred)

In [None]:
# --------------------------Random Forest -----------------------------------
from sklearn.ensemble import RandomForestClassifier
from statistics import mean

In [None]:
# Random Forest baseline: stratified k-fold cross-validation on the original data.
# A single forest is trained per fold and both metrics are computed from the
# same predictions, so accuracy and F1 describe the same model and each fold
# is fitted only once.
RF_accuracy_scores = []
RF_f1_scores = []

for train_index, test_index in skf.split(x, y):
    # NOTE: x_train / y_train keep the last fold's split after the loop ends;
    # the SMOTE cell that follows reads them.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fixed seed so the reported scores are reproducible across runs.
    rf_model = RandomForestClassifier(random_state=3)
    rf_model.fit(x_train, y_train)
    y_pred = rf_model.predict(x_test)

    RF_accuracy_scores.append(accuracy_score(y_test, y_pred))
    RF_f1_scores.append(f1_score(y_test, y_pred))

print('K-Fold Accuracy Score: ',mean(RF_accuracy_scores))
print('K-Fold F1 Score:', mean(RF_f1_scores))

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

# NOTE: x_train / y_train are whatever the preceding cross-validation loop
# left behind, i.e. the training part of its last fold.

# Class distribution before oversampling
unique, count = np.unique(y_train, return_counts=True)
y_train_dict_value_count = dict(zip(unique, count))
print("Before oversampling",y_train_dict_value_count)

# Oversample the training data with SMOTE (seeded for repeatability)
sm = SMOTE(random_state=12)
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

# Class distribution after oversampling
unique, count = np.unique(y_train_sm, return_counts=True)
y_train_smote_value_count = dict(zip(unique, count))
print("After oversampling",y_train_smote_value_count)

In [None]:
# Random Forest with SMOTE oversampling, evaluated by stratified k-fold
# cross-validation on the ORIGINAL data.
# SMOTE is applied to the training part of each fold only. Splitting data that
# was already oversampled would put synthetic samples (interpolated from
# training points) into the test folds, leaking information and inflating the
# scores. Every test fold here contains real, untouched samples.
RF_accuracy_scores = []
RF_f1_scores = []

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Balance the classes of the training fold only.
    x_train_bal, y_train_bal = SMOTE(random_state=12).fit_resample(x_train, y_train)

    # One seeded forest per fold; both metrics come from the same predictions.
    rf_model = RandomForestClassifier(random_state=3)
    rf_model.fit(x_train_bal, y_train_bal)
    y_pred = rf_model.predict(x_test)

    RF_accuracy_scores.append(accuracy_score(y_test, y_pred))
    RF_f1_scores.append(f1_score(y_test, y_pred))

print('Accuracy Score after oversampling: ',mean(RF_accuracy_scores))
print('F1 Score after oversampling:', mean(RF_f1_scores))