# Importing Libraries

In [None]:
# Basic libraries for analysis and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn library for preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Sklearn library for model building
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn_evaluation.plot import grid_search

# Ignoring warnings
import warnings

warnings.filterwarnings('ignore')

# Loading The Datasets

In [None]:
# Read the 15 CSV parts of the dataset. Each part keeps its own name
# (df1 ... df15) because the inspection cells below refer to them one by one.
(
    df1, df2, df3, df4, df5,
    df6, df7, df8, df9, df10,
    df11, df12, df13, df14, df15,
) = (pd.read_csv(f"triple/data{part}.csv") for part in range(1, 16))

# Data Preparation: Cleaning Each Dataset

Now that we have all DataFrames, it's time to inspect them to see if they need any cleaning. Let's look into the dataset one-by-one.

In [None]:
# Dimensions (rows, columns) of the first dataset
df1.shape

In [None]:
# Structural summary (entries, columns, dtypes) of the first dataset
df1.info()

The dataset has 4966 observations and 129 columns, of which 112 columns have the float64 datatype, 16 columns have the int64 datatype and 1 column has the object datatype. Since the info function does not show a description of each column, we will take a look at the first 5 observations of the dataset.



In [None]:
# First rows of the first dataset, to see the actual column names and values
df1.head()

Let's inspect the other datasets to see whether they can be merged.

In [None]:
# Structural summary of df2, to compare its layout with df1
df2.info()

In [None]:
# Structural summary of df3, to compare its layout with df1
df3.info()

In [None]:
# Structural summary of df4, to compare its layout with df1
df4.info()

In [None]:
# Structural summary of df5, to compare its layout with df1
df5.info()

In [None]:
# Structural summary of df6, to compare its layout with df1
df6.info()

In [None]:
# Structural summary of df7, to compare its layout with df1
df7.info()

In [None]:
# Structural summary of df8, to compare its layout with df1
df8.info()

In [None]:
# Structural summary of df9, to compare its layout with df1
df9.info()

In [None]:
# Structural summary of df10, to compare its layout with df1
df10.info()

In [None]:
# Structural summary of df11, to compare its layout with df1
df11.info()

In [None]:
# Structural summary of df12, to compare its layout with df1
df12.info()

In [None]:
# Structural summary of df13, to compare its layout with df1
df13.info()

In [None]:
# Structural summary of df14, to compare its layout with df1
df14.info()

In [None]:
# Structural summary of df15, to compare its layout with df1
df15.info()

All the data have same number of features, therefore we can merge the datasets

In [None]:
# Stack the 15 parts into a single DataFrame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every part contributes its own
# row labels and the combined index contains duplicates.
parts = [df1, df2, df3, df4, df5, df6, df7, df8,
         df9, df10, df11, df12, df13, df14, df15]
df = pd.concat(parts, ignore_index=True)
print(df.shape)
df.head()

# Exploratory Data Analysis

In [None]:
# Dimensions (rows, columns) of the merged dataset
df.shape

According to https://sites.google.com/a/uah.edu/tommy-morris-uah/ics-data-sets?pli=1, the 129 columns include 29 types of measurements from each phasor measurement unit (PMU). The power system has 4 PMUs, each measuring 29 features, for 116 PMU measurement columns in total. Each column name has the form “R#-Signal Reference”, which indicates a type of measurement from the PMU specified by “R#”.

The "marker" column, which contains three-class categorical data (NoEvents, Natural, and Attack), is the target variable.

Since the dataset is very large with 78,377 observations, this will affect the computational speed of our analysis and model building. A random sample will be selected from the dataset without altering the distribution of the target (marker). Therefore, we can count the three class marker and randomly select 30% of the dataset. So first we look at the summary of the data.

In [None]:
# Summary statistics of the columns of the merged dataset
df.describe()

The statistics of the dataset show that the features vary widely in scale, from large negative to large positive values. Several features have a very low standard deviation. Let's see how balanced the target variable is.

In [None]:
# Number of observations in each class of the target column
df['marker'].value_counts()

Let's check the percentage of each marker 

In [None]:
# Share of each class in the target column, as a percentage rounded to 2 decimals
marker_counts = df['marker'].value_counts()
n_rows = sum(marker_counts)
for label in ('Attack', 'Natural', 'NoEvents'):
    print(f"Percentage of {label} marker: ", round((marker_counts[label] / n_rows) * 100, 2), "%")

Therefore our sample should approximately have similar percentage of the target values.

In [None]:
# Draw 30% of the rows *within each marker class* so the class proportions of
# the sample match the full dataset by construction (a plain
# df.sample(frac=0.3) only matches them approximately). random_state makes
# the draw repeatable between runs.
new_df = (
    df.groupby('marker', group_keys=False)
      .sample(frac=0.3, random_state=42)
      .sample(frac=1, random_state=42)  # shuffle so rows are not left ordered by class
)
new_df.head()

Let's check the new distribution of the sampled data

In [None]:
# Counting the observations of each marker class in the sampled data
new_df['marker'].value_counts()

In [None]:
# Percentage of each target class in the sampled data, rounded to 2 decimals
sample_counts = new_df['marker'].value_counts()
sample_total = sum(sample_counts)
for label in ('Attack', 'Natural', 'NoEvents'):
    print(f"Percentage of {label} marker: ", round((sample_counts[label] / sample_total) * 100, 2), "%")

Since the percentage is approximately similar to the original dataset, this means that the distribution is largely the same.

Now, we can analyse the dataset.

In [None]:
# Dimensions (rows, columns) of the sampled dataset
new_df.shape

In [None]:
# Total number of null values over all columns of the sampled dataset
new_df.isnull().sum().sum()

There are no null values in dataset, but let's take a look at what the numerical values of the dataset looks like

In [None]:
# Summary statistics of the columns of the sampled dataset
new_df.describe()

Lets See how the features are related

In [None]:
# Computing the correlation matrix of the explanatory features.
# Only numeric columns are passed on: the categorical 'marker' column cannot
# be correlated, and pandas >= 2.0 raises on it instead of skipping it.
new_df.select_dtypes(include='number').corr()

The matrix shows the correlation of some of the features as a NaN value.

In this instance, the NaN is taken to mean that there is no relation between the two variables. The presence of nan values in the output may arise for a variety of reasons. In our case, when the standard deviation of one feature is zero, it generates a NaN value as dividing the covariance of the two feature with zero is mathematically incorrect.

Therefore, we will drop all columns with 0 standard deviation as it does not add any importance when predicting on the dataset.

In [None]:
# Dropping columns with 0 standard deviation.
# The 'std' row of describe() holds the standard deviation of each column.
stds = new_df.describe().loc[['std']]

# Names of the columns whose standard deviation is exactly zero
cols = [name for name in stds.columns if stds[name].item() == 0.0]
new_df.drop(columns=cols, inplace=True)
print('The columns dropped are: ', cols)
new_df.shape

To visualize the correlation on the heatmap, we do it in batches as the screen is too small to accommodate the large feature correlation matrix. Since we have 125 columns, we divide the visualization into 5 batches.

In [None]:
# Correlation heatmap for the first batch of columns (positions 0-24)
batch_corr = new_df.iloc[:, 0:25].corr()
plt.figure(figsize=(30,30))
sns.heatmap(batch_corr, annot=True)

In [None]:
# Correlation heatmap for the second batch of columns (positions 25-49)
batch_corr = new_df.iloc[:, 25:50].corr()
plt.figure(figsize=(30,30))
sns.heatmap(batch_corr, annot=True)

In [None]:
# Correlation heatmap for the third batch of columns (positions 50-74)
batch_corr = new_df.iloc[:, 50:75].corr()
plt.figure(figsize=(30,30))
sns.heatmap(batch_corr, annot=True)

In [None]:
# Correlation heatmap for the fourth batch of columns (positions 75-99)
batch_corr = new_df.iloc[:, 75:100].corr()
plt.figure(figsize=(30,30))
sns.heatmap(batch_corr, annot=True)

In [None]:
# Correlation heatmap for the last batch of columns (positions 100-124).
# This slice reaches the end of the frame, where the categorical 'marker'
# column sits, so non-numeric columns are filtered out before corr();
# pandas >= 2.0 raises on them instead of skipping them.
batch_corr = new_df.iloc[:, 100:125].select_dtypes(include='number').corr()
plt.figure(figsize=(30,30))
sns.heatmap(batch_corr, annot=True)

The heatmaps for all the batches shows that several columns have high correlation. As high as 0.97 for some and this is not good for the dataset as it is just extra noise in the dataset. If two variables are correlated, we can predict one from the other. Therefore, if two features are correlated, the model only really needs one of them, as the second one does not add additional information. We need to set an absolute value threshold for selecting the variables. If we find that the predictor variables are correlated among themselves, we can drop the variable which has a lower correlation coefficient value with the target variable. This is done in the feature Engineering section.

# Univariate Analysis of the Dataset

We will perform a univariate analysis on the dataset to observe how each feature is distributed.

On the first attempt at building a histogram, we observed that the dataset contains infinite values, which pandas reads as floating-point values. Therefore, let's handle the infinite values so that we can properly visualise the dataset.

In [None]:
# Total number of infinite entries in every column except the last one
np.isinf(new_df.iloc[:,:-1]).values.sum()

The number of infinite values are too large that we cannot drop them as it might distort the dataset. So we will replace the infinite values with the mean of the each feature containing the infinite values. First we will replace the values with Null values then with the mean values.

In [None]:
# Replacing +inf and -inf with NaN, in place, so they count as missing values
new_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Total number of null values now present (the former infinite entries)
new_df.isnull().sum().sum()

In [None]:
# Fill the missing values of every feature column with that column's mean.
# The 'marker' column is left out of the imputation.
imputer = SimpleImputer(strategy='mean')
feature_mask = new_df.columns != "marker"
new_df.loc[:, feature_mask] = imputer.fit_transform(new_df.loc[:, feature_mask])

# Total number of null values remaining after imputation
new_df.isnull().sum().sum()

We can now visualize the distribution of some of the features using a histogram

In [None]:
# Histograms of the first 30 columns of the dataset
new_df.iloc[:, 0:30].hist(figsize=(20,20))
plt.show()

In [None]:
# Histograms of the next 30 columns of the dataset (positions 30-59)
new_df.iloc[:, 30:60].hist(figsize=(20,20))
plt.show()

Several features in the dataset are densely concentrated, with very low variance. Such features tend to contribute less to the prediction of the target variable, so they will be handled during the feature engineering of the dataset.

# Bivariate analysis 

Next we investigate the relationship between each explanatory variable and the target variable by creating grouped box plots.

In [None]:
# Names of the columns currently in the sampled dataset
new_df.columns

In [None]:
# One box plot per column (all but the last), split by marker class,
# laid out on a 22 x 6 grid of subplots
rows = 22
cols = 6
fig = plt.figure(figsize=(30,60))
for position, feature in enumerate(new_df.columns[:-1], start=1):
    panel = fig.add_subplot(rows, cols, position)
    sns.boxplot(x="marker", y=feature, data=new_df, palette='GnBu', ax=panel)
plt.show()

Each box shows how spread out the data is within group and putting boxes side by side indicates the difference among groups. It is aligned with ANOVA test which also analyze the degree of variance between-group compared to within-group. 

# Data Preprocessing

Lets start by checking for outliers

In [None]:
# Box plot of each of the first 57 columns, to look for outliers
rows, cols = 22, 6
df_cols = new_df.columns[0:57]
fig = plt.figure(figsize=(30,60))
for position, name in enumerate(df_cols, start=1):
    panel = fig.add_subplot(rows, cols, position)
    sns.boxplot(x=new_df[name], ax=panel)
plt.show()

During the exploratory data analysis, we discovered that the only missing values in the dataset were "infinity" values, and we handled those to be able to visualize the dataset. The boxplots also show a lot of outliers. Dropping or trimming the outliers would remove a large number of records from the dataset, which isn't desirable. Therefore we are going to cap the values to limit the effect of the outliers.

In [None]:
# Dimensions (rows, columns) of the dataset before capping the outliers
new_df.shape

In [None]:
# Capping limits per feature: mean +/- 3 standard deviations.
# The last column ('marker', the categorical target) is excluded, matching the
# slice used by the capping step; averaging a string column raises on
# pandas >= 2.0. Mean and std are computed once and reused for both limits.
feature_values = new_df.iloc[:, :-1]
feature_mean = feature_values.mean()
feature_std = feature_values.std()
upper_limit = feature_mean + 3*feature_std
lower_limit = feature_mean - 3*feature_std

In [None]:
# Cap every feature (all columns but the last) to [lower_limit, upper_limit]:
# values above the upper limit become the upper limit, values below the lower
# limit become the lower limit, everything else is kept.
features = new_df.iloc[:, :-1]
floored = np.where(features < lower_limit, lower_limit, features)
new_df.iloc[:, :-1] = np.where(features > upper_limit, upper_limit, floored)

In [None]:
# Box plot of every column except the last, after capping
rows, cols = 22, 6
df_cols = new_df.columns[0:-1]
fig = plt.figure(figsize=(30,60))
for position, name in enumerate(df_cols, start=1):
    panel = fig.add_subplot(rows, cols, position)
    sns.boxplot(x=new_df[name], ax=panel)
plt.show()

Although the outliers have not been removed, they have been minimized.

# Declare feature vector and target variable

The explanatory varables consists of remaining 57%, while marker is the target variable.

In [None]:
# Feature matrix: every column except the target (drop returns a new frame)
X = new_df.drop(columns=['marker'])
X.head()

In [None]:
# Target variable: the 'marker' column, re-indexed from 0 so its labels no
# longer carry the row labels of the sampled frame
y = new_df['marker'].reset_index(drop=True)
y.head()

In [None]:
# Encoding the class labels of the target variable as integers
Encoder=LabelEncoder()
y=Encoder.fit_transform(y)
# Distinct encoded class values
np.unique(y)

Due to the complexity of the feature vector and the vast difference in the variance, we scale the features

In [None]:
# Standardise the features (fit the scaler and transform in one step)
scalar = StandardScaler()
X = scalar.fit_transform(X)
X

# Model  creation  and  evaluation

Using the scikit-learn library, three models (Random Forest, Logistic Regression, and K-Nearest Neighbors) are constructed with 10-fold cross-validation and compared. The metrics used in the comparison are as follows:

Due to a high class imbalance (Percentage of Attack marker:  71.2%, Percentage of Natural marker:  23.45%, Percentage of NoEvents marker:  5.35%), a weighted recall, precision and f1 score (which considers both precision and recall) will be used in evaluating the model. the weighted metric, accounts for class imbalance by computing the average of binary metrics weighted by the number of samples of each class in the target.

https://towardsdatascience.com/comprehensive-guide-on-multiclass-classification-metrics-af94cfb83fbd

# Random Forest

In [None]:
# Random Forest with default hyper-parameters, scored by stratified 10-fold CV
rf_cv = RandomForestClassifier()
cv = StratifiedKFold(n_splits=10)
weighted_metrics = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores = cross_validate(rf_cv, X, y, cv=cv, scoring=weighted_metrics)

# Report the mean of each metric over the folds
for label, metric in zip(('Precision', 'Recall', 'F1'), weighted_metrics):
    print(f'{label} value for the cross validation of Random Forest classifier is: ', np.mean(scores[f'test_{metric}']) , '\n')

# Logistics Regression

In [None]:
# Logistic Regression with default hyper-parameters, scored by stratified 10-fold CV
lg_cv = LogisticRegression()
cv = StratifiedKFold(n_splits=10)
weighted_metrics = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores1 = cross_validate(lg_cv, X, y, cv=cv, scoring=weighted_metrics)

# Report the mean of each metric over the folds
for label, metric in zip(('Precision', 'Recall', 'F1'), weighted_metrics):
    print(f'{label} value for the cross validation of Logistic Regression is: ', np.mean(scores1[f'test_{metric}']) , '\n')

# K Nearest Neighborhood

In [None]:
# K-Neighbors classifier with default hyper-parameters, scored by stratified 10-fold CV
knn_cv = KNeighborsClassifier()
cv = StratifiedKFold(n_splits=10)
weighted_metrics = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores2 = cross_validate(knn_cv, X, y, cv=cv, scoring=weighted_metrics)

# Report the mean of each metric over the folds
for label, metric in zip(('Precision', 'Recall', 'F1'), weighted_metrics):
    print(f'{label} value for the cross validation of KNeighbors Classifier is: ', np.mean(scores2[f'test_{metric}']) , '\n')

Let's Visualize the performance of the models according to the k-folds

In [None]:
# Per-fold weighted precision of the three baseline models.
# The y axis is labelled with the metric actually plotted (it used to say
# "Accuracy"), and the x range follows the number of folds in the results.
plt.figure(figsize=(10,5))
metric = 'test_precision_weighted'
num_k = range(len(scores[metric]))

plt.plot(num_k, scores[metric], label='Random Forest')
plt.plot(num_k, scores1[metric], label='Logistics Regression')
plt.plot(num_k, scores2[metric], label='K Nearest Neighborhood')
plt.legend(bbox_to_anchor = (1,1.3))
plt.title('Weighted Precision Scores of the Three Models')
plt.xlabel('Number of validations')
plt.ylabel('Weighted precision')

In [None]:
# Per-fold weighted F1 of the three baseline models.
# The title and y label now name the metric actually plotted (the title used
# to say "Recall" and the y label "Accuracy").
plt.figure(figsize=(10,5))
metric = 'test_f1_weighted'
num_k = range(len(scores[metric]))

plt.plot(num_k, scores[metric], label='Random Forest')
plt.plot(num_k, scores1[metric], label='Logistics Regression')
plt.plot(num_k, scores2[metric], label='K Nearest Neighborhood')
plt.legend(bbox_to_anchor = (1,1.3))
plt.title('Weighted F1 Scores of the Three Models')
plt.xlabel('Number of validations')
plt.ylabel('Weighted F1 score')

In [None]:
# Per-fold weighted recall of the three baseline models.
# The y axis is labelled with the metric actually plotted (it used to say
# "Accuracy"), and the x range follows the number of folds in the results.
plt.figure(figsize=(10,5))
metric = 'test_recall_weighted'
num_k = range(len(scores[metric]))

plt.plot(num_k, scores[metric], label='Random Forest')
plt.plot(num_k, scores1[metric], label='Logistics Regression')
plt.plot(num_k, scores2[metric], label='K Nearest Neighborhood')
plt.legend(bbox_to_anchor = (1,1.3))
plt.title('Weighted Recall Scores of the Three Models')
plt.xlabel('Number of validations')
plt.ylabel('Weighted recall')

The three evaluation metrics show that the Random Forest classifier performed better than the other two models, followed by K-Nearest Neighbors and then Logistic Regression.

# Feature Engineering

During exploratory data analysis, we discovered that several features are highly correlated. therefore, we first drop one of such features that have correlation < -0.9 and > 0.9

In [None]:
# Correlation matrix of the numeric features only: the categorical 'marker'
# column cannot be correlated and pandas >= 2.0 raises on it.
corr_matrix = new_df.select_dtypes(include='number').corr()

# Keep the strict upper triangle (k=1 skips the diagonal) so that every pair
# of features is looked at once, and use absolute values so that strong
# negative correlations count as well.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask).abs()
print(upper)

# Columns whose absolute correlation with an earlier column exceeds 0.90
to_drop = [column for column in upper.columns if (upper[column] > 0.90).any()]
print(to_drop)

# Drop the columns
new_df = new_df.drop(columns=to_drop)
new_df.shape

From the features that are left, we will use mutual information gain to further select important features. Mutual information, from the field of information theory, is the application of information gain (typically used in the construction of decision trees) to feature selection.

Mutual information is calculated between two variables and measures the reduction in uncertainty for one variable given a known value of the other variable. A threshold of 0.01 is given for the selection.

https://machinelearningmastery.com/feature-selection-with-numerical-input-data/

In [None]:
# Score every remaining feature against the target with mutual information
# and keep the features whose score is ABOVE the threshold. The previous
# condition (score <= 0.01) selected the least informative features instead.
MI_THRESHOLD = 0.01

# errors='ignore' keeps the cell re-runnable once 'marker' has been removed
new_df = new_df.drop(columns=['marker'], errors='ignore')
feature_scores = mutual_info_classif(new_df, y, random_state=0)

high_score_features = []
for score, f_name in sorted(zip(feature_scores, new_df.columns), reverse=True):
    if score > MI_THRESHOLD:
        high_score_features.append(f_name)
        print(f_name, score)

# Scores as a Series labelled by feature name; the index is taken from the
# actual columns rather than a hard-coded length
mutual_info = pd.Series(feature_scores, index=new_df.columns)

# Plot the ordered mutual_info values per feature
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))

In [None]:
# Feature matrix restricted to the columns collected in high_score_features
selected_X = new_df[high_score_features]
selected_X.head()

# Rebuilding model 

Rescale the features since the initial X was not used to select best features

In [None]:
# Standardise the selected features (fit the scaler and transform in one step)
scalar = StandardScaler()
selected_X = scalar.fit_transform(selected_X)
selected_X

# Random Forest

In [None]:
# Random Forest on the selected features, scored by stratified 10-fold CV
rf_cv = RandomForestClassifier()
cv = StratifiedKFold(n_splits=10)
weighted_metrics = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores0 = cross_validate(rf_cv, selected_X, y, cv=cv, scoring=weighted_metrics)

# Report the mean of each metric over the folds
for label, metric in zip(('Precision', 'Recall', 'F1'), weighted_metrics):
    print(f'{label} value for the cross validation of Random Forest classifier with selected feature is: ', np.mean(scores0[f'test_{metric}']) , '\n')

# Logistics Regression

In [None]:
# Logistic Regression on the selected features, scored by stratified 10-fold CV
lg_cv = LogisticRegression()
cv = StratifiedKFold(n_splits=10)
weighted_metrics = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores10 = cross_validate(lg_cv, selected_X, y, cv=cv, scoring=weighted_metrics)

# Report the mean of each metric over the folds
for label, metric in zip(('Precision', 'Recall', 'F1'), weighted_metrics):
    print(f'{label} value for the cross validation of Logistic Regression with selected feature is: ', np.mean(scores10[f'test_{metric}']) , '\n')

# K Nearest Neighborhood

In [None]:
# K-Neighbors classifier on the selected features, scored by stratified 10-fold CV
knn_cv = KNeighborsClassifier()
cv = StratifiedKFold(n_splits=10)
weighted_metrics = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores20 = cross_validate(knn_cv, selected_X, y, cv=cv, scoring=weighted_metrics)

# Report the mean of each metric over the folds
for label, metric in zip(('Precision', 'Recall', 'F1'), weighted_metrics):
    print(f'{label} value for the cross validation of KNeighbors Classifier with selected feature is: ', np.mean(scores20[f'test_{metric}']) , '\n')

Visualizing the model rebuilt with the selected features

In [None]:
# Per-fold weighted precision of the three models rebuilt on the selected features.
# The y axis is labelled with the metric actually plotted (it used to say
# "Accuracy"), and the x range follows the number of folds in the results.
plt.figure(figsize=(10,5))
metric = 'test_precision_weighted'
num_k = range(len(scores0[metric]))

plt.plot(num_k, scores0[metric], label='Random Forest')
plt.plot(num_k, scores10[metric], label='Logistics Regression')
plt.plot(num_k, scores20[metric], label='K Nearest Neighborhood')
plt.legend(bbox_to_anchor = (1,1.3))
plt.title('Weighted Precision Scores of the Three Models')
plt.xlabel('Number of validations')
plt.ylabel('Weighted precision')

In [None]:
# Per-fold weighted recall of the three models rebuilt on the selected features.
# The y axis is labelled with the metric actually plotted (it used to say
# "Accuracy"), and the x range follows the number of folds in the results.
plt.figure(figsize=(10,5))
metric = 'test_recall_weighted'
num_k = range(len(scores0[metric]))

plt.plot(num_k, scores0[metric], label='Random Forest')
plt.plot(num_k, scores10[metric], label='Logistics Regression')
plt.plot(num_k, scores20[metric], label='K Nearest Neighborhood')
plt.legend(bbox_to_anchor = (1,1.3))
plt.title('Weighted Recall Scores of the Three Models')
plt.xlabel('Number of validations')
plt.ylabel('Weighted recall')

In [None]:
# Per-fold weighted F1 of the three models rebuilt on the selected features.
# The title and y label now name the metric actually plotted (the title used
# to say "Recall" and the y label "Accuracy").
plt.figure(figsize=(10,5))
metric = 'test_f1_weighted'
num_k = range(len(scores0[metric]))

plt.plot(num_k, scores0[metric], label='Random Forest')
plt.plot(num_k, scores10[metric], label='Logistics Regression')
plt.plot(num_k, scores20[metric], label='K Nearest Neighborhood')
plt.legend(bbox_to_anchor = (1,1.3))
plt.title('Weighted F1 Scores of the Three Models')
plt.xlabel('Number of validations')
plt.ylabel('Weighted F1 score')

It is observed that the performance of the Random Forest classifier reduced when the features were reduced. Although on a average, it is still the best model in the set, but the other two models performed almost the same as well. These performances shows that some percentage of importance from the features was lost, but the noise was reduced.

# Hyper parameter tuning

From observation, the best performing model is the Random Forest that used all the features in the dataset. Therefore, to find good parameters for the model, we search over the n_estimators and max_depth of the Random Forest classifier.

In [None]:
# Base estimator for the grid search, with a fixed random_state
rfc=RandomForestClassifier(random_state=42)

# Candidate values for each hyper-parameter (3 x 5 = 15 combinations)
param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_depth' : [4,5,6,7,8],
}



In [None]:
# Exhaustive search over param_grid with 10-fold cross-validation.
# Candidates are ranked by weighted F1, the metric this notebook uses to
# compare models because of the class imbalance; GridSearchCV's default for a
# classifier is plain accuracy.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10, scoring='f1_weighted')
CV_rfc.fit(X, y)

In [None]:

# Hyper-parameter combination selected by the grid search
CV_rfc.best_params_

In [None]:
# Using best parameters to rebuild model.
# The values are read from the fitted search rather than typed in by hand, so
# the model always matches what the grid search actually selected.
rfc1=RandomForestClassifier(random_state=42, **CV_rfc.best_params_)

In [None]:
# Fit the tuned model on the full feature matrix and target
rfc1.fit(X,y)

In [None]:
# Prediction and evaluation.
# Predictions are produced out-of-fold: each row is predicted by a clone of
# the model that did not see it during fitting. Predicting on the same rows
# the model was fitted on (rfc1.predict(X)) reports training performance,
# which is not comparable with the cross-validated scores of the other models.
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(rfc1, X, y, cv=StratifiedKFold(n_splits=10))
print(classification_report(y, y_pred))

In [None]:
# Bar chart comparing the default and the tuned Random Forest, with a table
# of the scores drawn underneath the axes.
# NOTE(review): the bar heights (cv_data) and the table cells (cv_acc) are two
# separately hard-coded sets of numbers and they disagree (e.g. 0.86 vs 0.85
# for the untuned precision). Confirm which set is correct and derive both
# from a single source.
cv_data = {'Random Forest without tuning':{'precision':0.86,'Recall':0.86,'F1_score':0.85},
           'Random Forest with tuning':{'precision':0.78,'Recall':0.75,'F1_score':0.68},
           }

cv_acc = [[0.85, 0.85, 0.84],[0.75 , 0.73, 0.65]]
columns = ['Precision', 'Recall', 'F1_score']
rows = ['Random Forest without tuning', 'Random Forest with tuning']

# One group of bars per metric, one bar per model
pd.DataFrame(cv_data).plot(kind='bar', figsize=(14,6))
table = plt.table(cellText=cv_acc, rowLabels=rows, colLabels=columns)

# make space for the table:
plt.subplots_adjust(left=0.2, bottom=0.2)
plt.ylabel("Accuracy Value")  # the former .format(200) had no placeholder to fill
plt.legend(bbox_to_anchor = (1,1.3))
plt.xticks([])  # the table's column headers act as the x labels
plt.title('Accuracy of Random Forest, with and without tuning')

The bar plot shows that the default parameters of the model performed better.