# Exploratory Data Analysis: FIFA World Cup 2018







![](https://www.myuganda.co.ug/wp-content/uploads/2018/06/2018-Fifa-Worldcup-in-Russia.jpg)

Importing the required packages.

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

import os
# Print the contents of the ../input/ directory (useful to confirm the dataset file name).
print(os.listdir("../input/"))

### A Look At The Dataset
    

In [None]:
data_file = pd.read_csv('../input/FIFA 2018 Statistics.csv')

# Creating a new Feature 'Result' to depict whether the match was a {'Win':1, 'Draw':0, 'Loss':-1}.
# The rows come in consecutive pairs (the two teams of one match), so each row's
# goals are compared with the goals on the partner row of its pair. The number of
# matches is taken from the data instead of a hard-coded row count.
if len(data_file) % 2 != 0:
    raise ValueError("Expected two consecutive rows per match (an even number of rows).")
goals = data_file['Goal Scored'].to_numpy()
partner_goals = goals.reshape(-1, 2)[:, ::-1].reshape(-1)  # swap the two rows of every pair
data_file['Result'] = np.sign(goals - partner_goals).astype(float)

# Changing the categorical feature 'Man of the Match' : 'Yes', 'No' to 1, 0
data_file.rename(columns={'Man of the Match': 'MoM'}, inplace=True)
data_file['MoM'] = data_file['MoM'].map({'Yes': 1, 'No': 0})

data_file.head()  #displays the first 5 rows of the dataset

In [None]:
# Column dtypes and non-null counts of the loaded frame.
data_file.info()

In [None]:
# Missing values per feature, most incomplete feature first.
missing_counts = data_file.isnull().sum().sort_values(ascending=False)
missing_df = pd.DataFrame({
    'Feature': missing_counts.index,
    'Count of Missing Values': missing_counts.values,
})
missing_df['Percentage of Missing Values'] = missing_df['Count of Missing Values'] * 100 / len(data_file)
missing_df

In [None]:
# Replace missing values in the own-goal and first-goal columns with 0
# (presumably a missing entry means the event did not happen - confirm against the data).
# Assigning the result back avoids `df[col].fillna(..., inplace=True)`, which acts on a
# temporary column object and does not reliably update the frame under Copy-on-Write.
zero_fill_cols = ['Own goals', 'Own goal Time', '1st Goal']
data_file[zero_fill_cols] = data_file[zero_fill_cols].fillna(0)

#### **Understanding the Features**

1. **Date**: Match Date.
2. **Team**: Playing Team.
3. **Opponent**: Opponent Team.
4. **Goal Scored**: Number of goals scored by playing team.
5. **Ball Possession %**: Amount of time the ball was in control of the playing team.
6. **Attempts**: Number of attempts to score a goal.
7. **On-Target**: Number of shots that were on-target.
8. **Off-Target**: Number of shots that were off-target.
9. **Blocked**: Number of opponent team's shots blocked.
10. **Corners**: Number of corner shots.
11. **Offsides**: Number of offside plays.
12. **Free Kicks**: Number of free kicks used.
13. **Saves**: Number of saves by the goal keeper.
14. **Pass Accuracy %**: Percentage of passes that reached the same team player as aimed.
15. **Passes**: Total number of passes by the team.
16. **Distance Covered (Kms)**: Total distance covered by the team members in this game.
17. **Fouls Committed**: Number of fouls committed by the team members.
18. **Yellow Card**: Number of Yellow warning received.
19. **Yellow & Red**: Number of Yellow & Red warning received.
20. **Red**: Number of Red cards received.
21. **Man of the Match**: Did this team member win Man of the Match?
22. **1st Goal**: When did the team score the 1st goal?
23. **Round**: Stage of the match
24. **PSO**: Was there a penalty shootout (PSO) in this match?
25. **Goals in PSO**: Number of goals scored in the Penalty shootout.
26. **Own goals**: Number of own goals.
27. **Own goal Time**: When did the team score own goal?

####  Detecting Any Outliers 

In [None]:
# Box plots of the count-style features on one shared axis, to spot outliers visually.
var1 = [
    'Goal Scored', 'On-Target', 'Corners', 'Attempts',
    'Free Kicks', 'Yellow Card', 'Red', 'Fouls Committed',
]
dummy_data = data_file[var1]

plt.figure(figsize=(30, 20))
sns.boxplot(data=dummy_data)
plt.title('Detecting Outliers')
plt.show()

In [None]:
def find_outliers(x):
    """Flag the entries of a Series that fall outside its 1.5 * IQR fences.

    The lower fence is the first quartile minus 1.5 interquartile ranges and
    the upper fence is the third quartile plus 1.5 interquartile ranges.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to screen.

    Returns
    -------
    tuple of (list, list)
        The index labels of the flagged entries and the values stored at
        those labels.
    """
    lower_quartile, upper_quartile = x.quantile(.25), x.quantile(.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread

    is_outlier = (x < lower_fence) | (x > upper_fence)
    outlier_indices = list(x.index[is_outlier])
    outlier_values = list(x[outlier_indices])
    return outlier_indices, outlier_values

In [None]:
num_cols = [
    'Goal Scored', 'Ball Possession %', 'Attempts', 'On-Target', 'Off-Target',
    'Blocked', 'Corners', 'Offsides', 'Free Kicks',
    'Saves', 'Pass Accuracy %', 'Passes', 'Distance Covered (Kms)', 'Fouls Committed',
    'Yellow Card', 'Yellow & Red', 'Red', 'MoM', '1st Goal', 'Goals in PSO',
    'Own goals', 'Own goal Time', 'Result',
]

# Report the outliers of every numeric feature and keep their row labels
# (one list per feature, in the order of num_cols). Nothing is dropped.
master_indices = []
for column in num_cols:
    indices, vals = find_outliers(data_file[column])
    master_indices.append(indices)
    print("Outliers for ", column)
    print("Total = ", len(vals))
    print(np.sort(vals))

The outliers haven't been removed as the dataset is small.

#### **Summary of the dataset**

In [None]:
# Summary statistics of the numeric columns.
data_file.describe()

### **Insights Into The Data**

#### **Univariate Analysis**

In [None]:
# Total goals per team, ordered from the lowest to the highest scorer.
Goals_sorted_sum = (
    data_file.groupby('Team')['Goal Scored']
    .sum()
    .reset_index()
    .sort_values(by='Goal Scored', ascending=True)
)

plt.figure(figsize=(30, 10))
sns.set(font_scale=3.5)
plot1 = sns.barplot(x='Team', y='Goal Scored', data=Goals_sorted_sum)

# Vertical team names keep the tick labels readable.
plot1.set_xticklabels(Goals_sorted_sum['Team'], rotation=90, ha="center")
plot1.set(xlabel='Teams', ylabel='Goals Scored')
plot1.set_title('Goals Scored by Teams')
plt.show()

Belgium scored the most goals in the tournament.

#### Features Distribution

In [None]:
def plot_cols(arr):
    """Draw per-outcome density curves for each listed feature.

    One subplot is created per column name in ``arr``. Each subplot holds
    three kernel density estimates of that column of the global ``data_file``:
    rows with ``Result == 1`` (wins), ``Result == -1`` (losses) and
    ``Result == 0`` (draws).

    Parameters
    ----------
    arr : sequence of str
        Names of the ``data_file`` columns to plot.
    """
    n_cols = 3
    # Ceiling division sizes the grid to the input; the previous fixed 4x3 grid
    # raised for more than 12 features. Up to 12 features keep the 4x3 layout.
    n_rows = max(4, -(-len(arr) // n_cols))
    outcomes = [(1, 'wins'), (-1, 'losses'), (0, 'draws')]

    plt.figure(figsize=(40, 40))
    for position, column in enumerate(arr, start=1):
        plt.subplot(n_rows, n_cols, position)
        plt.title(column)
        for result_code, label in outcomes:
            subset = data_file.loc[data_file['Result'] == result_code, column]
            subset.plot(kind='kde', legend=True, label=label)
        

In [None]:
# Density curves (wins / losses / draws) for the main match statistics.
plot_cols([
    'Attempts', 'Goal Scored', 'Off-Target',
    'On-Target', 'Fouls Committed', 'Offsides',
    'Ball Possession %', 'Passes', 'Corners',
    'Pass Accuracy %', 'Distance Covered (Kms)', 'Saves',
])

From the distributions above, it can be seen that teams which won tend to have more Attempts and Off-Target shots, higher Pass Accuracy and more Saves.

## Bivariate Analysis
### Correlation Plot

In [None]:
# Features whose pairwise correlations are shown in the heat map.
sub = [
    'Attempts', 'Goal Scored', 'Off-Target', 'On-Target', 'Fouls Committed',
    'Offsides', 'Ball Possession %', 'Passes', 'Corners', 'MoM',
    'Pass Accuracy %', 'Distance Covered (Kms)', 'Saves',
]
data = data_file[sub]

cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.set(font_scale=3.5)
plt.subplots(figsize=(35, 35))
sns.heatmap(
    data.corr(),
    annot=True,  # print the correlation coefficient in every cell
    cmap=cmap,
    linewidths=0.1,
    linecolor='white',
    square=True,
)
plt.title("Correlations Among Features", y=1.03, fontsize=20)
plt.show()

From the Heat Map above, it can be inferred that: 

There is a correlation between **Ball Possession and Passes**. This is true as the more number of passes made by a team means that the ball is in their possession.

There is a correlation between **Number of Attempts and number of On-Target and Off-Target shots**. This correlation seems logical, the more number of attempts at scoring a goal will lead to more number of shots.

**Ball Possession and Pass Accuracy %** also seem to be correlated: teams with more possession of the ball have passed it more. Note that more passes do not by themselves guarantee a higher Pass Accuracy %; a plausible reading is that teams which pass accurately keep the ball for longer.

There also seems to be a correlation between **number of attempts and number of off-targets and on-targets** but those are similar so we discount that inference.

There seems to be a relation between **Ball Possession and Pass Accuracy**.

There is a correlation between **number of attempts and Corners**. This seems logical as a corner kick is the method of restarting play when the ball goes out of play over the goal line, without a goal being scored, and having last been touched by a member of the defending team. The kick is taken from the corner of the field of play nearest to where it went out. So if attempts were made which weren't successful then it would lead to more number of corner kicks.





### Relationship Between Ball Possession % and Passes

In [None]:
# One point per team per match: possession share against number of passes.
plt.figure(figsize=(15, 15))
plt.scatter(data_file["Ball Possession %"], data_file["Passes"])
plt.xlabel("Ball Possession %")
plt.ylabel("Passes")
plt.title('Relationship Between Ball Possession % and Passes')
plt.show()

**The above scatter plot shows a very clean linear relationship between Ball Possession % and Passes. Thus, this hypothesis is validated: the more passes a team makes, the longer the ball is in its possession.**

### Relationship between Attempts and Corners

In [None]:
# One point per team per match: attempts on goal against corners won.
plt.figure(figsize=(15, 15))
plt.scatter(data_file["Attempts"], data_file["Corners"])
plt.xlabel("Attempts")
plt.ylabel("Corners")
plt.title("Relationship between Attempts and Corners")
plt.show()

**This scatter plot is not very clean and shows high variance, but the two features do have a roughly linear relationship.**

### Fair Play
Penalty Cards received by players of each team.
A Yellow Card is received by a player in order to discipline them.
Receiving two yellow cards in one game results in a Red Card, which dismisses the player from the field for the remainder of the match and may ban them from playing the subsequent match.

#### Total Yellow Cards Received

In [None]:
# Total yellow cards per team, fewest first.
yellow_cards_data = (
    data_file.groupby('Team')['Yellow Card']
    .sum()
    .reset_index()
    .sort_values(by='Yellow Card', ascending=True)
)

plt.figure(figsize=(16, 10), facecolor=None)
sns.set_style("darkgrid")
sns.set(font_scale=1.5)
plot1 = sns.barplot(x="Team", y="Yellow Card", data=yellow_cards_data)

# Vertical team names keep the tick labels readable.
plot1.set_xticklabels(yellow_cards_data['Team'], rotation=90, ha="center")
plot1.set(xlabel='Teams', ylabel='Total yellow cards')
plot1.set_title('Total yellow cards')

#### Total Red Cards Received
As was seen in the initial data analysis, only players from two teams received Red Cards: Colombia and Switzerland. Each of them got 1 red card.

### Univariate Analysis

Total Man of the Match won by each team

In [None]:
# Number of Man of the Match awards per team, fewest first.
mom_data = (
    data_file.groupby('Team')['MoM']
    .sum()
    .reset_index()
    .sort_values(by='MoM', ascending=True)
)

plt.figure(figsize=(17, 10), facecolor=None)
sns.set_style("darkgrid")
sns.set(font_scale=1.5)
plot1 = sns.barplot(x="Team", y="MoM", data=mom_data)

# Vertical team names keep the tick labels readable.
plot1.set_xticklabels(mom_data['Team'], rotation=90, ha="center")
plot1.set(xlabel='Teams', ylabel='Total Man of the Matches')
plot1.set_title('Most Man of the Match awards')

Here it can be seen that Australia, Saudi Arabia, Costa Rica, Panama and Morocco won zero Man of the Match awards. Most were won by France, the team which eventually went on to win the World Cup.

### RADAR CHART
This chart compares the performance of the two finalists, France and Croatia, throughout the tournament.

In [None]:
from math import pi

# One radar axis per feature. 'Blocked' used to be listed twice, which drew a
# duplicated axis on the chart.
categories = np.array(['Fouls Committed', 'Ball Possession %', 'Goal Scored', 'Attempts', 'Blocked', 'On-Target', 'Off-Target', 'Corners', 'Offsides', 'Saves', 'Distance Covered (Kms)', 'MoM'])

# Tournament totals per team. The columns are selected with a list because
# tuple-style selection on a groupby is rejected by recent pandas versions.
features_cumulative = data_file.groupby('Team')[list(categories)].sum().reset_index()

# Rescale the features so that they all fit on the shared 0-130 radial axis.
features_cumulative['Distance Covered (Kms)'] = features_cumulative['Distance Covered (Kms)'] / 10
# Possession becomes a per-match average. The divisor is each team's own number
# of matches rather than a hard-coded 7 (presumably the finalists' match count).
matches_played = features_cumulative['Team'].map(data_file.groupby('Team').size())
features_cumulative['Ball Possession %'] = features_cumulative['Ball Possession %'] / matches_played
features_cumulative['MoM'] = features_cumulative['MoM'] * 10


def team_polygon(team):
    """Return the radar values of `team`, with the first value repeated to close the outline."""
    # Look the team up by name instead of relying on its row position.
    values = features_cumulative.loc[features_cumulative['Team'] == team, categories].iloc[0].values
    return np.concatenate((values, [values[0]]))


# Setting an angle (we divide the plot / number of variable)
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False)
angles = np.concatenate((angles, [angles[0]]))  # repeat the first angle to close the outline

# Initialising the radar plot
plt.figure(figsize=(12, 12))
plot1 = plt.subplot(111, polar=True)

# Put the first axis at the top and go clockwise.
plot1.set_theta_offset(pi / 2)
plot1.set_theta_direction(-1)

# One tick per feature: the closing angle is excluded so that the number of
# tick positions matches the number of labels.
plt.xticks(angles[:-1], categories, size=20)

# Draw ylabels
plot1.set_rlabel_position(0)
radial_ticks = list(range(10, 140, 10))
plt.yticks(radial_ticks, [str(tick) for tick in radial_ticks], color="grey", size=15)
plt.ylim(0, 130)

#France plot
stats = team_polygon('France')
plot1.plot(angles, stats, linewidth=1, linestyle='solid', label="France")
plot1.fill(angles, stats, 'b', alpha=0.1)

#Croatia plot
stats = team_polygon('Croatia')
plot1.plot(angles, stats, linewidth=1, linestyle='solid', label="Croatia")
plot1.fill(angles, stats, 'r', alpha=0.1)  #alpha sets the opacity

plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1), prop={'size': 25})

**From this radar chart it can be observed that, cumulatively, Croatia committed more fouls than France.
Croatia also made more attempts and eventually scored more goals than France.
It seems Croatia played in an offensive manner while France played in a defensive manner.**



## Man of the Match Prediction


#### Encoding the Categorical Variables

In [None]:
# Preview of the frame before the categorical columns are encoded.
data_file.head()

In [None]:
# One-hot encode each categorical column; the first level of every column is
# dropped and the new columns are prefixed with the source column name.
Team, Opponent, Round, PSO = (
    pd.get_dummies(data_file[column], prefix=column, drop_first=True)
    for column in ['Team', 'Opponent', 'Round', 'PSO']
)

# Swap the raw categorical columns (and 'Date') for the encoded ones.
data_file = pd.concat(
    [data_file.drop(columns=['Team', 'Opponent', 'Round', 'PSO', 'Date']), Team, Opponent, Round, PSO],
    axis=1,
)

In [None]:
# Preview of the frame after one-hot encoding.
data_file.head()

### Features Importance
Using Permutation Importance
For this we follow these steps:
1. Get a trained model
2. Shuffle the values in a single column, make predictions using the resulting dataset. Use these predictions and the true target values to calculate how much the loss function suffered from shuffling. That performance deterioration measures the importance of the variable you just shuffled.
3. Return the data to the original order (undoing the shuffle from step 2) and repeat step 2 with the next column in the dataset, until we have calculated the importance of each column.

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, random_state=None)

# X is the feature set and y is the target
y = data_file['MoM']
X = data_file.drop('MoM', axis=1)

# Only the last of the five stratified folds is kept: it becomes the hold-out
# test set, and the remaining four folds form the training set.
train_index, test_index = list(skf.split(X, y))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]


In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed seed makes the fitted tree, and therefore the importances below,
# reproducible across kernel restarts.
DTree_Model = DecisionTreeClassifier(random_state=1)
DTree_Model.fit(X_train, y_train)

import eli5
from eli5.sklearn import PermutationImportance

# Importances are measured on the held-out fold: each feature is shuffled in
# turn and the resulting drop in score is recorded.
perm = PermutationImportance(DTree_Model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. 

The values towards the top are the most important features, and those towards the bottom matter least.

The first number in each row shows how much model performance decreased with a random shuffling. The number after the ± measures how performance varied from one-reshuffling to the next.

You'll occasionally see negative values for permutation importances. In those cases, the predictions on the shuffled data happened to be more accurate than the real data. This happens when the feature didn't matter (should have had an importance close to 0), but random chance caused the predictions on shuffled data to be more accurate. 

Here the Goal Scored is the most important feature, and that seems logical.


## Using Machine Learning Models to Predict Man of the Match

### Data Preparation
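The data has already been split into training and test sets: of the five stratified folds produced by `StratifiedKFold`, the last one is held out as the test set and the other four form the training set.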

### Using The Decision Tree Classifier

Decision tree is a type of supervised learning algorithm.
It works for both categorical and continuous input and output variables. In this technique, we split the sample into two or more homogeneous sets based on most significant splitter / differentiator in input variables.

A decision tree is drawn upside down with its root at the top. An internal node is actually a condition based on which the tree splits into branches/ edges. The end of the branch that doesn’t split anymore is the decision/leaf.

In [None]:
# A fixed seed keeps the tree (and every metric derived from it) reproducible.
DTree_Model = DecisionTreeClassifier(random_state=1)
DTree_Model.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

# Mean and spread of the 10-fold cross-validated accuracy on the training set.
accuracies = cross_val_score(estimator=DTree_Model, X=X_train, y=y_train, cv=10)
dt_model_accuracy = accuracies.mean()
dt_model_standard_deviation = accuracies.std()
print("model accuracy", dt_model_accuracy)
print("model standard deviation", dt_model_standard_deviation)

In [None]:
# Confusion matrix on the held-out fold: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, DTree_Model.predict(X_test))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score the positive class once and use it for both the curve and the AUC, so
# the area shown in the legend describes the plotted curve. (Computing the AUC
# from hard 0/1 predictions can give a different number.)
positive_scores = DTree_Model.predict_proba(X_test)[:, 1]
area_under_curve = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.figure()
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % area_under_curve)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

Recall : Proportion of correct positive classification (True positives) from cases that are actually positive. <br>
Recall = TP/(TP+FN) = True Positive/ Actual Positives<br>
<br>
Precision : Proportion of correct positive classification (True positives) from cases that are predicted as positive. <br>
Precision = TP/(TP+FP) = True Positive/ Predicted Positives <br>
<br>
F1-Score : The F1 score conveys the balance between the precision and the recall. It tells you how precise your classifier is (how many instances it classifies correctly), as well as how robust it is (it does not miss a significant number of instances).<br>
F1-Score = 2x((precision x recall)/(precision+recall))



In [None]:
from sklearn import metrics
# Per-class precision, recall and F1 of the decision tree on the held-out fold.
predicted = DTree_Model.predict(X_test)
print(metrics.classification_report(y_test, predicted))

### Using The Random Forest Classifier

Random Forest is a supervised learning algorithm. It is an ensemble of Decision Trees built with the bagging method: each tree is trained on a random sample of the data, and combining the predictions of many such trees usually gives a better overall result than any single tree.


In [None]:
# A fixed seed keeps the forest (and every metric derived from it) reproducible.
RForest_Model = RandomForestClassifier(random_state=1)
RForest_Model.fit(X_train, y_train)

# Mean and spread of the 10-fold cross-validated accuracy on the training set.
accuracies = cross_val_score(estimator=RForest_Model, X=X_train, y=y_train, cv=10)
rf_model_accuracy = accuracies.mean()
rf_model_standard_deviation = accuracies.std()
print("model accuracy", rf_model_accuracy)
print("model standard deviation", rf_model_standard_deviation)

In [None]:
# Confusion matrix on the held-out fold: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, RForest_Model.predict(X_test))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score the positive class once and use it for both the curve and the AUC, so
# the area shown in the legend describes the plotted curve. (Computing the AUC
# from hard 0/1 predictions can give a different number.)
positive_scores = RForest_Model.predict_proba(X_test)[:, 1]
area_under_curve = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.figure()
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % area_under_curve)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn import metrics
# Per-class precision, recall and F1 of the random forest on the held-out fold.
predicted = RForest_Model.predict(X_test)
print(metrics.classification_report(y_test, predicted))

### Using The Gradient Boosting Classifier

The idea of boosting came out of the idea of whether a weak learner can be modified to become better.
A weak hypothesis or weak learner is defined as one whose performance is at least slightly better than random chance.

We go through cycles that repeatedly build new models and combine them into an ensemble model. We start each cycle by calculating the errors for each observation in the dataset. We then build a new model to predict those errors, and add the predictions from this error-predicting model to the "ensemble of models."

To make a prediction, we add the predictions from all previous models. We can use these predictions to calculate new errors, build the next model, and add it to the ensemble.



In [None]:
from xgboost import XGBClassifier

GBoost_Model = XGBClassifier(learning_rate=0.05, n_estimators=100, max_depth=4)
GBoost_Model.fit(X_train, y_train)

# Mean and spread of the 10-fold cross-validated accuracy on the training set.
accuracies = cross_val_score(estimator=GBoost_Model, X=X_train, y=y_train, cv=10)
xgb_model_accuracy, xgb_model_standard_deviation = accuracies.mean(), accuracies.std()
print("model accuracy", xgb_model_accuracy)
print("model standard deviation", xgb_model_standard_deviation)

In [None]:
# Confusion matrix on the held-out fold: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, GBoost_Model.predict(X_test))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score the positive class once and use it for both the curve and the AUC, so
# the area shown in the legend describes the plotted curve. (Computing the AUC
# from hard 0/1 predictions can give a different number.)
positive_scores = GBoost_Model.predict_proba(X_test)[:, 1]
area_under_curve = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.figure()
plt.plot(fpr, tpr, label='XGboost (area = %0.2f)' % area_under_curve)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn import metrics
# Per-class precision, recall and F1 of the gradient boosting model on the held-out fold.
predicted = GBoost_Model.predict(X_test)
print(metrics.classification_report(y_test, predicted))

### Using SVC

-SVM can be used to do binary classification <br>
-SVM finds a hyper-plane (line in 2d, plane in 3d, etc) that separates its training data in such a way that the distance between the hyper plane and the closest points from each class is maximized. <br>
-once SVM finds this hyper-plane, you can classify new data points by seeing which side of this hyper-plane they land on <br>
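-A hard-margin linear SVM requires data that is linearly separable (i.e. a hyper-plane can be drawn between the two groups); soft margins and kernels (such as the RBF kernel that `SVC()` uses by default) extend SVMs to data that is not linearly separable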

#### y = w<sup>T</sup>x + b
y : classification label <br>
w : parameters of the plane <br>
x : feature vector of the data point <br>
b : moves hyperplane in and out of the origin

The hyperplane is also known as the decision boundary: it separates the two classes of data while being as far away as possible from the closest points of both classes.
Equation for the hyperplane is <br>w<sup>T</sup>x + b = 0
![](https://cdn-images-1.medium.com/max/1600/1*TudH6YvvH7-h5ZyF2dJV2w.jpeg)

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training fold only; the same transform is then
# applied to both the training and the test fold.
scaler = StandardScaler().fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Preview of the standardised training features.
X_train_scaled.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Classifier fitted on the scaled training fold; used for the hold-out
# evaluation in the cells below.
svc_cls = SVC()
svc_cls.fit(X_train_scaled, y_train)
print('accuracy score', accuracy_score(y_test, svc_cls.predict(X_test_scaled)))

# The other models are summarised by their mean 10-fold cross-validated accuracy
# on the training set, so the SVC is scored the same way to keep the final
# comparison like-for-like. Scaling lives inside the pipeline so that it is
# re-fitted within every cross-validation split.
accuracies = cross_val_score(estimator=make_pipeline(StandardScaler(), SVC()), X=X_train, y=y_train, cv=10)
svc_accuracy = accuracies.mean()
svc_standard_deviation = accuracies.std()
print("model accuracy", svc_accuracy)
print("model standard deviation", svc_standard_deviation)

In [None]:
# Confusion matrix on the held-out fold: rows are true classes, columns are predicted classes.
print('confusion matrix')
print(confusion_matrix(y_test, svc_cls.predict(X_test_scaled)))

In [None]:
from sklearn import metrics
# Per-class precision, recall and F1 of the SVC on the (scaled) held-out fold.
predicted = svc_cls.predict(X_test_scaled)
print(metrics.classification_report(y_test, predicted))

### Plotting the Accuracy Percentage of all the Classifiers

In [None]:
plt.figure(figsize=(25, 10))

# Accuracy of every model as a percentage, in the order of the bar labels below.
arr = np.array([dt_model_accuracy, rf_model_accuracy, xgb_model_accuracy, svc_accuracy]) * 100
ser = pd.Series(arr)

report_names = ['Decision Tree', 'Random Forest', 'Gradient Boost Classifier', 'Support Vector Classifier']
for position, model_name in enumerate(report_names):
    print('Accuracy for ' + model_name, arr[position])

sns.set(font_scale=2.2)
sns.set_style("darkgrid")
Models_labels = ['Decision Trees', 'Random Forest', 'Gradient Boosting Classifier', 'Support Vector Classifier']
plot1 = sns.barplot(x=Models_labels, y=ser)

plot1.set_xticklabels(Models_labels, ha="center")
plot1.set(xlabel='Models', ylabel='Accuracy')
plot1.set_title('Accuracy % of Each Model')
plt.show()