In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [2]:
# Remote CSV that the rest of the notebook works on; it contains the
# 'Position' column used later as the classification target.
URL ="https://raw.githubusercontent.com/Aditya1001001/English-Premier-League/master/pos_modelling_data.csv"

# Read the CSV straight from the URL (needs network access), then print the
# column names, dtypes and non-null counts as a quick schema check.
data= pd.read_csv(URL)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793 entries, 0 to 1792
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Position             1793 non-null   object 
 1   Clean sheets         1793 non-null   float64
 2   Goals conceded       1793 non-null   float64
 3   Tackles              1793 non-null   float64
 4   Tackle success %     1793 non-null   int64  
 5   Blocked shots        1793 non-null   float64
 6   Interceptions        1793 non-null   float64
 7   Clearances           1793 non-null   float64
 8   Recoveries           1793 non-null   float64
 9   Successful 50/50s    1793 non-null   float64
 10  Own goals            1793 non-null   float64
 11  Assists              1793 non-null   int64  
 12  Passes               1793 non-null   int64  
 13  Passes per match     1793 non-null   float64
 14  Big chances created  1793 non-null   float64
 15  Crosses              1793 non-null   f

In [3]:
# Preview the first rows to sanity-check the parsed values.
data.head()

Unnamed: 0,Position,Clean sheets,Goals conceded,Tackles,Tackle success %,Blocked shots,Interceptions,Clearances,Recoveries,Successful 50/50s,...,Shots,Shooting accuracy %,Saves,Penalties saved,age,value_eur,overall,Arial Saves,Duels %,Aerial battles %
0,Midfielder,0.0,0.0,4.0,100,0.0,1.0,0.0,9.0,4.0,...,2.0,50,0.0,0.0,21,4400000,72,0.0,46.153846,25.0
1,Defender,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0.0,22,10500000,77,0.0,0.0,0.0
2,Forward,0.0,0.0,10.0,0,11.0,1.0,19.0,0.0,0.0,...,42.0,36,0.0,0.0,19,7500000,73,0.0,0.0,0.0
3,Midfielder,0.0,0.0,9.0,56,3.0,9.0,14.0,40.0,12.0,...,10.0,30,0.0,0.0,31,4800000,74,0.0,55.384615,58.333333
4,Midfielder,0.0,0.0,22.0,59,5.0,14.0,0.0,58.0,6.0,...,11.0,18,0.0,0.0,28,0,83,0.0,40.869565,36.666667


In [4]:
# Separate the target label ('Position') from the predictor columns.
y = data['Position']
X = data.drop(columns=['Position'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [5]:
# Baseline: a random forest trained on every available feature.
# fit() returns the estimator itself, so construction and training are chained.
rf_all_features = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    random_state=1,
).fit(X_train, y_train)

# Test-set accuracy of the baseline (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=rf_all_features.predict(X_test))

0.7298050139275766

In [9]:
# Compatibility shim: point `np.int`, `np.float` and `np.bool` at the Python
# builtins before BorutaPy runs.
# NOTE(review): presumably required because the installed boruta release still
# references these aliases, which newer NumPy versions no longer provide -
# confirm, and drop the shim once boruta is upgraded.
for alias, builtin_type in (("int", int), ("float", float), ("bool", bool)):
    setattr(np, alias, builtin_type)

# Random forest that Boruta uses to score real features against shuffled
# "shadow" copies. n_estimators='auto' asks BorutaPy to choose the tree count
# itself (see the BorutaPy documentation).
rfc = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)
boruta_selector = BorutaPy(
    estimator=rfc,
    n_estimators='auto',
    verbose=2,
    random_state=1,
)

# The selector is handed plain NumPy arrays rather than the pandas objects.
boruta_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	29
Tentative: 	5
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	2
Iteration: 	10 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	2
Iteration: 	11 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	2
Iteration: 	12 / 100
Confirmed: 	30
Tentative: 	2
Rejected: 	2
Iteration: 	13 / 100
Confirmed: 	30
Tentative: 	2
Rejected: 	2
Iteration: 	14 / 100
Confirmed: 	30
Tentative: 	2
Rejected: 	2
Iteration: 	15 / 100
Confirmed: 	30
Tentative: 	2
Rejected: 	2
Iteration: 	16 / 100
Confirmed: 	30
Tentative: 	2
Rejected: 	2
I

In [10]:
# ranking_ holds one rank per feature, in the same order as the training
# columns; per the BorutaPy docs, rank 1 marks a confirmed feature and rank 2
# a tentative one, with larger ranks for rejected features.
print("ranking:", boruta_selector.ranking_)
# n_features_ is the number of features Boruta confirmed.
print("no of significant features:",boruta_selector.n_features_)

ranking: [1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 2 1 1 1 1 1 1]
no of significant features: 31


In [12]:
# Pair every training column with its Boruta rank. Despite the variable's
# name, this table lists all features, not only the selected ones.
selected_rf_features = pd.DataFrame(
    {
        'Features': X_train.columns.tolist(),
        'Ranking': boruta_selector.ranking_,
    }
)

# Display the table ordered by rank, best (1) first.
selected_rf_features.sort_values('Ranking')

Unnamed: 0,Features,Ranking
0,Clean sheets,1
31,Arial Saves,1
30,overall,1
29,value_eur,1
28,age,1
26,Saves,1
25,Shooting accuracy %,1
24,Shots,1
23,Goals per match,1
22,Goals,1


In [17]:
# Reduce both splits to the columns the fitted Boruta selector kept, applying
# the same transform to the training data and then the test data.
X_imp_train, X_imp_test = (
    boruta_selector.transform(np.array(split)) for split in (X_train, X_test)
)

In [18]:
# Same forest hyper-parameters as the all-features model, now trained only on
# the Boruta-selected columns so the two test accuracies are comparable.
rf_boruta = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    random_state=1,
)
rf_boruta.fit(X=X_imp_train, y=y_train)

In [20]:
# Test-set accuracy of the forest trained on the Boruta-selected features.
accuracy_score(y_true=y_test, y_pred=rf_boruta.predict(X_imp_test))

0.7325905292479109