In [1]:
import time
start_time = time.time()  # wall-clock start; read again by the final timing cell to report total run time
import numpy as np  # numerical arrays / linear algebra
import pandas as pd  # data processing, CSV file I/O
import matplotlib.pyplot as plt  # plotting (data visualization)

In [2]:
# Relative path to the preprocessed CSV that is loaded with pd.read_csv below.
# NOTE(review): 'dataPreprocesing' is spelled with a single 's' — confirm it matches the directory on disk.
data = './dataPreprocesing/fixedData.csv'

In [3]:
import warnings

# Silence every warning for the rest of the session. This filter is global, so it
# also hides deprecation and convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

In [4]:
# Load the preprocessed CSV. The file has no header row, so pandas assigns
# integer column labels; they are replaced with descriptive names further down.
df = pd.read_csv(
    data,
    sep=',',
    header=None,
)

In [5]:
# View the dimensions of the dataset as (rows, columns)
df.shape

(338, 5)

In [6]:
# View the top 5 rows (columns still carry the default integer labels at this point)
df.head()

Unnamed: 0,0,1,2,3,4
0,218.6,11.6,13.8,4.8,0
1,245.6,13.8,11.6,4.8,1
2,206.6,15.2,17.4,4.0,0
3,233.0,17.4,15.2,6.2,1
4,208.0,15.4,16.8,3.8,0


In [7]:
# Rename columns: the CSV was read without a header, so replace the integer
# labels with descriptive names.
# presumably ACS = average combat score, K = kills, D = deaths, A = assists,
# Win = match outcome (1 = win) — TODO confirm against the data source
col_names = ['ACS','K', 'D', 'A','Win']
df.columns = col_names
df.columns

Index(['ACS', 'K', 'D', 'A', 'Win'], dtype='object')

In [8]:
# Preview the first rows again to confirm the new column names were applied
df.head()

Unnamed: 0,ACS,K,D,A,Win
0,218.6,11.6,13.8,4.8,0
1,245.6,13.8,11.6,4.8,1
2,206.6,15.2,17.4,4.0,0
3,233.0,17.4,15.2,6.2,1
4,208.0,15.4,16.8,3.8,0


In [9]:
df.info()  # summary of the dataset: column dtypes and non-null counts
# Conclusion: every column reports as many non-null values as there are rows,
# so there are no missing values in the dataset.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ACS     338 non-null    float64
 1   K       338 non-null    float64
 2   D       338 non-null    float64
 3   A       338 non-null    float64
 4   Win     338 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 13.3 KB


In [10]:
# Plain-text dump of the whole frame; pandas elides the middle rows in the printed output
print(df)

       ACS     K     D    A  Win
0    218.6  11.6  13.8  4.8    0
1    245.6  13.8  11.6  4.8    1
2    206.6  15.2  17.4  4.0    0
3    233.0  17.4  15.2  6.2    1
4    208.0  15.4  16.8  3.8    0
..     ...   ...   ...  ...  ...
333  195.0  12.2  16.8  2.4    0
334  206.0  18.0  18.8  3.8    0
335  213.0  18.8  18.0  6.6    1
336  218.6  17.0  17.0  3.8    1
337  223.4  17.0  17.0  5.8    0

[338 rows x 5 columns]


In [11]:
# Collect every column whose dtype is not the generic object dtype ('O'),
# i.e. the columns pandas stores as numbers.
numerical = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'O'
]

print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

There are 5 numerical variables

The numerical variables are : ['ACS', 'K', 'D', 'A', 'Win']


In [12]:
# Count missing values per numerical column; a zero in every row confirms nothing is missing
df[numerical].isnull().sum()

ACS    0
K      0
D      0
A      0
Win    0
dtype: int64

# Splitting the data

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Features are the first four columns (ACS, K, D, A); the target is the fifth (Win).
X = df.iloc[:, :4]
y = df.iloc[:, 4]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

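# Feature Scaling the data
##### Feature scaling is the process of normalizing or standardizing the features in a dataset. K-NN classifies by distance, so without scaling a large-valued feature such as ACS (around 200) would dominate small-valued ones such as A (around 5).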

In [14]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics influence the scaling.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [15]:
import math
# Rule of thumb: start with k near sqrt(n). The result here is ~8.25, so the
# nearest odd values are 7 and 9; an odd k avoids tied votes between the two
# classes, and 7 is the value used when the classifier is created.
# NOTE(review): n here is the test-set size; the heuristic is commonly applied
# to the training-set size — confirm len(y_test) is intended.
math.sqrt(len(y_test))

8.246211251235321

In [16]:
# Define the model: K-NN classifier with k = 7 neighbours and Euclidean distance.
# NOTE(review): p=2 is the Minkowski power that corresponds to Euclidean distance;
# with metric='euclidean' set explicitly it looks redundant — confirm before removing.
classifier = KNeighborsClassifier(n_neighbors=7, p=2 , metric='euclidean')

In [17]:
# Fit the model on the scaled training features and their labels
classifier.fit(X_train, y_train)

In [18]:
# Disabled experiment (not executed): XGBoost baseline kept for reference.
# NOTE(review): the validation DMatrix below is built from X_test/y_test, so
# enabling this as-is would validate on the same data used for the final metrics.
# import xgboost as xgb
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dval = xgb.DMatrix(X_test, label=y_test)
# params = {
#     'max_depth': 3,
#     'eta': 0.1,
#     'objective': 'binary:logistic',
#     'eval_metric': 'logloss'
# }
# model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'validation')])
# predictions = model.predict(xgb.DMatrix(X_test))


# Evaluate model using:
#### Confusion Matrix, F1 score, and accuracy score

In [19]:
# Testing the model: bring in the metric helpers used by the evaluation cells,
# then predict a label for every row of the held-out test set.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0], dtype=int64)

In [20]:
# Show a reference diagram of a confusion matrix, referenced by URL
# (presumably needs network access to render).
from IPython.display import HTML, Image

Image(url="https://www.simplilearn.com/ice9/free_resources_article_thumb/confusion-matrix.JPG")

In [21]:
# Evaluate model: confusion matrix of true labels (rows) against predicted
# labels (columns), with classes in sorted order (0 = loss, 1 = win).
# Layout: [[true negatives, false positives],
#          [false negatives, true positives]]
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[36  0]
 [ 5 27]]


In [22]:
# F1 is the harmonic mean of precision and recall, so it penalises both
# false positives and false negatives.
print(f1_score(y_test,y_pred))

0.9152542372881356


In [23]:
# Accuracy: the fraction of test samples whose predicted label matches the true label
print(accuracy_score(y_test, y_pred))

0.9264705882352942


# F1 score is ~92%
# Accuracy of the model is ~93%

In [24]:
# Report total wall-clock time since start_time was recorded in the first cell
# (the measurement therefore includes the library imports).
print("--- %s seconds ---" % (time.time() - start_time))


--- 1.7773308753967285 seconds ---
