In [1]:
import pandas as pd 

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [82]:
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
raw_path = os.path.join(BASE_DIR, "data", "raw", "500hits.csv")
df = pd.read_csv(raw_path, encoding="latin1")

In [83]:
# PLAYER - name of the player
# YRS - total years played
# G - games played
# AB - at-bats (official batting attempts)
# R - runs scored
# H - total hits (all types)
# 2B - doubles
# 3B - triples
# HR - home runs
# RBI - runs batted in
# BB - walks (base on balls)
# SO - strikeouts
# SB - stolen bases
# CS - caught stealing attempts
# BA - batting average (H/AB)
# HOF - Hall of Fame indicator (1 = inducted)

In [84]:
df = df.drop(columns=['PLAYER', 'CS'])
df.head(20)


Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366,1
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331,1
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345,1
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.31,1
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329,1
5,23,3308,11988,1816,3419,646,59,452,1844,1845,1393,168,0.285,1
6,21,2683,10835,1782,3319,605,114,234,1307,1094,1244,504,0.306,1
7,25,2826,9949,1821,3315,438,187,47,520,1499,286,744,0.333,1
8,22,2992,10881,2062,3283,523,140,660,1903,1464,1526,338,0.302,1
9,21,3026,11336,1627,3255,560,35,504,1917,1333,1516,110,0.287,1


In [85]:
df.describe()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,17.049462,2048.698925,7511.455914,1150.313978,2170.247312,380.952688,78.554839,201.049462,894.260215,783.56129,847.470968,195.905376,0.288712,0.329032
std,2.765186,354.391805,1294.065992,289.635071,424.190773,96.48346,49.36303,143.622664,486.193456,327.43195,489.224289,181.845543,0.021208,0.474928
min,11.0,1331.0,4981.0,601.0,1660.0,177.0,3.0,9.0,0.0,239.0,0.0,7.0,0.246,0.0
25%,15.0,1802.0,6523.0,936.0,1838.0,312.0,41.0,79.0,640.0,535.0,436.0,63.0,0.273,0.0
50%,17.0,1993.0,7241.0,1104.0,2076.0,366.0,67.0,178.0,968.0,736.0,825.0,137.0,0.287,0.0
75%,19.0,2247.0,8180.0,1296.0,2375.0,436.0,107.0,292.0,1206.0,955.0,1226.0,285.0,0.3,1.0
max,26.0,3308.0,12364.0,2295.0,4189.0,792.0,309.0,755.0,2297.0,2190.0,2597.0,1406.0,0.366,2.0


In [86]:
X = df.iloc[:,0:13]
y = df.iloc[:,13]

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 11, test_size = 0.2)

In [None]:
scaler = MinMaxScaler(feature_range=(0,1)) 
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [90]:
knn = KNeighborsClassifier(n_neighbors = 8)
knn.fit(X_train, y_train)


0,1,2
,n_neighbors,8
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [95]:
y_pred = knn.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1])

In [99]:
knn.score(X_test, y_test)

0.8279569892473119

In [101]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[55, 12],
       [ 4, 22]])

In [103]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87        67
           1       0.65      0.85      0.73        26

    accuracy                           0.83        93
   macro avg       0.79      0.83      0.80        93
weighted avg       0.85      0.83      0.83        93

