In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (train_test_split, cross_val_score, 
                                     GridSearchCV, RandomizedSearchCV)
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


In [3]:
# Load the raw stats table; the path is relative to the notebook's working directory.
# `raw_df` is not modified afterwards in this notebook - later cells derive new frames from it.
raw_df = pd.read_csv('stats.csv')

In [4]:
# Working frame without the player name and id columns.
# `drop` returns a new DataFrame, so `raw_df` itself is left untouched.
dropped_columns = ['last_name, first_name', 'player_id']
df = raw_df.drop(columns=dropped_columns)

CREATING A CLASSIFICATION PROBLEM 
BINNING OPS 
An OPS scale

According to https://en.wikipedia.org/wiki/On-base_plus_slugging:
"Bill James, in his essay titled 'The 96 Families of Hitters'[4], uses seven different categories for classification by OPS:"

| Category | Classification	| OPS range       |
|----------|----------------|-----------------|
|A         |	Great	    |.9000 and higher |
|B         |	Very good	|.8334 to .8999   |
|C         |Above average   |.7667 to .8333   |
|D         |	Average	    | .7000 to .7666  |
|E         |Below average	|.6334 to .6999   |
|F         |	Poor	    |.5667 to .6333   |
|G         |	Very poor   |.5666 and lower  |




I have collapsed this scale into a binary one: each hitter is classified as either at least average (categories A-D above) or below average (categories E-G above), based on their OPS.
The categories for classification by OPS that I have created: 


| Category | Classification	| OPS range       |
|----------|----------------|-----------------|
|1         |At least Average|.7000 and higher |
|0         |Below average	|.6999  and lower |


In [5]:
# New dataset for classification; `df` is left untouched.
c_df = df.copy()

# Single cut-off between the two classes:
#   1 = at least average (OPS >= .7000)
#   0 = below average    (OPS <  .7000)
# Using one threshold instead of two separate ranges guarantees that every
# numeric OPS value lands in exactly one class. The previous ranges
# (-inf, 0.6999) and [0.7000, inf) left values in [0.6999, 0.7000) unclassified.
OPS_AVERAGE_THRESHOLD = 0.7000

ops_values = c_df['on_base_plus_slg']

# Vectorized comparison -> boolean Series -> 0/1 integer labels.
classifications = (ops_values >= OPS_AVERAGE_THRESHOLD).astype(int)

# A missing OPS cannot be compared to the threshold (NaN >= x is False), so it
# would silently become class 0. Keep the original 'Unknown' marker instead.
ops_missing = ops_values.isna()
if ops_missing.any():
    classifications = classifications.astype(object)
    classifications[ops_missing] = 'Unknown'

# Add the classifications to the dataset as a new column.
c_df['Classification'] = classifications



# Safety check: confirm that the classification frame has no missing cells
# and no fully duplicated rows before it is used for modelling.

has_missing_values = c_df.isna().to_numpy().any()
if not has_missing_values:
    print("There are no missing values in the classification dataframe c_df")
else:
    print("There are missing values in the classification dataframe c_df.")

has_duplicated_rows = c_df.duplicated().any()
if not has_duplicated_rows:
    print("There are no duplicated values in the classification dataframe c_df")
else:
    print("There are duplicated values in the classification dataframe c_df.")

# Preview the first five rows, including the new `Classification` column.
c_df.head(5)

There are no missing values in the classification dataframe c_df
There are no duplicated values in the classification dataframe c_df


Unnamed: 0,year,pa,hit,single,double,triple,home_run,k_percent,bb_percent,on_base_plus_slg,...,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,swing_percent,groundballs_percent,flyballs_percent,Classification
0,2020,231,51,37,4,0,10,22.1,10.4,0.746,...,9.7,5.8,49.7,102.655113,96.026886,31.6,47.7,42.6,21.9,1
1,2020,214,56,34,6,0,16,27.1,11.7,0.992,...,15.0,3.9,47.2,102.72368,95.933078,34.2,47.6,47.2,21.3,1
2,2020,218,61,45,10,1,5,20.6,6.0,0.772,...,5.0,6.3,36.3,100.556637,94.354591,21.1,46.6,51.2,18.8,1
3,2020,209,49,31,10,1,7,18.7,5.3,0.722,...,11.5,3.8,45.2,101.53026,95.520896,21.0,45.0,50.3,25.5,1
4,2020,213,46,26,9,3,8,18.8,8.9,0.752,...,6.5,8.4,38.3,97.982869,93.323023,20.5,46.1,45.5,26.6,1


## Split into three datasets: train, validation, test

In [6]:
# Split by season: 2020-2021 for training, 2022 for validation, 2023 for testing.
season = c_df['year']

c_train_df = c_df[season.isin([2020, 2021])]
c_val_df = c_df[season == 2022]
c_test_df = c_df[season == 2023]


## Define X_train and y_train

In [7]:
# Predictor columns used by every model in this notebook.
features = [
    'k_percent',
    'exit_velocity_avg',
    'sweet_spot_percent',
    'barrel_batted_rate',
    'solidcontact_percent',
    'hard_hit_percent',
    'avg_best_speed',
    'avg_hyper_speed',
    'whiff_percent',
    'swing_percent',
    'groundballs_percent',
    'flyballs_percent',
]

# Training design matrix and target labels.
X_train = c_train_df[features]
y_train = c_train_df['Classification']

# Class balance of the training labels.
Counter(y_train)

Counter({1: 235, 0: 39})

## Standard Scaling X

In [8]:
# Learn the per-feature mean and standard deviation from the training data only;
# the same fitted scaler is reused later for the validation data.
scaler = StandardScaler()
scaler.fit(X_train)

# Standardize the training features with the fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 0.13966454,  1.66076744,  0.65087111, ...,  0.16584048,
        -0.06522197, -0.6694329 ],
       [ 0.97013514,  0.96417306,  1.25675191, ...,  0.14548192,
         0.59189854, -0.77387892],
       [-0.10947663, -0.08071851, -1.07355887, ..., -0.05810361,
         1.16330768, -1.20907066],
       ...,
       [-0.17591428, -0.2548671 , -0.42107185, ..., -0.09882071,
         0.04905985,  0.18354291],
       [-1.35518252, -1.12561008, -1.49301481, ..., -0.3634819 ,
         2.13470321, -1.53981638],
       [ 0.48846219,  1.66076744,  0.74408354, ..., -0.77065296,
        -0.72234248,  0.75799601]])

## Addressing class imbalance by oversampling

In [9]:
# Initialize RandomOverSampler
ros = RandomOverSampler(random_state=0)

# Resample the dataset
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_train_resampled)

Counter({1: 235, 0: 235})

## Training Model

In [11]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fit on the scaled, oversampled training set.
# `LogisticRegression` is already imported in the notebook's import cell, so it
# is not re-imported here (imports belong in one place at the top).
clf = LogisticRegression()
clf.fit(X_train_resampled, y_train_resampled)

# NOTE: this is accuracy on the same data the model was fit on, so it is a
# training score, not an estimate of generalization.
print(f'The accuracy on the scaled, resampled training data is: {clf.score(X_train_resampled,y_train_resampled):.3}.')

The accuracy on the scaled, resampled training data is: 0.779.


## Defining the validation data, transforming X_val with the StandardScaler that was fit on X_train, and making predictions on the scaled X_val
## Accuracy and precision on the validation data

In [12]:
#defining X_val and y_val 
X_val = c_val_df[features]
y_true_val = c_val_df.Classification


# Transform validation data using the same scaler the scaler was fit to the trainning data now it is transformer the val data in the same way
X_val_scaled = scaler.transform(X_val)

# Make predictions on validation data
y_hat_val = clf.predict(X_val_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_true_val, y_hat_val)

# Evaluate precision
precision = precision_score(y_true_val, y_hat_val, average='weighted')
print(f'accuracy: {accuracy:.3}, precision: {precision:.3}.')


accuracy: 0.792, precision: 0.837.


## Grid Search

In [29]:
# Hyperparameter grid: regularization strength C and the solver's iteration
# budget, with the L2 penalty throughout.
parameters = [
    {
        'C': [0.01, 1, 10, 1000],
        'max_iter': [1000, 5000, 10000],
        'penalty': ['l2'],
    }
]

# NOTE(review): the search cross-validates on the already-oversampled training
# set, so copies of the same minority row can presumably end up in both the
# fitting and the scoring fold, which would make `best_score_` optimistic - confirm.
model = LogisticRegression(multi_class='multinomial')
clf = GridSearchCV(estimator=model, param_grid=parameters)
clf.fit(X_train_resampled, y_train_resampled)

print(f'clf.best_score_={clf.best_score_:.3}, clf.best_params_={clf.best_params_}')


clf.best_score_=0.791, clf.best_params_={'C': 1000, 'max_iter': 1000, 'penalty': 'l2'}


In [30]:
# Refit a logistic regression with the hyperparameters chosen by the grid
# search above. They are read from `clf.best_params_` rather than copied by hand
# from the printed output, so this cell stays correct if the grid or the data
# changes and the search picks different values.
best_params = clf.best_params_
model = linear_model.LogisticRegression(multi_class='multinomial', **best_params)
model.fit(X_train_resampled, y_train_resampled)

# Training accuracy: scored on the same data the model was fit on.
train_accuracy = model.score(X_train_resampled, y_train_resampled)
print(f'The accuracy on the scaled, resampled training data is: {train_accuracy:.3}.')

The accuracy on the scaled, resampled training data is: 0.804.


In [31]:
#defining X_val and y_val 
X_val = c_val_df[features]
y_true_val = c_val_df.Classification


# Transform validation data using the same scaler the scaler was fit to the trainning data now it is transformer the val data in the same way
X_val_scaled = scaler.transform(X_val)

# Make predictions on validation data
y_hat_val = model.predict(X_val_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_true_val, y_hat_val)

# Evaluate precision
precision = precision_score(y_true_val, y_hat_val, average='weighted')
print(f'accuracy: {accuracy:.3}, precision: {precision:.3}.')


accuracy: 0.785, precision: 0.834.


In [None]:
# NOTE(review): disabled scratch cell - every line below is commented out, so
# running this cell does nothing. Things to check before re-enabling it:
#   * `model` is fit on the 0/1 `Classification` labels, so `model.predict(x)`
#     returns class labels; these are not directly comparable to the
#     `on_base_plus_slg` values that the "actual" lines print.
#   * `xfeatures` is not defined in the visible part of this notebook -
#     presumably `features` was meant; confirm.
# row_data = raw_df.iloc[13:].copy()
# row_data = row_data
# fun = pd.DataFrame(row_data)
# fun
# x = scaler.transform(fun[features])
# x
# # model.predict(x)
# print(f'predicted = {model.predict(x)}.')
# print(f'actual = {row_data.on_base_plus_slg}.')

# # fun = fun[features]
# # fun_numeric = fun.apply(pd.to_numeric)


# # X_fun = fun_numeric[xfeatures]



# # prediction = model.predict(X_fun)
# # predicted_obps = prediction[0] 


# # actual_obps = fun_numeric['on_base_plus_slg'].iloc[0]

# # print(f'The predicted obps is {predicted_obps:.3} and the actual obps is {actual_obps}.')