In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline

In [21]:
# --- Synthetic data -------------------------------------------------------
# Seed the global NumPy RNG so the two draws below are reproducible.
np.random.seed(42)

# 500 integer fastball speeds in [90, 105] (the upper bound of randint is exclusive).
fastball_speed = np.random.randint(low=90, high=106, size=500)

# One candidate outcome per pitcher: 0 with probability 0.3, 1 with probability 0.7.
surgery_draw = np.random.choice([0, 1], size=500, p=[0.3, 0.7])

# Only pitchers throwing faster than 96 keep their drawn outcome; everyone else is 0.
throws_hard = fastball_speed > 96
tommy_john = np.where(throws_hard, surgery_draw, 0)

data = dict(fastball_speed=fastball_speed, tommy_john=tommy_john)
df = pd.DataFrame(data)

print(df.head())

# Feature matrix and target are both kept as one-column DataFrames;
# the target is flattened to 1-D only at the point where it is passed to fit().
x = df[['fastball_speed']]
y = df[['tommy_john']]
LR = LogisticRegression()

# test1: 80/20 hold-out split shuffled with random_state=11
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11
)
LR.fit(x_train, y_train.to_numpy().ravel())
test1_score = LR.score(x_test, y_test)
print('\ntest1 - Score: \n', test1_score)

# test2: same model object refit on a different 80/20 split (random_state=25)
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x, y, test_size=0.2, random_state=25
)
LR.fit(x_train2, y_train2.to_numpy().ravel())
test2_score = LR.score(x_test2, y_test2)
print('\ntest2 - Score: \n', test2_score)

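''' The score changes between test1 and test2 even though only random_state differs: a single train/test split gives a high-variance estimate, which is why cross-validation is used next. '''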

# 10-fold cross-validation with an integer `cv`.
# For a classifier, an integer cv makes cross_val_score use StratifiedKFold
# (class ratios preserved per fold, no shuffling). With no `scoring` argument
# the estimator's own .score() is used, i.e. accuracy for LogisticRegression.
cross_validation = cross_val_score(LR, x, y.values.ravel(), cv=10)
print('\n\ncross_validation:', cross_validation)

# NOTE: passing cv=10 and passing an explicit KFold are NOT the same thing.
# KFold is unstratified, and here it also shuffles the rows (seeded with
# random_state=42), so the folds differ from the stratified, unshuffled ones
# above and the per-fold accuracies differ as well.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

# Same seeded splitter for both calls, so the F1 and accuracy values below
# are computed on identical folds and can be compared fold by fold.
KFscore1 = cross_val_score(LR, x, y.values.ravel(), cv=KF, scoring='f1')
KFscore2 = cross_val_score(LR, x, y.values.ravel(), cv=KF, scoring='accuracy')

print(f'''
KFscore1: {KFscore1.round(2)} \n
KFscore2: {KFscore2.round(2)}''')


   fastball_speed  tommy_john
0              96           0
1              93           0
2             102           1
3             104           1
4             100           0

test1 - Score: 
 0.71

test2 - Score: 
 0.8


cross_validation: [0.68 0.78 0.72 0.72 0.78 0.82 0.74 0.74 0.76 0.82]

KFscore1: [0.74 0.72 0.7  0.72 0.77 0.65 0.63 0.65 0.65 0.63] 

KFscore2: [0.82 0.8  0.68 0.78 0.82 0.76 0.74 0.76 0.68 0.72]


## Adding a pipeline: StandardScaler + LogisticRegression

In [22]:
# Standardize the feature, then classify, as one estimator so that the scaler
# is re-fit on the training part of every CV fold (no leakage from test folds).
Scaler = StandardScaler()

# Give the pipeline its own LogisticRegression instead of the shared `LR`.
# Pipeline steps are not cloned, so make_pipeline(Scaler, LR) followed by
# pipe1.fit(...) would refit the global `LR` on standardized inputs and leave
# it unusable on the raw `x` used elsewhere. A default LogisticRegression()
# has the same configuration `LR` was created with.
pipe1 = make_pipeline(Scaler, LogisticRegression())

# Fit on the full data so `pipe1` itself is usable for prediction afterwards.
# cross_val_score below clones the pipeline, so it does not rely on this fit.
pipe1.fit(x, y.values.ravel())

# Integer cv with a classifier -> stratified 10-fold, scored by accuracy.
cross_validation1 = cross_val_score(pipe1, x, y.values.ravel(), cv=10)
cross_validation1

array([0.68, 0.78, 0.72, 0.72, 0.78, 0.82, 0.74, 0.74, 0.76, 0.82])