In [None]:
import pandas as pd
import numpy as np


In [None]:
# Seed NumPy's global generator so the synthetic data set is reproducible.
np.random.seed(42)

# 500 integer fastball speeds drawn uniformly from [60, 100).
fast_ball_speed = np.random.randint(low=60, high=100, size=500)

# Draw a candidate 0/1 label for every row (1 with probability 0.7) ...
label_draw = np.random.choice([0, 1], size=500, p=[0.3, 0.7])
# ... and keep it only where the speed exceeds 80; all other rows get 0.
tomy_john = np.where(fast_ball_speed > 80, label_draw, 0)

In [None]:
# Collect the two generated arrays into a two-column DataFrame.
data = dict(fast_ball_speed=fast_ball_speed, tomy_john=tomy_john)
df = pd.DataFrame(data)

# Preview the first five rows as plain text.
print(df.head(5))

In [4]:
from sklearn.model_selection import train_test_split

# Target is the label column; the feature matrix is every remaining column.
y = df['tomy_john']
X = df.drop(columns='tomy_john')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Train a logistic regression classifier on the training split.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# The target is passed as a flat 1-D NumPy array.
model.fit(X=X_train, y=y_train.to_numpy().ravel())

In [6]:
# Predict labels for the held-out rows.
y_pred = model.predict(X=X_test)

# Mean accuracy of the fitted model on the test split.
score = model.score(X=X_test, y=y_test)
print(score)

0.79


In [8]:
# Repeat the hold-out evaluation with a different split seed to see how much
# the score depends on which rows end up in the test set.
from sklearn.base import clone

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=15)

# Fit an unfitted copy instead of refitting `model` in place. Otherwise `model`
# would silently become the split-2 model, and re-running the earlier scoring
# cell would evaluate it on X_test, which can share rows with X_train2.
model2 = clone(model)
model2.fit(X_train2, y_train2.values.ravel())
y_pred2 = model2.predict(X_test2)

# Kept under its own name so the first split's `score` stays available.
score2 = model2.score(X_test2, y_test2)
print(score2)

0.76


In [11]:
# Basic cross-validation

from sklearn.model_selection import cross_val_score

# cv=10 requests 10-fold cross-validation: the data is split into 10 parts and
# each part is used once as the evaluation fold.
cvs_score = cross_val_score(model, X, y.to_numpy().ravel(), cv=10)

# Summarize the per-fold scores by their mean and (population) standard deviation.
avg_score = cvs_score.mean()
std = cvs_score.std()
print('Scores: ', cvs_score, 'Average Score: ', avg_score, 'Std: ', std)

Scores:  [0.86 0.82 0.74 0.86 0.86 0.82 0.7  0.72 0.74 0.84] Average Score:  0.796 Std:  0.06053098380168622


In [12]:
# K-fold cross-validation with an explicit splitter
from sklearn.model_selection import KFold

# 15 shuffled folds; the fixed random_state keeps the fold assignment repeatable.
kf = KFold(n_splits=15, shuffle=True, random_state=42)

# Same folds, scored by accuracy.
kf_score = cross_val_score(model, X, y.to_numpy().ravel(), scoring='accuracy', cv=kf)
avg_kf_score = kf_score.mean()

# Same folds, scored by F1.
kf_score2 = cross_val_score(model, X, y.to_numpy().ravel(), scoring='f1', cv=kf)
avg_score2 = kf_score2.mean()

print('Scores: ', kf_score, 'Average Score: ', avg_kf_score)
print('Scores2: ', kf_score2, 'Average Score2: ', avg_score2)

Scores:  [0.73529412 0.85294118 0.79411765 0.85294118 0.88235294 0.81818182
 0.72727273 0.78787879 0.78787879 0.84848485 0.84848485 0.78787879
 0.66666667 0.66666667 0.87878788] Average Score:  0.7957219251336897
Scores2:  [0.66666667 0.83870968 0.72       0.8        0.8        0.72727273
 0.70967742 0.75862069 0.69565217 0.82758621 0.83870968 0.69565217
 0.47619048 0.56       0.84615385] Average Score2:  0.730726115657005


In [14]:
# Stratified k-fold cross-validation
from sklearn.model_selection import StratifiedKFold

# 10 shuffled, stratified folds; the fixed random_state keeps them repeatable.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)

skf_score = cross_val_score(model, X, y.to_numpy().ravel(), cv=skf)

# Mean and (population) standard deviation of the per-fold scores.
avg_skf_score = skf_score.mean()
srd_skf_score = skf_score.std()

print('Scores: ', skf_score, 'Average Score: ', avg_skf_score, 'Std: ', srd_skf_score)

Scores:  [0.82 0.88 0.76 0.72 0.76 0.76 0.8  0.84 0.92 0.7 ] Average Score:  0.796 Std:  0.06621178142898741


In [15]:
# Pipeline: standardize the feature, then fit logistic regression.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), LogisticRegression())

# cross_val_score below fits clones of `pipe` on each fold, so this fit does
# not influence the cross-validation scores; it only leaves `pipe` itself
# fitted on the training split.
pipe.fit(X_train, y_train.values.ravel())

# 10-fold cross-validation of the whole pipeline, so the scaler is refit
# inside every training fold.
score_pipe = cross_val_score(pipe, X, y.values.ravel(), cv=10)

# Stored under its own name: reusing `avg_kf_score` here would overwrite the
# KFold average computed in the earlier k-fold cell.
avg_pipe_score = np.average(score_pipe)

print('Scores: ', score_pipe, 'Average Score: ', avg_pipe_score)

Scores:  [0.86 0.82 0.74 0.86 0.86 0.82 0.7  0.72 0.74 0.84] Average Score:  0.796
