In [76]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [77]:
import pandas as pd
tennis_data=pd.read_csv("tennis.csv")

In [78]:
tennis_data.columns

Index(['rally', 'serve', 'hitpoint', 'speed', 'net.clearance',
       'distance.from.sideline', 'depth', 'outside.sideline',
       'outside.baseline', 'player.distance.travelled', 'player.impact.depth',
       'player.impact.distance.from.center', 'player.depth',
       'player.distance.from.center', 'previous.speed',
       'previous.net.clearance', 'previous.distance.from.sideline',
       'previous.depth', 'opponent.depth', 'opponent.distance.from.center',
       'same.side', 'previous.hitpoint', 'previous.time.to.net',
       'server.is.impact.player', 'outcome', 'gender', 'ID'],
      dtype='object')

In [79]:
tennis_data.dtypes

rally                                   int64
serve                                   int64
hitpoint                               object
speed                                 float64
net.clearance                         float64
distance.from.sideline                float64
depth                                 float64
outside.sideline                         bool
outside.baseline                         bool
player.distance.travelled             float64
player.impact.depth                   float64
player.impact.distance.from.center    float64
player.depth                          float64
player.distance.from.center           float64
previous.speed                        float64
previous.net.clearance                float64
previous.distance.from.sideline       float64
previous.depth                        float64
opponent.depth                        float64
opponent.distance.from.center         float64
same.side                                bool
previous.hitpoint                 

In [80]:
 tennis_data.drop('ID',axis=1,inplace=True)

In [81]:
tennis_data

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender
0,4,1,B,35.515042,-0.021725,3.474766,6.797621,False,False,1.467570,...,2.449182,0.705435,12.5628,2.0724,True,F,0.445318,False,UE,mens
1,4,2,B,33.382640,1.114202,2.540801,2.608708,False,True,2.311931,...,0.583291,3.856600,12.3544,5.1124,False,B,0.432434,False,FE,mens
2,23,1,B,22.316690,-0.254046,3.533166,9.435749,False,False,3.903728,...,1.115250,2.908892,13.8620,1.6564,False,F,0.397538,True,FE,mens
3,9,1,F,36.837309,0.766694,0.586885,3.342180,True,False,0.583745,...,3.256695,0.557554,14.2596,0.1606,True,B,0.671984,True,UE,mens
4,4,1,B,35.544208,0.116162,0.918725,5.499119,False,False,2.333456,...,1.431146,3.945317,11.3658,1.1082,False,F,0.340411,False,W,mens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7996,4,2,F,36.693201,0.186101,0.457670,4.026966,False,False,3.068317,...,3.283354,3.106719,12.5438,1.5368,False,F,0.684150,False,W,womens
7997,5,2,B,33.505322,0.985032,2.767406,0.275569,False,True,3.747929,...,0.422930,3.674636,13.5470,0.9754,False,F,0.620857,True,UE,womens
7998,10,2,F,16.906289,0.674401,0.525043,2.005257,False,False,7.931349,...,1.897760,9.208275,11.6748,2.0146,False,B,0.966186,False,W,womens
7999,6,2,F,15.199713,0.936015,3.365840,1.459378,False,False,11.454605,...,0.421844,8.975572,6.9750,1.2324,False,B,0.887608,False,W,womens


In [82]:
X = tennis_data.drop('outcome',axis=1)
y = tennis_data['outcome']


In [83]:
cat_cols=X.select_dtypes(include=["object","category","bool"]).columns

In [84]:
num_cols = X.select_dtypes(include=['float64', 'int64']).columns

In [85]:
X[cat_cols]=X[cat_cols].astype("category")


In [86]:
y

0       UE
1       FE
2       FE
3       UE
4        W
        ..
7996     W
7997    UE
7998     W
7999     W
8000    UE
Name: outcome, Length: 8001, dtype: object

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.30, random_state=123)


In [88]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le.fit_transform(y_train)

y_train=le.transform(y_train)
y_test=le.transform(y_test)

In [89]:
numeric_transformer = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)
categorical_tranformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_tranformer, cat_cols)
    ]
)

In [90]:
RF_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [91]:
RF_pipe.fit(X_train,y_train)

In [92]:
from sklearn import set_config
set_config(display='diagram')

In [93]:
train_pred_RFPipe = RF_pipe.predict(X_train)
train_pred_RFPipe

array([1, 2, 0, ..., 1, 1, 0])

In [94]:
test_pred_RFPipe = RF_pipe.predict(X_test)
test_pred_RFPipe

array([1, 2, 2, ..., 2, 1, 1])

In [95]:
def evaluate_model(act, pred):
    from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
    print("Confusion Matrix \n", confusion_matrix(act, pred,))
    print("Accurcay : ", accuracy_score(act, pred))
    print("Recall   : ", recall_score(act, pred,average="weighted"))
    print("Precision: ", precision_score(act, pred,average="weighted"))    

In [96]:
print("--Train--")
evaluate_model(y_train, train_pred_RFPipe)
print("--Test--")
evaluate_model(y_test, test_pred_RFPipe)

--Train--
Confusion Matrix 
 [[1273    0    0]
 [   0 2450    0]
 [   0    0 1877]]
Accurcay :  1.0
Recall   :  1.0
Precision:  1.0
--Test--
Confusion Matrix 
 [[384 136  25]
 [ 88 916  47]
 [ 20  24 761]]
Accurcay :  0.8583923365264473
Recall   :  0.8583923365264473
Precision:  0.8561030794855155


In [97]:
Adaboost_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', AdaBoostClassifier())])

In [98]:
Adaboost_pipe.fit(X_train,y_train)

In [99]:
train_pred_adapipe = Adaboost_pipe.predict(X_train)
test_pred_adapipe = Adaboost_pipe.predict(X_test)

In [100]:
print("--Train--")
evaluate_model(y_train, train_pred_adapipe)
print("--Test--")
evaluate_model(y_test, test_pred_adapipe)

--Train--
Confusion Matrix 
 [[ 863  364   46]
 [ 239 2123   88]
 [  54   51 1772]]
Accurcay :  0.8496428571428571
Recall   :  0.8496428571428571
Precision:  0.8472809398880656
--Test--
Confusion Matrix 
 [[359 158  28]
 [111 895  45]
 [ 31  20 754]]
Accurcay :  0.8363182007496877
Recall   :  0.8363182007496877
Precision:  0.8334529286207462


In [101]:
gradeboost_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',GradientBoostingClassifier())])

In [102]:
gradeboost_pipe.fit(X_train,y_train)

In [103]:
train_pred_gradepipe = gradeboost_pipe.predict(X_train)
test_pred_gradepipe = gradeboost_pipe.predict(X_test)

In [104]:
print("--Train--")
evaluate_model(y_train, train_pred_gradepipe)
print("--Test--")
evaluate_model(y_test, test_pred_gradepipe)

--Train--
Confusion Matrix 
 [[1047  198   28]
 [ 111 2268   71]
 [  15   29 1833]]
Accurcay :  0.9192857142857143
Recall   :  0.9192857142857143
Precision:  0.9186019327461751
--Test--
Confusion Matrix 
 [[389 131  25]
 [ 88 919  44]
 [ 18  15 772]]
Accurcay :  0.8663057059558518
Recall   :  0.8663057059558518
Precision:  0.8638758037293539


In [105]:
from xgboost import XGBClassifier


In [106]:
xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGBClassifier())])

In [107]:
xgb = XGBClassifier()

In [108]:
xgb_pipe.fit(X_train,y_train)





In [109]:
train_pred_xgbpipe = xgb_pipe.predict(X_train)
test_pred_xgbpipe = xgb_pipe.predict(X_test)

In [110]:
print("--Train--")
evaluate_model(y_train, train_pred_xgbpipe)
print("--Test--")
evaluate_model(y_test, test_pred_xgbpipe)

--Train--
Confusion Matrix 
 [[1273    0    0]
 [   0 2450    0]
 [   0    0 1877]]
Accurcay :  1.0
Recall   :  1.0
Precision:  1.0
--Test--
Confusion Matrix 
 [[407 112  26]
 [ 91 916  44]
 [ 20  18 767]]
Accurcay :  0.8704706372344856
Recall   :  0.8704706372344856
Precision:  0.8689167411128847


In [111]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'xgbClassifier', y_train,train_pred_xgbpipe, y_test,test_pred_xgbpipe)

In [112]:
performance_columns = ['Model name', 'Train accuracy', 'Train precision', 'Train recall','Test accuracy', 'Test precision', 'Test recall']
performance_comparison = pd.DataFrame(columns=performance_columns)

In [113]:
def add_to_perform_compare_df(df, model_name, train_actual, train_predict, test_actual, test_predict):
    
    from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
    
    train_accuracy = accuracy_score(train_actual, train_predict)
    test_accuracy = accuracy_score(test_actual, test_predict)
    
    train_recall = recall_score(train_actual, train_predict,average="weighted")
    test_recall = recall_score(test_actual,test_predict,average="weighted")
    
    train_precision = precision_score(train_actual, train_predict,average="weighted")
    test_precision = precision_score(test_actual, test_predict,average="weighted")
    
    df = df.append(pd.Series([model_name, train_accuracy, train_precision, train_recall,
                              test_accuracy, test_precision, test_recall],index=df.columns),ignore_index=True)
    return df

In [114]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Random Forest', y_train, train_pred_RFPipe, y_test, test_pred_RFPipe)

In [115]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'AdaBoostClassifie', y_train,train_pred_adapipe , y_test,test_pred_adapipe )

In [116]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'GradientBoostingClassifier', y_train,train_pred_gradepipe, y_test,test_pred_gradepipe)

In [145]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'xgbClassifier', y_train,train_pred_xgbpipe, y_test,test_pred_xgbpipe)

In [146]:
performance_comparison

Unnamed: 0,Model name,Train accuracy,Train precision,Train recall,Test accuracy,Test precision,Test recall
0,Random Forest,1.0,1.0,1.0,0.858392,0.856103,0.858392
1,AdaBoostClassifie,0.849643,0.847281,0.849643,0.836318,0.833453,0.836318
2,GradientBoostingClassifier,0.919286,0.918602,0.919286,0.866306,0.863876,0.866306
3,Gridsearch,0.86,0.863081,0.86,0.842149,0.844453,0.842149
4,upsampling,1.0,1.0,1.0,0.852561,0.85495,0.852561
5,xgbClassifier,1.0,1.0,1.0,0.870471,0.868917,0.870471


In [118]:
param_grid={"n_estimators":[50,100],
           "max_depth":[1,5],
            "max_features":[3,5],
            "min_samples_leaf":[1,2,3]
           }

In [119]:
clf3=RandomForestClassifier()
from sklearn.model_selection import GridSearchCV
clf_grid=GridSearchCV(clf3,param_grid,cv=2)


In [120]:
gridsearch = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',GridSearchCV(clf3,param_grid,cv=2))])

In [121]:
gridsearch.fit(X_train,y_train)

In [122]:
train_pred_gridepipe = gridsearch.predict(X_train)
test_pred_gridepipe = gridsearch.predict(X_test)

In [123]:
print("--Train--")
evaluate_model(y_train, train_pred_gridepipe)
print("--Test--")
evaluate_model(y_test, test_pred_gridepipe)

--Train--
Confusion Matrix 
 [[ 748  466   59]
 [  82 2265  103]
 [  23   51 1803]]
Accurcay :  0.86
Recall   :  0.86
Precision:  0.8630809175028126
--Test--
Confusion Matrix 
 [[298 213  34]
 [ 36 952  63]
 [ 13  20 772]]
Accurcay :  0.8421491045397751
Recall   :  0.8421491045397751
Precision:  0.8444530898144498


In [124]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Gridsearch', y_train,train_pred_gridepipe, y_test,test_pred_gridepipe)

In [125]:
performance_comparison

Unnamed: 0,Model name,Train accuracy,Train precision,Train recall,Test accuracy,Test precision,Test recall
0,Random Forest,1.0,1.0,1.0,0.858392,0.856103,0.858392
1,AdaBoostClassifie,0.849643,0.847281,0.849643,0.836318,0.833453,0.836318
2,GradientBoostingClassifier,0.919286,0.918602,0.919286,0.866306,0.863876,0.866306
3,Gridsearch,0.86,0.863081,0.86,0.842149,0.844453,0.842149


In [126]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=123)


In [127]:
smotepipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',SMOTE())])

In [128]:
smotepipe.fit(X_train,y_train)

In [129]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [130]:
pd.value_counts(y_train)/y_train.size * 100

1    43.750000
2    33.517857
0    22.732143
dtype: float64

In [131]:
cat_attr = X_train.select_dtypes(include=['category']).columns


In [132]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(drop = 'first')
enc.fit(X_train[cat_attr])

X_train_ohe=enc.transform(X_train[cat_attr]).toarray()
X_test_ohe=enc.transform(X_test[cat_attr]).toarray()

In [133]:
num_attr = X_train.select_dtypes(include=['float64', 'int64']).columns

In [134]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train[num_attr])

X_train_std = scaler.transform(X_train[num_attr])
X_test_std = scaler.transform(X_test[num_attr])

In [135]:
X_train_con = np.concatenate([X_train_std, X_train_ohe], axis=1)
X_test_con = np.concatenate([X_test_std, X_test_ohe], axis=1)

In [136]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=123)
X_train_sm, y_train_sm = smote.fit_resample(X_train_con, y_train)

In [137]:
clf2 = RandomForestClassifier()
clf2.fit(X_train_sm, y_train_sm)

train_pred_sm = clf2.predict(X_train_sm)
test_pred_sm = clf2.predict(X_test_con)

In [138]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_sm)
print("--Test--")
evaluate_model(y_test, test_pred_sm)

--Train--
Confusion Matrix 
 [[2450    0    0]
 [   0 2450    0]
 [   0    0 2450]]
Accurcay :  1.0
Recall   :  1.0
Precision:  1.0
--Test--
Confusion Matrix 
 [[421 106  18]
 [131 879  41]
 [ 32  26 747]]
Accurcay :  0.8525614327363599
Recall   :  0.8525614327363599
Precision:  0.8549504052629635


In [139]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'upsampling', y_train_sm,train_pred_sm, y_test,test_pred_sm)

In [140]:
performance_comparison

Unnamed: 0,Model name,Train accuracy,Train precision,Train recall,Test accuracy,Test precision,Test recall
0,Random Forest,1.0,1.0,1.0,0.858392,0.856103,0.858392
1,AdaBoostClassifie,0.849643,0.847281,0.849643,0.836318,0.833453,0.836318
2,GradientBoostingClassifier,0.919286,0.918602,0.919286,0.866306,0.863876,0.866306
3,Gridsearch,0.86,0.863081,0.86,0.842149,0.844453,0.842149
4,upsampling,1.0,1.0,1.0,0.852561,0.85495,0.852561


In [141]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators = [('Random Forest',RF_pipe),('GradientBoost',gradeboost_pipe),('Adaboost',Adaboost_pipe),('upsampling',clf2),('gridsearch',gridsearch)])


In [142]:
voting_clf

In [143]:
np.unique(y_train, return_counts= True)
np.unique(y_train_sm, return_counts= True)

(array([0, 1, 2]), array([2450, 2450, 2450]))

In [147]:
performance_comparison 

Unnamed: 0,Model name,Train accuracy,Train precision,Train recall,Test accuracy,Test precision,Test recall
0,Random Forest,1.0,1.0,1.0,0.858392,0.856103,0.858392
1,AdaBoostClassifie,0.849643,0.847281,0.849643,0.836318,0.833453,0.836318
2,GradientBoostingClassifier,0.919286,0.918602,0.919286,0.866306,0.863876,0.866306
3,Gridsearch,0.86,0.863081,0.86,0.842149,0.844453,0.842149
4,upsampling,1.0,1.0,1.0,0.852561,0.85495,0.852561
5,xgbClassifier,1.0,1.0,1.0,0.870471,0.868917,0.870471
