### Reading the data

In [31]:
import pandas as pd

# reading the data
data = pd.read_csv('../ufcdata/data_with_location_elevation.csv', low_memory=False)
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,location_elevation
0,Minotauro Nogueira,Roy Nelson,Leon Roberts,2014-04-11,"Abu Dhabi, United Arab Emirates",Blue,False,Heavyweight,5,2.0,...,2.0,0.0,5.0,Orthodox,190.5,195.58,240.0,37.0,37.0,27.0
1,Clay Guida,Tatsuya Kawajiri,Marc Goddard,2014-04-11,"Abu Dhabi, United Arab Emirates",Red,False,Featherweight,3,0.0,...,4.0,0.0,10.0,Orthodox,170.18,177.8,155.0,35.0,32.0,27.0
2,John Howard,Ryan LaFlare,Leon Roberts,2014-04-11,"Abu Dhabi, United Arab Emirates",Blue,False,Welterweight,3,0.0,...,0.0,0.0,6.0,Orthodox,170.18,180.34,170.0,30.0,31.0,27.0
3,Ramsey Nijem,Beneil Dariush,Neil Hall,2014-04-11,"Abu Dhabi, United Arab Emirates",Red,False,Lightweight,3,0.0,...,0.0,0.0,4.0,Orthodox,180.34,190.5,155.0,24.0,26.0,27.0
4,Jared Rosholt,Daniel Omielanczuk,Marc Goddard,2014-04-11,"Abu Dhabi, United Arab Emirates",Red,False,Heavyweight,3,0.0,...,0.0,0.0,1.0,Orthodox,187.96,190.5,265.0,31.0,27.0,27.0


### Dropping useless columns and filling NaNs with 0

In [32]:
# maybe want to use the date *************
data = data.drop(columns=['Referee', 'date', 'location', 'B_fighter', 'R_fighter'])
data = data.fillna(0)

### One hot encoding

In [33]:
# weight_class, B_Stance, R_Stance
data = pd.get_dummies(data, columns=['weight_class', 'B_Stance', 'R_Stance'])
data['Winner'] = data['Winner'].apply(lambda x: 1 if x == 'Red' else 0)

### Getting features and labels

In [34]:
features = data.loc[:, data.columns != 'Winner']
features

Unnamed: 0,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,...,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_0,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,False,5,2.0,0.0,0.0,2.454545,1.454545,5.909091,2.363636,46.636364,...,1,0,0,0,0,0,1,0,0,0
1,False,3,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,10.000000,...,1,0,0,0,0,0,1,0,0,0
2,False,3,0.0,3.0,0.0,33.000000,18.333333,25.333333,12.000000,106.000000,...,0,0,1,0,0,0,1,0,0,0
3,False,3,0.0,1.0,0.0,4.000000,1.000000,1.000000,1.000000,12.000000,...,0,0,1,0,0,0,1,0,0,0
4,False,3,0.0,1.0,0.0,10.000000,7.000000,19.000000,18.000000,50.000000,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5139,False,3,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,1,0,0,0,0,0,1,0,0,0
5140,False,3,1.0,0.0,0.0,6.000000,3.000000,1.000000,1.000000,17.000000,...,1,0,0,0,0,0,1,0,0,0
5141,False,3,2.0,0.0,0.0,4.500000,3.500000,4.000000,1.833333,62.666667,...,0,0,1,0,0,0,0,0,1,0
5142,False,3,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,1,0,0,0,0,0,0,0,0,1


In [35]:
labels = data.loc[:, 'Winner']
labels

0       0
1       1
2       0
3       1
4       1
       ..
5139    1
5140    0
5141    0
5142    1
5143    0
Name: Winner, Length: 5144, dtype: int64

### LogisticRegression, LinearSVC, KNeighborsClassifier for Baseline

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
import numpy as np
import warnings
warnings.filterwarnings('ignore')

logistic_score = []
knn_score = []
svc_score = []

for seed in range(1, 6):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=seed,
                                                        test_size=0.20)

    logistic = LogisticRegression(solver='liblinear', random_state=42)
    logistic.fit(X_train, y_train)
    logistic_score.append(logistic.score(X_test, y_test))
    
    knn = KNeighborsClassifier(n_jobs=-1)
    knn.fit(X_train, y_train)
    knn_score.append(knn.score(X_test, y_test))

    svc = LinearSVC(random_state=42, max_iter=5000)
    svc.fit(X_train, y_train)
    svc_score.append(svc.score(X_test, y_test))
    
print('Logistic score:', np.mean(np.array(logistic_score)))
print('KNN score:', np.mean(np.array(knn_score)))
print('SVC score:', np.mean(np.array(svc_score)))

Logistic score: 0.6831875607385811
KNN score: 0.626044703595724
SVC score: 0.6625850340136055


### LogisticRegression, LinearSVC, KNeighborsClassifier with Scaling

In [51]:
# Min Max Scalar
from sklearn.preprocessing import MinMaxScaler

logistic_score = []
knn_score = []
svc_score = []

for seed in range(1, 6):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=seed,
                                                        test_size=0.20)
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    logistic = LogisticRegression(solver='liblinear', random_state=42)
    logistic.fit(X_train_scaled, y_train)
    logistic_score.append(logistic.score(X_test_scaled, y_test))
    
    knn = KNeighborsClassifier(n_jobs=-1)
    knn.fit(X_train_scaled, y_train)
    knn_score.append(knn.score(X_test_scaled, y_test))

    svc = LinearSVC(random_state=42, max_iter=5000)
    svc.fit(X_train_scaled, y_train)
    svc_score.append(svc.score(X_test_scaled, y_test))
    
print('Logistic score:', np.mean(np.array(logistic_score)))
print('KNN score:', np.mean(np.array(knn_score)))
print('SVC score:', np.mean(np.array(svc_score)))

Logistic score: 0.6785228377065111
KNN score: 0.6351797862001943
SVC score: 0.6793002915451894


In [52]:
# Standard Scalar
from sklearn.preprocessing import StandardScaler

logistic_score = []
knn_score = []
svc_score = []

for seed in range(1, 6):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=seed,
                                                        test_size=0.20)

    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    logistic = LogisticRegression(solver='liblinear', random_state=42, n_jobs=-1)
    logistic.fit(X_train_scaled, y_train)
    logistic_score.append(logistic.score(X_test_scaled, y_test))
    
    knn = KNeighborsClassifier(n_jobs=-1)
    knn.fit(X_train_scaled, y_train)
    knn_score.append(knn.score(X_test_scaled, y_test))

    svc = LinearSVC(random_state=42, max_iter=5000)
    svc.fit(X_train_scaled, y_train)
    svc_score.append(svc.score(X_test_scaled, y_test))
    
print('Logistic score:', np.mean(np.array(logistic_score)))
print('KNN score:', np.mean(np.array(knn_score)))
print('SVC score:', np.mean(np.array(svc_score)))

Logistic score: 0.6785228377065111
KNN score: 0.6396501457725947
SVC score: 0.681438289601555


### Parameter Tuning for LogisticRegression with Standard Scaling

In [53]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                    random_state=seed,
                                                    test_size=0.20)
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

logistic = LogisticRegression(random_state=42, n_jobs=-1)
parameters = {'solver':('lbfgs', 'liblinear', 'sag', 'saga'), 
              'max_iter':[1000, 5000, 10000]}

grid_search_cv = GridSearchCV(logistic, parameters, cv=5)
grid_search_cv.fit(X_train_scaled, y_train)

print('Best parameters are:\n', grid_search_cv.best_params_ )
print()
print('Logistic Score:', grid_search_cv.score(X_test_scaled, y_test))

Best parameters are:
 {'max_iter': 1000, 'solver': 'saga'}

Logistic Score: 0.6783284742468416


### Parameter Tuning for KNN with Standard Scaling

In [56]:
knn = KNeighborsClassifier(n_jobs=-1)
parameters = {'weights':('uniform', 'distance'), 
              'n_neighbors':[2, 3, 5, 7, 9, 12]}

grid_search_cv = GridSearchCV(knn, parameters, cv=5)
grid_search_cv.fit(X_train_scaled, y_train)

print('Best parameters are:\n', grid_search_cv.best_params_ )
print()
print('Logistic Score:', grid_search_cv.score(X_test_scaled, y_test))

Best parameters are:
 {'n_neighbors': 12, 'weights': 'distance'}

Logistic Score: 0.6647230320699709


### Parameter Tuning for SVC with Standard Scaling

In [57]:
svc = LinearSVC(random_state=42)
parameters = {'C':[0.01, 0.1, 1, 10, 100], 
              'max_iter':[1000, 5000, 10000]}

grid_search_cv = GridSearchCV(svc, parameters, cv=5)
grid_search_cv.fit(X_train_scaled, y_train)

print('Best parameters are:\n', grid_search_cv.best_params_ )
print()
print('Logistic Score:', grid_search_cv.score(X_test_scaled, y_test))

Best parameters are:
 {'C': 0.01, 'max_iter': 1000}

Logistic Score: 0.6793002915451894


### PCA Dimension Reduction and Using Best Parameters

In [67]:
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

pca = PCA(n_components=120)
pca.fit(X_train_scaled, y_train)

X_train_scaled_transformed = pca.transform(X_train_scaled)
X_test_scaled_transformed = pca.transform(X_test_scaled)

logistic = LogisticRegression(solver='saga', random_state=42, 
                              n_jobs=-1, max_iter=1000)
logistic.fit(X_train_scaled_transformed, y_train)

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=12, weights='distance')
knn.fit(X_train_scaled_transformed, y_train)

svc = LinearSVC(random_state=42, C=0.01, max_iter=1000)
svc.fit(X_train_scaled_transformed, y_train)

logistic_score = np.array(cross_val_score(logistic, X_test_scaled_transformed, y_test, cv=5))
knn_score = np.array(cross_val_score(knn, X_test_scaled_transformed, y_test, cv=5))
svc_score = np.array(cross_val_score(svc, X_test_scaled_transformed, y_test, cv=5))

print('Logistic score:', np.mean(logistic_score))
print('KNN score:', np.mean(knn_score))
print('SVC score:', np.mean(svc_score))

Logistic score: 0.6559995012360423
KNN score: 0.6268448718256706
SVC score: 0.6676503527382256


### RFE Dimension Reduction and Using Best Parameters

In [69]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
 
linear_reg = LinearRegression()

rfe = RFE(linear_reg, n_features_to_select=120)
rfe = rfe.fit(X_train_scaled, y_train)

X_train_scaled_transformed = rfe.transform(X_train_scaled)
X_test_scaled_transformed = rfe.transform(X_test_scaled)

logistic = LogisticRegression(solver='saga', random_state=42, 
                              n_jobs=-1, max_iter=1000)
logistic.fit(X_train_scaled_transformed, y_train)

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=12, weights='distance')
knn.fit(X_train_scaled_transformed, y_train)

svc = LinearSVC(random_state=42, C=0.01, max_iter=1000)
svc.fit(X_train_scaled_transformed, y_train)

logistic_score = np.array(cross_val_score(logistic, X_test_scaled_transformed, y_test, cv=5))
knn_score = np.array(cross_val_score(knn, X_test_scaled_transformed, y_test, cv=5))
svc_score = np.array(cross_val_score(svc, X_test_scaled_transformed, y_test, cv=5))

print('Logistic score:', np.mean(logistic_score))
print('KNN score:', np.mean(knn_score))
print('SVC score:', np.mean(svc_score))

Logistic score: 0.6628382414681049
KNN score: 0.6307473108500609
SVC score: 0.6744465607594023


### Trying to beat all scores with RandomForest

In [78]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                    random_state=0,
                                                    test_size=0.20)

random_forest = RandomForestClassifier(n_estimators=5000, random_state=42,
                                       n_jobs=-1)
random_forest.fit(X_train, y_train)

rf_score = np.array(cross_val_score(random_forest, X_test, y_test, cv=5))
print('RF score:', np.mean(rf_score))

RF score: 0.6822164338148237


### Parameter Tuning for RandomForest

In [81]:
random_forest = RandomForestClassifier(n_estimators=5000,
                                       random_state=42, 
                                       n_jobs=-1)

parameters = {'max_features':('auto', 'sqrt', 'log2'), 
              'max_depth':[5, 10, 15], 
              'min_samples_split':[2, 7, 15]}

grid_search_cv = GridSearchCV(random_forest, parameters, cv=5)
grid_search_cv.fit(X_train, y_train)

print('Best parameters are:\n', grid_search_cv.best_params_)
print()
print('RF Score:', grid_search_cv.score(X_test, y_test))

Best parameters are:
 {'max_depth': 15, 'max_features': 'auto', 'min_samples_split': 2}

RF Score: 0.6997084548104956


### RFE Dimension Reduction and Using Best Parameters

In [85]:
# Using linear regression to reduce dimensions
linear_reg = LinearRegression()

rfe = RFE(linear_reg, n_features_to_select=120)
rfe = rfe.fit(X_train, y_train)

X_train_transformed = rfe.transform(X_train)
X_test_transformed = rfe.transform(X_test)

random_forest = RandomForestClassifier(n_estimators=5000,
                                       random_state=42,
                                       n_jobs=-1,
                                       max_features='auto',
                                       max_depth=15,
                                       min_samples_split=2)

random_forest.fit(X_train_transformed, y_train)
rf_score = np.array(cross_val_score(random_forest, X_test_transformed, y_test, cv=5))

print('RF score:', np.mean(rf_score))

RF score: 0.6695856026521432


In [86]:
# Using random forest to reduce dimensions
random_forest = RandomForestClassifier(n_estimators=200,
                                       random_state=42,
                                       n_jobs=-1)

rfe = RFE(random_forest, n_features_to_select=120)
rfe = rfe.fit(X_train, y_train)

X_train_transformed = rfe.transform(X_train)
X_test_transformed = rfe.transform(X_test)

random_forest = RandomForestClassifier(n_estimators=5000,
                                       random_state=42,
                                       n_jobs=-1,
                                       max_features='auto',
                                       max_depth=15,
                                       min_samples_split=2)

random_forest.fit(X_train_transformed, y_train)
rf_score = np.array(cross_val_score(random_forest, X_test_transformed, y_test, cv=5))

print('RF score:', np.mean(rf_score))

RF score: 0.6841487094482596
