## 1. Reading the data

In [20]:
# all the imports 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

In [34]:
# reading the data
data = pd.read_csv('../combined_data/combined_fight_data.csv', low_memory=False)
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,B_current_win_streak,...,R_age,location_elevation,end_method,end_how,end_round,city,country,attendance,R_home_elevation,B_home_elevation
0,gerard gordeau,kevin rosier,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,1.0,...,34.0,1734.0,tko,corner stoppage,,denver,usa,7800.0,1.0,146.0
1,royce gracie,ken shamrock,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,1.0,...,26.0,1734.0,submission,sleeve choke,,denver,usa,7800.0,27.0,1373.0
2,jason delucia,trent jenkins,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,0.0,...,24.0,1734.0,submission,rearnaked choke,,denver,usa,7800.0,89.0,
3,royce gracie,gerard gordeau,Joao Alberto Barreto,1993-11-12,Red,True,Catch Weight,1,0.0,2.0,...,26.0,1734.0,submission,rear naked choke,,denver,usa,7800.0,27.0,1.0
4,gerard gordeau,teila tuli,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,0.0,...,34.0,1734.0,tko,head kick,,denver,usa,7800.0,1.0,6.0


## 2. Data Preprocessing

### 2.1 Converting target column to bool type

In [22]:
# changing winner label to bool and keeping whether or not Red fighter won
data['Winner'] = data['Winner'].apply(lambda x: 1 if x == 'Red' else 0)
data['R_Winner'] = data['Winner']

### 2.2 Dropping the useless columns

In [23]:
data = data.drop(columns=['R_fighter', 'B_fighter', 'Referee', 'date', 'city', 'country', 'Winner', 'end_how'])
data = data.fillna(0)

### 2.3 One hot encoding the categorical columns

In [24]:
# weight_class, B_Stance, R_Stance
data = pd.get_dummies(data, columns=['weight_class', 'B_Stance', 'R_Stance','end_method'])
data.shape

(5062, 175)

### 2.4 Getting features and labels

In [25]:
features = data.loc[:, data.columns != 'R_Winner']
features

Unnamed: 0,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,...,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,end_method_0,end_method_decision,end_method_disqualification,end_method_ko,end_method_no contest,end_method_submission,end_method_tko
0,False,1,0.0,1.0,0.0,4.00,3.00,9.00,4.0,10.0,...,0,0,0,0,0,0,0,0,0,1
1,False,1,0.0,1.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
2,False,1,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
3,True,1,0.0,2.0,0.0,0.50,0.50,0.00,0.0,5.5,...,0,1,0,0,0,0,0,0,1,0
4,False,1,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5057,True,5,0.0,4.0,0.0,9.20,6.00,0.20,0.0,62.6,...,0,0,0,0,0,0,0,0,0,1
5058,False,3,0.0,1.0,0.0,17.00,14.50,2.50,2.0,201.0,...,0,1,0,0,1,0,0,0,0,0
5059,False,3,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
5060,False,3,0.0,1.0,0.0,7.25,4.75,1.75,0.5,125.0,...,0,0,0,0,0,0,1,0,0,0


In [26]:
labels = data.loc[:, 'R_Winner'].astype('bool')
labels

0        True
1        True
2        True
3        True
4        True
        ...  
5057     True
5058    False
5059     True
5060    False
5061    False
Name: R_Winner, Length: 5062, dtype: bool

## 3. LogisticRegression, LinearSVC, KNeighborsClassifier

In [27]:
def run_logistic_regression(X_train, X_test, y_train, y_test, params):
    # creating logistic classifier
    logistic = LogisticRegression(**params)
    
    # fitting training data and predicting test data
    logistic.fit(X_train, y_train)
    logistic_predict = logistic.predict(X_test)
    
    # compute the performance metrics
    conf_matrix = confusion_matrix(y_test, logistic_predict)
    fscore = f1_score(y_test, logistic_predict, average=None)
    accr_score = accuracy_score(y_test, logistic_predict)
    
    # return the performance metrics
    return conf_matrix, fscore, accr_score


def run_knn_classification(X_train, X_test, y_train, y_test, params):
    # creating KNN classifier
    knn = KNeighborsClassifier(**params)
    
    # fitting training data and predicting test data
    knn.fit(X_train, y_train)
    knn_predict = knn.predict(X_test)

    # compute the performance metrics
    conf_matrix = confusion_matrix(y_test, knn_predict)
    fscore = f1_score(y_test, knn_predict, average=None)
    accr_score = accuracy_score(y_test, knn_predict)
    
    # return the performance metrics
    return conf_matrix, fscore, accr_score


def run_svc_classification(X_train, X_test, y_train, y_test, params):
    # creating SVC classifier
    svc = LinearSVC(**params)
    
    # fitting training data and predicting test data
    svc.fit(X_train, y_train)
    svc_predict = svc.predict(X_test)

    # compute the performance metrics
    conf_matrix = confusion_matrix(y_test, svc_predict)
    fscore = f1_score(y_test, svc_predict, average=None)
    accr_score = accuracy_score(y_test, svc_predict)
    
    # return the performance metrics
    return conf_matrix, fscore, accr_score


def run_all_models(features, labels, scaler, log_params, knn_params, svc_params):
    # arrays to store the results
    logistic_cm = []
    knn_cm = []
    svc_cm = []

    logistic_fscore = []
    knn_fscore = []
    svc_fscore = []

    logistic_score = []
    knn_score = []
    svc_score = []
    
    for seed in range(1, 6):
        X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                            random_state=seed,
                                                            test_size=0.20)
        
        
        X_train_scaled = X_train
        X_test_scaled = X_test
        
        # if scaler is provided then scale the data
        if(scaler != None):
            # fitting on training data
            scaler.fit(X_train)
            
            #transforming both training and testing data
            X_train_scaled = scaler.transform(X_train)
            X_test_scaled = scaler.transform(X_test)
    
        # run the models and get results
        logis_results = run_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, log_params)
        knn_results = run_knn_classification(X_train_scaled, X_test_scaled, y_train, y_test, knn_params)
        svc_results = run_svc_classification(X_train_scaled, X_test_scaled, y_train, y_test, svc_params)
        
        # add the results to the result arrays
        logistic_cm.append(logis_results[0])
        knn_cm.append(knn_results[0])
        svc_cm.append(svc_results[0])

        logistic_fscore.append(logis_results[1])
        knn_fscore.append(knn_results[1])
        svc_fscore.append(svc_results[1])

        logistic_score.append(logis_results[2])
        knn_score.append(knn_results[2])
        svc_score.append(svc_results[2])
    
    
    # printing the results
    print('Logistic confusion matrix:\n', np.mean(np.array(logistic_cm), axis=0))
    print('Logistic f-score:', np.mean(np.array(logistic_fscore)))  
    print('Logistic accuracy score:', np.mean(np.array(logistic_score)), '\n')  

    print('KNN score:\n', np.mean(np.array(knn_cm), axis=0))
    print('KNN f-score:', np.mean(np.array(knn_fscore)))  
    print('KNN accuracy score:', np.mean(np.array(knn_score)), '\n')  

    print('SVC score:\n', np.mean(np.array(svc_cm), axis=0))
    print('SVC f-score:', np.mean(np.array(svc_fscore)))
    print('SVC accuracy score:', np.mean(np.array(svc_score)), '\n')  

### 3.1 Running LogisticRegression, LinearSVC, KNeighborsClassifier without Scaling

In [28]:
# logistic params
log_params = {'solver': 'liblinear', 'random_state': 42}

# knn params
knn_params = {'n_jobs': -1}

# svc params
svc_params = {'max_iter': 5000, 'random_state': 42}

run_all_models(features, labels, None, log_params, knn_params, svc_params)

Logistic confusion matrix:
 [[ 99.6 216.2]
 [ 89.6 607.6]]
Logistic f-score: 0.5964768792247483
Logistic accuracy score: 0.6981243830207304 

KNN score:
 [[ 89.  226.8]
 [132.6 564.6]]
KNN f-score: 0.5447978539022429
KNN accuracy score: 0.6452122408687069 

SVC score:
 [[116.8 199. ]
 [185.6 511.6]]
SVC f-score: 0.4849798388525272
SVC accuracy score: 0.620335636722606 



### 3.2 Running LogisticRegression, LinearSVC, KNeighborsClassifier with min max scaling

In [29]:
scaler = MinMaxScaler()
run_all_models(features, labels, scaler, log_params, knn_params, svc_params)

Logistic confusion matrix:
 [[ 90.8 225. ]
 [ 75.6 621.6]]
Logistic f-score: 0.5906466871473813
Logistic accuracy score: 0.7032576505429418 

KNN score:
 [[ 91.4 224.4]
 [133.2 564. ]]
KNN f-score: 0.5486742315651832
KNN accuracy score: 0.6469891411648568 

SVC score:
 [[ 95.8 220. ]
 [ 80.4 616.8]]
SVC f-score: 0.5964952802250998
SVC accuracy score: 0.7034550839091807 



### 3.3 Running LogisticRegression, LinearSVC, KNeighborsClassifier with standard scaling

In [30]:
scaler = StandardScaler()
run_all_models(features, labels, scaler, log_params, knn_params, svc_params)

Logistic confusion matrix:
 [[107.  208.8]
 [ 91.8 605.4]]
Logistic f-score: 0.6081343453846865
Logistic accuracy score: 0.7032576505429418 

KNN score:
 [[ 94.4 221.4]
 [134.6 562.6]]
KNN f-score: 0.5530575630899357
KNN accuracy score: 0.648568608094768 

SVC score:
 [[103.6 212.2]
 [ 86.8 610.4]]
SVC f-score: 0.6059265531180826
SVC accuracy score: 0.7048371174728529 



### 3.4 Parameter Tuning (using standard scaling)

In [37]:
def tune_parameters(model, parameters, features, labels):
    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=42,
                                                        test_size=0.20)
    # scaling the data using standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # running grid search to get the best parameters
    grid_search_cv = GridSearchCV(model, parameters, cv=5)
    grid_search_cv.fit(X_train_scaled, y_train)
    
    print('Best parameters are:\n', grid_search_cv.best_params_ )
    return grid_search_cv.best_params_

In [32]:
# logistic classifier parameter tuning
log_clf = LogisticRegression(random_state=42, n_jobs=-1)
parameters = {'solver':('lbfgs', 'liblinear', 'sag', 'saga'), 
              'max_iter':[1000, 5000, 10000]}

log_params.update(tune_parameters(log_clf, parameters, features, labels))
log_params

Best parameters are:
 {'max_iter': 5000, 'solver': 'sag'}


{'solver': 'sag', 'random_state': 42, 'max_iter': 5000}

In [35]:
# knn classifier parameter tuning
knn = KNeighborsClassifier(n_jobs=-1)
parameters = {'weights':('uniform', 'distance'), 
              'n_neighbors':[2, 3, 5, 7, 9, 12]}

knn_params.update(tune_parameters(knn, parameters, features, labels))
knn_params

Best parameters are:
 {'n_neighbors': 12, 'weights': 'distance'}


{'n_jobs': -1, 'n_neighbors': 12, 'weights': 'distance'}

In [36]:
# svc classifier parameter tuning
svc = LinearSVC(random_state=42)
parameters = {'C':[0.01, 0.1, 1, 10, 100], 
              'max_iter':[1000, 5000, 10000]}

svc_params.update(tune_parameters(svc, parameters, features, labels))
svc_params

Best parameters are:
 {'C': 0.1, 'max_iter': 5000}


{'max_iter': 5000, 'random_state': 42, 'C': 0.1}

### 3.5 Running models with best parameters and standard scaling

In [38]:
scaler = StandardScaler()
run_all_models(features, labels, scaler, log_params, knn_params, svc_params)

Logistic confusion matrix:
 [[107.  208.8]
 [ 91.6 605.6]]
Logistic f-score: 0.6082901779386335
Logistic accuracy score: 0.7034550839091807 

KNN score:
 [[ 77.6 238.2]
 [ 87.2 610. ]]
KNN f-score: 0.5561464977268612
KNN accuracy score: 0.6787759131293188 

SVC score:
 [[101.  214.8]
 [ 85.8 611.4]]
SVC f-score: 0.6019923371532229
SVC accuracy score: 0.7032576505429418 



### 3.6 Dimension Reduction (using best parameters and standard scaling) 

In [44]:
def reduce_dimensions_and_run_models(model, features, labels, log_params, knn_params, svc_params):
    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=42,
                                                        test_size=0.20)
    
    # scaling the data using standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # fitting and transforming data using dimension reduction model
    model.fit(X_train_scaled, y_train)
    features_transformed = model.transform(features)
    
    # running all the models to test the reduced dimensions
    scaler = StandardScaler()
    run_all_models(features_transformed, labels, scaler, log_params, knn_params, svc_params)

In [45]:
# PCA dimension reduction
pca = PCA(n_components=120)

reduce_dimensions_and_run_models(pca, features, labels, log_params, knn_params, svc_params)

Logistic confusion matrix:
 [[ 28.2 287.6]
 [ 26.6 670.6]]
Logistic f-score: 0.4811352393259739
Logistic accuracy score: 0.6898321816386969 

KNN score:
 [[ 66.6 249.2]
 [ 90.4 606.8]]
KNN f-score: 0.5314714940850549
KNN accuracy score: 0.6647581441263574 

SVC score:
 [[ 25.4 290.4]
 [ 20.8 676.4]]
SVC f-score: 0.47649826799390527
SVC accuracy score: 0.6927936821322803 



In [46]:
# RFE dimension reduction
linear_reg = LinearRegression()
rfe = RFE(linear_reg, n_features_to_select=120)

reduce_dimensions_and_run_models(rfe, features, labels, log_params, knn_params, svc_params)

Logistic confusion matrix:
 [[ 98.4 217.4]
 [ 82.8 614.4]]
Logistic f-score: 0.5996695712570189
Logistic accuracy score: 0.7036525172754196 

KNN score:
 [[ 81.8 234. ]
 [ 98.8 598.4]]
KNN f-score: 0.5559731264453662
KNN accuracy score: 0.6714708785784798 

SVC score:
 [[ 90.6 225.2]
 [ 74.  623.2]]
SVC f-score: 0.5914854995751821
SVC accuracy score: 0.704639684106614 



## 4. RandomForest Classification

In [47]:
def run_random_forest(features, labels, params):
    # arrays to store the results
    forest_cm = []
    forest_fscore = []
    forest_score = []
    
    for seed in range(1, 6):
        X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                            random_state=seed,
                                                            test_size=0.20)
        
        
        # run the random forest model and get results
        random_forest = RandomForestClassifier(**params)
        random_forest.fit(X_train, y_train)
        
        forest_predict = random_forest.predict(X_test)

        # compute the performance metrics
        forest_cm.append(confusion_matrix(y_test, forest_predict))
        forest_fscore.append(f1_score(y_test, forest_predict, average=None))
        forest_score.append(accuracy_score(y_test, forest_predict))
        
    # printing the results
    print('Random Forest Confusion Matrix:\n', np.mean(np.array(forest_cm), axis=0))
    print('Random Forest F-Score:', np.mean(np.array(forest_fscore)))  
    print('Random Forest Accuracy Score:', np.mean(np.array(forest_score)), '\n')

### 4.1 Random Forest Classifier With No Parameter Tuning

In [48]:
# parameters for the random forest model
forest_params = {'n_estimators':5000, 'random_state':42, 'n_jobs':-1}

# running the random forest model
run_random_forest(features, labels, forest_params)

Random Forest Confusion Matrix:
 [[ 50.2 265.6]
 [ 32.4 664.8]]
Random Forest F-Score: 0.5343338220457865
Random Forest Accuracy Score: 0.7058242843040474 



### 4.2 Random Forest Classifier Parameter Tuning

In [49]:
def tune_rf_parameters(features, labels):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=42,
                                                        test_size=0.20)

    random_forest = RandomForestClassifier(n_estimators=5000,
                                           random_state=42, 
                                           n_jobs=-1)

    parameters = {'max_features':('auto', 'sqrt', 'log2'), 
              'max_depth':[5, 10, 15], 
              'min_samples_split':[2, 7, 15]}

    grid_search_cv = GridSearchCV(random_forest, parameters, cv=5)
    grid_search_cv.fit(X_train, y_train)

    print('Best parameters are:\n', grid_search_cv.best_params_)
    return grid_search_cv.best_params_

In [50]:
# getting best params
forest_params.update(tune_rf_parameters(features, labels))
forest_params

Best parameters are:
 {'max_depth': 15, 'max_features': 'auto', 'min_samples_split': 7}


{'n_estimators': 5000,
 'random_state': 42,
 'n_jobs': -1,
 'max_depth': 15,
 'max_features': 'auto',
 'min_samples_split': 7}

In [51]:
# running random forest with best params
run_random_forest(features, labels, forest_params)

Random Forest Confusion Matrix:
 [[ 44.  271.8]
 [ 28.6 668.6]]
Random Forest F-Score: 0.5215822613052005
Random Forest Accuracy Score: 0.7034550839091807 



### 4.3 Dimension Reduction - RFE (Using Best Parameters)

In [52]:
def reduce_dimensions_and_run_rf(features, labels, forest_params):
    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                        random_state=42,
                                                        test_size=0.20)
    
    # creating dimension reduction model
    linear_reg = LinearRegression()
    rfe = RFE(linear_reg, n_features_to_select=120)
    
    # fitting dimension reduction model
    rfe = rfe.fit(X_train, y_train)

    # transforming dimension reduction model
    features_transformed = pca.transform(features)
    
    run_random_forest(features_transformed, labels, forest_params)

In [53]:
reduce_dimensions_and_run_rf(features, labels, forest_params)

Random Forest Confusion Matrix:
 [[ 40.4 275.4]
 [ 41.4 655.8]]
Random Forest F-Score: 0.5041825352748834
Random Forest Accuracy Score: 0.6872655478775913 

