### Imports

In [1]:
# all the imports 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

### Reading the data

In [2]:
# Load the combined fight table from disk.
# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, so each column gets one consistently inferred dtype.
data = pd.read_csv(
    '../combined_data/combined_fight_data.csv',
    low_memory=False,
)
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,B_current_win_streak,...,R_age,location_elevation,end_method,end_how,end_round,city,country,attendance,R_home_elevation,B_home_elevation
0,gerard gordeau,kevin rosier,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,1.0,...,34.0,1734.0,tko,corner stoppage,,denver,usa,7800.0,1.0,146.0
1,royce gracie,ken shamrock,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,1.0,...,26.0,1734.0,submission,sleeve choke,,denver,usa,7800.0,27.0,1373.0
2,jason delucia,trent jenkins,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,0.0,...,24.0,1734.0,submission,rearnaked choke,,denver,usa,7800.0,89.0,
3,royce gracie,gerard gordeau,Joao Alberto Barreto,1993-11-12,Red,True,Catch Weight,1,0.0,2.0,...,26.0,1734.0,submission,rear naked choke,,denver,usa,7800.0,27.0,1.0
4,gerard gordeau,teila tuli,Joao Alberto Barreto,1993-11-12,Red,False,Open Weight,1,0.0,0.0,...,34.0,1734.0,tko,head kick,,denver,usa,7800.0,1.0,6.0


### Changing the labels column to bool type

In [3]:
# Boolean label: True when the Red-corner fighter won.
# The raw 'Winner' column is deliberately left untouched. Overwriting it with
# booleans would make this cell non-idempotent: a second run would compare
# True/False against 'Red' and silently label every fight False.
# NOTE(review): every value other than 'Red' (including missing values, and
# draws if the data contains any) is labelled False - confirm this is intended.
data['R_Winner'] = data['Winner'] == 'Red'

### Dropping useless columns

In [4]:
# Remove the columns that are not used as model inputs, then replace every
# remaining missing value with 0.
# fillna(0) is applied to all columns, including the string-valued ones that
# are one-hot encoded later; that is why an 'end_method_0' indicator shows up
# among the encoded features.
data = data.drop(
    columns=[
        'R_fighter',
        'B_fighter',
        'Referee',
        'date',
        'city',
        'country',
        'Winner',
        'end_how',
    ]
).fillna(0)

### One hot encoding the categorical data

In [5]:
# Expand each categorical column into one indicator column per category.
# NOTE(review): 'end_method' looks like a post-fight outcome; confirm it is
# meant to be a model input when predicting the winner.
categorical_columns = ['weight_class', 'B_Stance', 'R_Stance', 'end_method']
data = pd.get_dummies(data, columns=categorical_columns)
data.shape

(5062, 175)

### Getting features

In [6]:
# Feature matrix: every column except the 'R_Winner' label.
is_label_column = data.columns.isin(['R_Winner'])
features = data.loc[:, ~is_label_column]
features

Unnamed: 0,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,...,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,end_method_0,end_method_decision,end_method_disqualification,end_method_ko,end_method_no contest,end_method_submission,end_method_tko
0,False,1,0.0,1.0,0.0,4.00,3.00,9.00,4.0,10.0,...,0,0,0,0,0,0,0,0,0,1
1,False,1,0.0,1.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
2,False,1,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
3,True,1,0.0,2.0,0.0,0.50,0.50,0.00,0.0,5.5,...,0,1,0,0,0,0,0,0,1,0
4,False,1,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5057,True,5,0.0,4.0,0.0,9.20,6.00,0.20,0.0,62.6,...,0,0,0,0,0,0,0,0,0,1
5058,False,3,0.0,1.0,0.0,17.00,14.50,2.50,2.0,201.0,...,0,1,0,0,1,0,0,0,0,0
5059,False,3,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
5060,False,3,0.0,1.0,0.0,7.25,4.75,1.75,0.5,125.0,...,0,0,0,0,0,0,1,0,0,0


### Getting labels

In [7]:
# Target vector: the boolean 'R_Winner' column.
labels = data['R_Winner']
labels

0        True
1        True
2        True
3        True
4        True
        ...  
5057     True
5058    False
5059     True
5060    False
5061    False
Name: R_Winner, Length: 5062, dtype: bool

### Running logistic regression

In [8]:
def run_logistic_regression(features, labels, params, seeds=range(1, 6), test_size=0.20):
    """Fit and evaluate a logistic regression over several random train/test splits.

    For each seed the data is split with ``train_test_split``, the features are
    standardised with a ``StandardScaler`` fitted on the training part only,
    a ``LogisticRegression(**params)`` is trained, and its predictions on the
    test part are scored. The metrics averaged over all seeds are printed.

    Parameters
    ----------
    features : array-like or DataFrame
        Feature table handed to ``train_test_split``.
    labels : array-like or Series
        Target values aligned with ``features``.
    params : dict
        Keyword arguments forwarded unchanged to ``LogisticRegression``.
    seeds : iterable of int, optional
        ``random_state`` values used for the splits; one model is trained per
        seed. Defaults to ``range(1, 6)`` (seeds 1 to 5).
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to ``0.20``.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``seeds`` is empty, because there would be nothing to average.
    """
    seeds = list(seeds)
    if not seeds:
        raise ValueError('seeds must contain at least one random state')

    logistic_cm = []
    logistic_fscore = []
    logistic_score = []

    for seed in seeds:
        X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                            random_state=seed,
                                                            test_size=test_size)

        # Fit the scaler on the training data only so that no test-set
        # statistics leak into training, then apply it to both parts.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # creating logistic classifier
        logistic = LogisticRegression(**params)

        # fitting training data and predicting test data
        logistic.fit(X_train_scaled, y_train)
        logistic_predict = logistic.predict(X_test_scaled)

        # compute the performance metrics
        logistic_cm.append(confusion_matrix(y_test, logistic_predict))
        # average=None yields one F1 value per class for this split
        logistic_fscore.append(f1_score(y_test, logistic_predict, average=None))
        logistic_score.append(accuracy_score(y_test, logistic_predict))

    print('Logistic confusion matrix:\n', np.mean(np.array(logistic_cm), axis=0))
    # No axis is given, so this single number is the mean of the per-class F1
    # values over all classes and all seeds.
    print('Logistic f-score:', np.mean(np.array(logistic_fscore)))
    print('Logistic accuracy score:', np.mean(np.array(logistic_score)), '\n')

In [9]:
# Keyword arguments forwarded to LogisticRegression inside the helper.
# NOTE(review): the imports cell silences all warnings, so a convergence
# warning from the 'sag' solver at max_iter=100 would not be shown - confirm
# that the solver actually converges.
params = dict(solver='sag', random_state=42, max_iter=100)

run_logistic_regression(features, labels, params)

Logistic confusion matrix:
 [[104.2 211.6]
 [ 90.6 606.6]]
Logistic f-score: 0.6039448339898377
Logistic accuracy score: 0.7016781836130306 

