# Fight Prediction

We still want to predict the winner of UFC's fight based on a dataset with about 5000 fights and over 100 features such as stance, rank, time, finish, betting odds, striking percentage, number of takedown attempt, ...

We will try to increase our accuracy using two classification models:
- Random Forest Model
- Logistic Regression Model



In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
pd.set_option("display.max_columns", None)

Exploring the data

In [3]:
df = pd.read_csv("data/ufc-master.csv")
df

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Featherweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Light Heavyweight,MALE,5,0,1,0,3.420000,0.59,0.700000,0.240000,1.00,4,2,11,0,0,0,1,4,0,0,5,Orthodox,198.12,208.28,205,3,0,0,3.950000,0.480000,0.100000,0.610000,0.360,4,8,43,1,0,0,2,11,0,0,13,Orthodox,187.96,193.04,205,37,29,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.530000,0.600000,-0.370000,1,1,10.0,5.0,,,,,,5.0,,,,,,,,,,,,,10.0,,,,,,,,Red,U-DEC,,5.0,5:00,1500.0,800.0,900.0,2000.0,1600.0,-110.0,175.0
1,Alex Oliveira,Niko Price,170.0,-200,170.000000,50.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Welterweight,MALE,3,2,0,0,5.160000,0.42,0.800000,0.790000,0.22,2,5,24,0,0,0,0,4,2,0,6,Orthodox,182.88,193.04,170,2,0,0,2.970000,0.510000,0.500000,2.270000,0.370,4,8,44,0,0,1,3,4,3,0,11,Orthodox,180.34,193.04,170,33,32,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.00,-1,2.190000,0.300000,-1.480000,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,U-DEC,,3.0,5:00,900.0,450.0,350.0,700.0,1100.0,550.0,120.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.000000,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Middleweight,MALE,3,1,0,0,2.920000,0.41,0.100000,1.150000,0.34,5,5,38,0,0,2,6,1,0,0,9,Southpaw,185.42,195.58,185,1,0,0,3.770000,0.490000,1.700000,4.480000,0.440,4,4,13,0,0,0,0,1,5,0,6,Orthodox,190.50,195.58,205,34,32,0,0,1,3,1,25,0,0,-5,-5.08,0.00,-2,-0.850000,-1.600000,-3.330000,1,1,,,,,,,,13.0,,,,,,,,,,,,,,,,,,,,,neither,S-DEC,,3.0,5:00,900.0,550.0,275.0,275.0,1400.0,600.0,185.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Lightweight,MALE,3,1,0,0,4.040000,0.34,0.000000,0.000000,0.00,0,1,3,0,0,0,0,0,0,0,0,Orthodox,177.80,177.80,155,1,0,0,3.790000,0.360000,0.000000,1.570000,0.350,2,3,15,0,0,0,2,2,0,0,4,Orthodox,175.26,182.88,155,29,32,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.250000,0.000000,-1.570000,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,Punch,1.0,1:20,80.0,175.0,900.0,500.0,3500.0,110.0,1100.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,0,2,0,5.220000,0.56,0.000000,2.540000,0.39,2,3,19,0,0,0,4,1,0,0,5,Orthodox,175.26,172.72,145,0,4,0,2.640000,0.620000,0.600000,2.850000,0.520,4,0,8,0,0,0,2,0,2,0,4,Orthodox,175.26,177.80,155,28,33,0,-2,-2,1,3,11,0,1,-2,0.00,-5.08,5,2.580000,-0.600000,-0.310000,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,S-DEC,,3.0,5:00,900.0,165.0,200.0,400.0,1200.0,900.0,600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Duane Ludwig,Darren Elkins,-155.0,135,64.516129,135.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Blue,False,Lightweight,MALE,3,0,0,0,,,,,,0,0,0,0,0,0,0,0,0,0,0,Orthodox,177.80,180.34,145,1,0,0,13.666667,0.576667,0.000000,0.000000,0.000,2,1,5,0,0,0,1,1,0,0,2,Orthodox,177.80,177.80,170,31,25,1,0,-2,-2,1,-5,0,-1,0,0.00,2.54,6,-13.666667,0.000000,0.000000,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,,1.0,0:44,44.0,,,,,,
4892,John Howard,Daniel Roberts,-210.0,175,47.619048,175.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,0,0,0,,,,,,0,0,0,0,0,0,0,0,0,0,0,Southpaw,177.80,187.96,170,0,3,0,18.000000,0.550000,1.000000,4.666667,0.790,3,0,9,0,0,2,0,1,0,0,3,Orthodox,170.18,180.34,170,27,29,0,-3,-3,-3,0,-9,0,-1,0,7.62,7.62,-2,-18.000000,-1.000000,-4.666667,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,Punch,1.0,2:01,121.0,,,,,,
4893,Brendan Schaub,Chase Gormley,-260.0,220,38.461538,220.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Heavyweight,MALE,3,1,0,0,8.000000,0.34,1.000000,1.000000,1.00,0,1,1,0,0,0,0,0,0,0,0,Orthodox,190.50,196.00,265,1,0,0,12.000000,0.250000,0.000000,0.000000,0.000,0,1,1,1,0,0,0,0,0,0,0,Orthodox,193.04,198.12,245,27,27,0,0,0,0,0,0,-1,0,0,-2.54,-2.12,0,-4.000000,1.000000,1.000000,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,Punches,1.0,0:47,47.0,,,,,,
4894,Mike Pierce,Julio Paulino,-420.0,335,23.809524,335.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,0,0,0,,,,,,0,0,0,0,0,0,0,0,0,0,0,Orthodox,182.88,185.42,170,1,0,0,40.500000,0.405000,0.000000,3.500000,0.520,1,1,6,0,0,0,1,0,0,0,1,Orthodox,172.72,177.80,170,29,34,1,0,-1,-1,1,-6,0,0,0,10.16,7.62,-5,-40.500000,0.000000,-3.500000,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,U-DEC,,3.0,5:00,900.0,,,,,,


We have the ranking in each division for each fighter (12 division, 2 fighter, so 24 columns per row). Since a fighter mostly fight in a single division, most of those columns will have a value of NaN. Let's aggregate and take the average of those values and have only one ranking column per fighter.

In [4]:
#Aggregate the rank into a single columns 

def get_rank_r(row):
    """Calculate the average ranking across all divisions for fighter Red"""
    ranks = row["R_Women's Flyweight_rank"], row["R_Women's Featherweight_rank"], row["R_Women's Strawweight_rank"], row["R_Women's Bantamweight_rank"], row["R_Heavyweight_rank"], row["R_Light Heavyweight_rank"], row["R_Middleweight_rank"], row["R_Welterweight_rank"], row["R_Lightweight_rank"], row["R_Featherweight_rank"], row["R_Bantamweight_rank"], row["R_Flyweight_rank"], row["R_Pound-for-Pound_rank"], row["R_match_weightclass_rank"]
    ranks = list(ranks)
    nan_bool = np.isnan(ranks)
    weights = [1 if ~n else 0 for n in nan_bool]
    masked_data = np.ma.masked_array(ranks, np.isnan(ranks))
    if sum(weights):
        return np.average(masked_data, weights=weights, axis=0)
    if not sum(weights):
        return np.nan
def get_rank_b(row):
    """Calculate the average ranking across all divisions for fighter Blue"""
    ranks = row["B_Women's Flyweight_rank"], row["B_Women's Featherweight_rank"], row["B_Women's Strawweight_rank"], row["B_Women's Bantamweight_rank"], row["B_Heavyweight_rank"], row["B_Light Heavyweight_rank"], row["B_Middleweight_rank"], row["B_Welterweight_rank"], row["B_Lightweight_rank"], row["B_Featherweight_rank"], row["B_Bantamweight_rank"], row["B_Flyweight_rank"], row["B_Pound-for-Pound_rank"], row["B_match_weightclass_rank"]
    ranks = list(ranks)
    nan_bool = np.isnan(ranks)
    weights = [1 if ~n else 0 for n in nan_bool]
    masked_data = np.ma.masked_array(ranks, np.isnan(ranks))
    if sum(weights):
        return np.average(masked_data, weights=weights, axis=0)
    if not sum(weights):
        return np.nan
    

# Add new ranking columns
df["R_rank"] = df.apply(lambda row: get_rank_r(row), axis=1)
df["B_rank"] = df.apply(lambda row: get_rank_b(row), axis=1)


In [5]:
# Drop all those now useless columns
print(df.shape)
df = df.drop(["R_Women's Flyweight_rank","R_Women's Featherweight_rank","R_Women's Strawweight_rank","R_Women's Bantamweight_rank","R_Heavyweight_rank","R_Light Heavyweight_rank","R_Middleweight_rank","R_Welterweight_rank","R_Lightweight_rank","R_Featherweight_rank","R_Bantamweight_rank","R_Flyweight_rank", "R_Pound-for-Pound_rank", "R_match_weightclass_rank"], axis=1)
df = df.drop(["B_Women's Flyweight_rank","B_Women's Featherweight_rank","B_Women's Strawweight_rank","B_Women's Bantamweight_rank","B_Heavyweight_rank","B_Light Heavyweight_rank","B_Middleweight_rank","B_Welterweight_rank","B_Lightweight_rank","B_Featherweight_rank","B_Bantamweight_rank","B_Flyweight_rank", "B_Pound-for-Pound_rank", "B_match_weightclass_rank"], axis=1)

print(df.shape)
df

(4896, 121)
(4896, 93)


Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds,R_rank,B_rank
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Light Heavyweight,MALE,5,0,1,0,3.420000,0.59,0.700000,0.240000,1.00,4,2,11,0,0,0,1,4,0,0,5,Orthodox,198.12,208.28,205,3,0,0,3.950000,0.480000,0.100000,0.610000,0.360,4,8,43,1,0,0,2,11,0,0,13,Orthodox,187.96,193.04,205,37,29,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.530000,0.600000,-0.370000,1,1,Red,U-DEC,,5.0,5:00,1500.0,800.0,900.0,2000.0,1600.0,-110.0,175.0,5.0,10.0
1,Alex Oliveira,Niko Price,170.0,-200,170.000000,50.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Welterweight,MALE,3,2,0,0,5.160000,0.42,0.800000,0.790000,0.22,2,5,24,0,0,0,0,4,2,0,6,Orthodox,182.88,193.04,170,2,0,0,2.970000,0.510000,0.500000,2.270000,0.370,4,8,44,0,0,1,3,4,3,0,11,Orthodox,180.34,193.04,170,33,32,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.00,-1,2.190000,0.300000,-1.480000,1,1,neither,U-DEC,,3.0,5:00,900.0,450.0,350.0,700.0,1100.0,550.0,120.0,,
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.000000,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Middleweight,MALE,3,1,0,0,2.920000,0.41,0.100000,1.150000,0.34,5,5,38,0,0,2,6,1,0,0,9,Southpaw,185.42,195.58,185,1,0,0,3.770000,0.490000,1.700000,4.480000,0.440,4,4,13,0,0,0,0,1,5,0,6,Orthodox,190.50,195.58,205,34,32,0,0,1,3,1,25,0,0,-5,-5.08,0.00,-2,-0.850000,-1.600000,-3.330000,1,1,neither,S-DEC,,3.0,5:00,900.0,550.0,275.0,275.0,1400.0,600.0,185.0,13.0,
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Lightweight,MALE,3,1,0,0,4.040000,0.34,0.000000,0.000000,0.00,0,1,3,0,0,0,0,0,0,0,0,Orthodox,177.80,177.80,155,1,0,0,3.790000,0.360000,0.000000,1.570000,0.350,2,3,15,0,0,0,2,2,0,0,4,Orthodox,175.26,182.88,155,29,32,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.250000,0.000000,-1.570000,1,1,neither,KO/TKO,Punch,1.0,1:20,80.0,175.0,900.0,500.0,3500.0,110.0,1100.0,,
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,0,2,0,5.220000,0.56,0.000000,2.540000,0.39,2,3,19,0,0,0,4,1,0,0,5,Orthodox,175.26,172.72,145,0,4,0,2.640000,0.620000,0.600000,2.850000,0.520,4,0,8,0,0,0,2,0,2,0,4,Orthodox,175.26,177.80,155,28,33,0,-2,-2,1,3,11,0,1,-2,0.00,-5.08,5,2.580000,-0.600000,-0.310000,1,1,neither,S-DEC,,3.0,5:00,900.0,165.0,200.0,400.0,1200.0,900.0,600.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Duane Ludwig,Darren Elkins,-155.0,135,64.516129,135.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Blue,False,Lightweight,MALE,3,0,0,0,,,,,,0,0,0,0,0,0,0,0,0,0,0,Orthodox,177.80,180.34,145,1,0,0,13.666667,0.576667,0.000000,0.000000,0.000,2,1,5,0,0,0,1,1,0,0,2,Orthodox,177.80,177.80,170,31,25,1,0,-2,-2,1,-5,0,-1,0,0.00,2.54,6,-13.666667,0.000000,0.000000,0,1,neither,KO/TKO,,1.0,0:44,44.0,,,,,,,,
4892,John Howard,Daniel Roberts,-210.0,175,47.619048,175.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,0,0,0,,,,,,0,0,0,0,0,0,0,0,0,0,0,Southpaw,177.80,187.96,170,0,3,0,18.000000,0.550000,1.000000,4.666667,0.790,3,0,9,0,0,2,0,1,0,0,3,Orthodox,170.18,180.34,170,27,29,0,-3,-3,-3,0,-9,0,-1,0,7.62,7.62,-2,-18.000000,-1.000000,-4.666667,0,1,neither,KO/TKO,Punch,1.0,2:01,121.0,,,,,,,,
4893,Brendan Schaub,Chase Gormley,-260.0,220,38.461538,220.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Heavyweight,MALE,3,1,0,0,8.000000,0.34,1.000000,1.000000,1.00,0,1,1,0,0,0,0,0,0,0,0,Orthodox,190.50,196.00,265,1,0,0,12.000000,0.250000,0.000000,0.000000,0.000,0,1,1,1,0,0,0,0,0,0,0,Orthodox,193.04,198.12,245,27,27,0,0,0,0,0,0,-1,0,0,-2.54,-2.12,0,-4.000000,1.000000,1.000000,0,1,neither,KO/TKO,Punches,1.0,0:47,47.0,,,,,,,,
4894,Mike Pierce,Julio Paulino,-420.0,335,23.809524,335.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,0,0,0,,,,,,0,0,0,0,0,0,0,0,0,0,0,Orthodox,182.88,185.42,170,1,0,0,40.500000,0.405000,0.000000,3.500000,0.520,1,1,6,0,0,0,1,0,0,0,1,Orthodox,172.72,177.80,170,29,34,1,0,-1,-1,1,-6,0,0,0,10.16,7.62,-5,-40.500000,0.000000,-3.500000,0,1,neither,U-DEC,,3.0,5:00,900.0,,,,,,,,


We can separate our feature columns into categorical (date, fighter stance, finishing detail,...) and numerical (rank, time, odds, strike percentage,...)

In [6]:
categorical_features = [col for col in df.columns if df[col].dtypes == 'object']
numeric_features = [col for col in df.columns if col not in categorical_features]
print(f"Categorical feature: {categorical_features}")
print(f"Numerical feature: {numeric_features}")
#TODO check features (title_bout)


Categorical feature: ['R_fighter', 'B_fighter', 'date', 'location', 'country', 'Winner', 'weight_class', 'gender', 'B_Stance', 'R_Stance', 'better_rank', 'finish', 'finish_details', 'finish_round_time']
Numerical feature: ['R_odds', 'B_odds', 'R_ev', 'B_ev', 'title_bout', 'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak', 'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses', 'B_total_rounds_fought', 'B_total_title_bouts', 'B_win_by_Decision_Majority', 'B_win_by_Decision_Split', 'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission', 'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Height_cms', 'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak', 'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct', 'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought', 'R_total_title_bouts', 'R_win_by_De

This is a classification problem, we want to predict the winner so we get our label values:

In [7]:
## Get label
label = df.Winner
label = [1 if win == 'Red' else 0 if win == 'Blue' else np.nan for win in label]

We split our data into training and testing set

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(df, label, random_state = 2, test_size = 0.3)

## Feature Engineering

Let's create new feature.
We can see that some feature are already combinaison of others (`height_dif`, `age_dif`), but some are not (`B_avg_SIG_STR_pct` and `R_avg_SIG_STR_pct`, similar for odds,ev,...) 



In [9]:
# We create new difference columns

df['odds_diff'] = (df['B_odds']-df['R_odds'])
df['ev_diff'] = (df['B_ev']-df['R_ev'])
df['draw_diff'] = (df['B_draw']-df['R_draw'])
df['avg_sig_str_pct_diff'] = (df['B_avg_SIG_STR_pct']-df['R_avg_SIG_STR_pct'])
df['avg_TD_pct_diff'] = (df['B_avg_TD_pct']-df['B_avg_TD_pct'])
df['win_by_Decision_Majority_diff'] = (df['B_win_by_Decision_Majority']-df['R_win_by_Decision_Majority'])
df['win_by_Decision_Split_diff'] = (df['B_win_by_Decision_Split']-df['R_win_by_Decision_Split'])
df['win_by_Decision_Unanimous_diff'] = (df['B_win_by_Decision_Unanimous']-df['R_win_by_Decision_Unanimous'])
df['win_by_TKO_Doctor_Stoppage_diff'] = (df['B_win_by_TKO_Doctor_Stoppage']-df['R_win_by_TKO_Doctor_Stoppage'])
df['current_lose_streak_diff'] = df['B_current_lose_streak'] - df['R_current_lose_streak'] 
df['current_win_streak_diff'] = df['B_current_win_streak'] - df['R_current_win_streak'] 
df['longest_win_streak_diff'] = df['B_longest_win_streak'] - df['R_longest_win_streak'] 
df['wins_diff'] = df['B_wins'] - df['R_wins'] 
df['losses_diff'] = df['B_losses'] - df['R_losses']
df['rounds_founght_diff'] = df['B_total_rounds_fought'] - df['R_total_rounds_fought'] 
df['total_title_bouts_diff'] = df['B_total_title_bouts'] - df['R_total_title_bouts'] 
df['dec_odds_diff'] = df['b_dec_odds'] - df['r_dec_odds']
df['sub_odds_diff'] = df['b_sub_odds'] - df['r_sub_odds']
df['ko_odds_diff'] = df['b_ko_odds'] - df['r_ko_odds']
df['dec_odds_diff'] = df['b_dec_odds'] - df['r_dec_odds']
df['rank_diff'] = df['B_rank'] - df['R_rank']
df['weight_diff'] = df['B_Weight_lbs'] - df['R_Weight_lbs']

df['avg_sig_str_landed_diff'] = df['B_avg_SIG_STR_landed'] - df['R_avg_SIG_STR_landed']

df['avg_td_landed_diff'] = df['B_avg_TD_landed'] - df['R_avg_TD_landed']

df['win_by_KO/TKO_diff'] = df['B_win_by_KO/TKO'] - df['R_win_by_KO/TKO']
df['win_by_Submission_diff'] = df['B_win_by_Submission'] - df['R_win_by_Submission']





In [10]:
# We can now remove all those columns

var_drop = [
'B_odds',
'R_odds',
'B_ev',
'R_ev',
'B_current_lose_streak', 'R_current_lose_streak',
'B_current_win_streak', 'R_current_win_streak',
'B_longest_win_streak', 'R_longest_win_streak',
'B_wins', 'R_wins',
'B_losses', 'R_losses',
'B_total_rounds_fought', 'R_total_rounds_fought',
'B_total_title_bouts', 'R_total_title_bouts',
'B_win_by_KO/TKO', 'R_win_by_KO/TKO',
'B_win_by_Submission', 'R_win_by_Submission',
'B_Height_cms', 'R_Height_cms',
'B_Reach_cms', 'R_Reach_cms',
'B_age', 'R_age',
'B_avg_SIG_STR_landed', 'R_avg_SIG_STR_landed',
'B_avg_SUB_ATT', 'R_avg_SUB_ATT',
'B_avg_TD_landed', 'R_avg_TD_landed',
'B_draw',
'B_avg_SIG_STR_pct',
'B_avg_TD_pct',
'B_win_by_Decision_Majority',
'B_win_by_Decision_Split',
'B_win_by_Decision_Unanimous',
'B_win_by_TKO_Doctor_Stoppage',
'R_draw',
'R_avg_SIG_STR_pct',
'R_avg_TD_pct',
'R_win_by_Decision_Majority',
'R_win_by_Decision_Split',
'R_win_by_Decision_Unanimous',
'R_win_by_TKO_Doctor_Stoppage',
'r_dec_odds', 'b_dec_odds',
'r_sub_odds','b_sub_odds',
'r_ko_odds','b_ko_odds',
'R_rank', 'B_rank',
'R_Weight_lbs', 'B_Weight_lbs'
]
df = df.drop(var_drop, axis=1)

Let's take a look at our new dataset

In [11]:
print(df.shape)
df

(4896, 60)


Unnamed: 0,R_fighter,B_fighter,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_Stance,R_Stance,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,odds_diff,ev_diff,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,win_by_Decision_Majority_diff,win_by_Decision_Split_diff,win_by_Decision_Unanimous_diff,win_by_TKO_Doctor_Stoppage_diff,current_lose_streak_diff,current_win_streak_diff,longest_win_streak_diff,wins_diff,losses_diff,rounds_founght_diff,total_title_bouts_diff,dec_odds_diff,sub_odds_diff,ko_odds_diff,rank_diff,weight_diff,avg_sig_str_landed_diff,avg_td_landed_diff,win_by_KO/TKO_diff,win_by_Submission_diff
0,Thiago Santos,Johnny Walker,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Light Heavyweight,MALE,5,Orthodox,Orthodox,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.530000,0.600000,-0.370000,1,1,Red,U-DEC,,5.0,5:00,1500.0,280.0,63.333333,0,0.110000,0.0,0,0,-1,0,-3,1,0,-8,-6,-32,-1,100.0,-400.0,285.0,5.0,0,-0.53,-0.370000,-7,0
1,Alex Oliveira,Niko Price,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Welterweight,MALE,3,Orthodox,Orthodox,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.00,-1,2.190000,0.300000,-1.480000,1,1,neither,U-DEC,,3.0,5:00,900.0,-370.0,-120.000000,0,-0.090000,0.0,0,-1,-3,0,0,0,-2,-5,-3,-20,0,-100.0,400.0,-430.0,,0,2.19,-1.480000,0,-1
2,Misha Cirkunov,Krzysztof Jotko,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Middleweight,MALE,3,Southpaw,Orthodox,0,0,1,3,1,25,0,0,-5,-5.08,0.00,-2,-0.850000,-1.600000,-3.330000,1,1,neither,S-DEC,,3.0,5:00,900.0,-240.0,-33.076923,0,-0.080000,0.0,0,2,6,0,0,0,1,3,1,25,0,-275.0,1125.0,-415.0,,-20,-0.85,-3.330000,0,-5
3,Alexander Hernandez,Mike Breeden,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Lightweight,MALE,3,Orthodox,Orthodox,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.250000,0.000000,-1.570000,1,1,neither,KO/TKO,Punch,1.0,1:20,80.0,1150.0,460.185185,0,-0.020000,0.0,0,0,-2,0,0,0,-2,-4,-2,-12,0,725.0,3000.0,990.0,,0,0.25,-1.570000,-2,0
4,Joe Solecki,Jared Gordon,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,Orthodox,Orthodox,0,-2,-2,1,3,11,0,1,-2,0.00,-5.08,5,2.580000,-0.600000,-0.310000,1,1,neither,S-DEC,,3.0,5:00,900.0,250.0,40.925926,0,-0.060000,0.0,0,0,2,0,0,-2,-2,1,3,11,0,35.0,800.0,-300.0,,-10,2.58,-0.310000,1,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Duane Ludwig,Darren Elkins,3/21/2010,"Broomfield, Colorado, USA",USA,Blue,False,Lightweight,MALE,3,Orthodox,Orthodox,1,0,-2,-2,1,-5,0,-1,0,0.00,2.54,6,-13.666667,0.000000,0.000000,0,1,neither,KO/TKO,,1.0,0:44,44.0,290.0,70.483871,0,,,0,0,-1,0,-1,0,-2,-2,-1,-5,0,,,,,-25,,,-1,0
4892,John Howard,Daniel Roberts,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,Southpaw,Orthodox,0,-3,-3,-3,0,-9,0,-1,0,7.62,7.62,-2,-18.000000,-1.000000,-4.666667,0,1,neither,KO/TKO,Punch,1.0,2:01,121.0,385.0,127.380952,0,,,0,-2,0,0,0,-3,-3,-3,0,-9,0,,,,,0,,,-1,0
4893,Brendan Schaub,Chase Gormley,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Heavyweight,MALE,3,Orthodox,Orthodox,0,0,0,0,0,0,-1,0,0,-2.54,-2.12,0,-4.000000,1.000000,1.000000,0,1,neither,KO/TKO,Punches,1.0,0:47,47.0,480.0,181.538462,0,0.090000,0.0,0,0,0,0,0,0,0,0,0,0,-1,,,,,20,-4.00,1.000000,0,0
4894,Mike Pierce,Julio Paulino,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,Orthodox,Orthodox,1,0,-1,-1,1,-6,0,0,0,10.16,7.62,-5,-40.500000,0.000000,-3.500000,0,1,neither,U-DEC,,3.0,5:00,900.0,755.0,311.190476,0,,,0,0,-1,0,-1,0,-1,-1,-1,-6,0,,,,,0,,,0,0


Next, we encode the Stance categorical feature

In [12]:
df.B_Stance.unique()

array(['Orthodox', 'Southpaw', 'Switch', nan, 'Switch ', 'Open Stance'],
      dtype=object)

Let's first correct the typo

In [13]:
# Fix a typo
df['B_Stance'].loc[df['B_Stance']=='Switch '] = 'Switch'

# Give a numerical value to each stance
for x in ['B_Stance', 'R_Stance']:
    df[x] = [2 if st == 'Orthodox'
                           else 3 if st == 'Southpaw'
                           else 4 if st == 'Switch'
                           else 1 for st in df[x]]

#using -1 and 1 for both red and blue so there is no misunderstanding that one variable is better than the other    
df['better_rank'] = [-1 if rank == 'Red'
                               else 1 if rank == 'Blue'
                               else 0 for rank in df['better_rank']]

df['title_bout'] = [1 if tb==True else 0 for tb in df['title_bout']]

df['Winner'] = [1 if winner == 'Red' else 0 for winner in df.Winner]

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,R_fighter,B_fighter,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_Stance,R_Stance,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,odds_diff,ev_diff,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,win_by_Decision_Majority_diff,win_by_Decision_Split_diff,win_by_Decision_Unanimous_diff,win_by_TKO_Doctor_Stoppage_diff,current_lose_streak_diff,current_win_streak_diff,longest_win_streak_diff,wins_diff,losses_diff,rounds_founght_diff,total_title_bouts_diff,dec_odds_diff,sub_odds_diff,ko_odds_diff,rank_diff,weight_diff,avg_sig_str_landed_diff,avg_td_landed_diff,win_by_KO/TKO_diff,win_by_Submission_diff
0,Thiago Santos,Johnny Walker,2021-10-02,"Las Vegas, Nevada, USA",USA,1,0,Light Heavyweight,MALE,5,2,2,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.530000,0.600000,-0.370000,1,1,-1,U-DEC,,5.0,5:00,1500.0,280.0,63.333333,0,0.110000,0.0,0,0,-1,0,-3,1,0,-8,-6,-32,-1,100.0,-400.0,285.0,5.0,0,-0.53,-0.370000,-7,0
1,Alex Oliveira,Niko Price,2021-10-02,"Las Vegas, Nevada, USA",USA,0,0,Welterweight,MALE,3,2,2,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.00,-1,2.190000,0.300000,-1.480000,1,1,0,U-DEC,,3.0,5:00,900.0,-370.0,-120.000000,0,-0.090000,0.0,0,-1,-3,0,0,0,-2,-5,-3,-20,0,-100.0,400.0,-430.0,,0,2.19,-1.480000,0,-1
2,Misha Cirkunov,Krzysztof Jotko,2021-10-02,"Las Vegas, Nevada, USA",USA,0,0,Middleweight,MALE,3,3,2,0,0,1,3,1,25,0,0,-5,-5.08,0.00,-2,-0.850000,-1.600000,-3.330000,1,1,0,S-DEC,,3.0,5:00,900.0,-240.0,-33.076923,0,-0.080000,0.0,0,2,6,0,0,0,1,3,1,25,0,-275.0,1125.0,-415.0,,-20,-0.85,-3.330000,0,-5
3,Alexander Hernandez,Mike Breeden,2021-10-02,"Las Vegas, Nevada, USA",USA,1,0,Lightweight,MALE,3,2,2,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.250000,0.000000,-1.570000,1,1,0,KO/TKO,Punch,1.0,1:20,80.0,1150.0,460.185185,0,-0.020000,0.0,0,0,-2,0,0,0,-2,-4,-2,-12,0,725.0,3000.0,990.0,,0,0.25,-1.570000,-2,0
4,Joe Solecki,Jared Gordon,2021-10-02,"Las Vegas, Nevada, USA",USA,0,0,Lightweight,MALE,3,2,2,0,-2,-2,1,3,11,0,1,-2,0.00,-5.08,5,2.580000,-0.600000,-0.310000,1,1,0,S-DEC,,3.0,5:00,900.0,250.0,40.925926,0,-0.060000,0.0,0,0,2,0,0,-2,-2,1,3,11,0,35.0,800.0,-300.0,,-10,2.58,-0.310000,1,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Duane Ludwig,Darren Elkins,3/21/2010,"Broomfield, Colorado, USA",USA,0,0,Lightweight,MALE,3,2,2,1,0,-2,-2,1,-5,0,-1,0,0.00,2.54,6,-13.666667,0.000000,0.000000,0,1,0,KO/TKO,,1.0,0:44,44.0,290.0,70.483871,0,,,0,0,-1,0,-1,0,-2,-2,-1,-5,0,,,,,-25,,,-1,0
4892,John Howard,Daniel Roberts,3/21/2010,"Broomfield, Colorado, USA",USA,1,0,Welterweight,MALE,3,3,2,0,-3,-3,-3,0,-9,0,-1,0,7.62,7.62,-2,-18.000000,-1.000000,-4.666667,0,1,0,KO/TKO,Punch,1.0,2:01,121.0,385.0,127.380952,0,,,0,-2,0,0,0,-3,-3,-3,0,-9,0,,,,,0,,,-1,0
4893,Brendan Schaub,Chase Gormley,3/21/2010,"Broomfield, Colorado, USA",USA,1,0,Heavyweight,MALE,3,2,2,0,0,0,0,0,0,-1,0,0,-2.54,-2.12,0,-4.000000,1.000000,1.000000,0,1,0,KO/TKO,Punches,1.0,0:47,47.0,480.0,181.538462,0,0.090000,0.0,0,0,0,0,0,0,0,0,0,0,-1,,,,,20,-4.00,1.000000,0,0
4894,Mike Pierce,Julio Paulino,3/21/2010,"Broomfield, Colorado, USA",USA,1,0,Welterweight,MALE,3,2,2,1,0,-1,-1,1,-6,0,0,0,10.16,7.62,-5,-40.500000,0.000000,-3.500000,0,1,0,U-DEC,,3.0,5:00,900.0,755.0,311.190476,0,,,0,0,-1,0,-1,0,-1,-1,-1,-6,0,,,,,0,,,0,0


In [14]:
#Encoding the remaining categorical variables
cat_col = ['R_fighter', 'B_fighter', 'date', 'location', 'country', 'weight_class', 'gender', 'finish', 'finish_details']
enc = LabelEncoder()
for i in df[cat_col]:
    df[i] = enc.fit_transform(df[i])
    
df.drop(['finish_round_time'], axis=1, inplace = True) # we have a column with this value in seconds

# Our label
label = df.Winner
df.drop(['Winner'], axis=1, inplace = True)
df

Unnamed: 0,R_fighter,B_fighter,date,location,country,title_bout,weight_class,gender,no_of_rounds,B_Stance,R_Stance,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,better_rank,finish,finish_details,finish_round,total_fight_time_secs,odds_diff,ev_diff,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,win_by_Decision_Majority_diff,win_by_Decision_Split_diff,win_by_Decision_Unanimous_diff,win_by_TKO_Doctor_Stoppage_diff,current_lose_streak_diff,current_win_streak_diff,longest_win_streak_diff,wins_diff,losses_diff,rounds_founght_diff,total_title_bouts_diff,dec_odds_diff,sub_odds_diff,ko_odds_diff,rank_diff,weight_diff,avg_sig_str_landed_diff,avg_td_landed_diff,win_by_KO/TKO_diff,win_by_Submission_diff
0,1233,746,177,66,26,0,5,1,5,2,2,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.530000,0.600000,-0.370000,1,1,-1,6,31,5.0,1500.0,280.0,63.333333,0,0.110000,0.0,0,0,-1,0,-3,1,0,-8,-6,-32,-1,100.0,-400.0,285.0,5.0,0,-0.53,-0.370000,-7,0
1,42,1158,177,66,26,0,8,1,3,2,2,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.00,-1,2.190000,0.300000,-1.480000,1,1,0,6,31,3.0,900.0,-370.0,-120.000000,0,-0.090000,0.0,0,-1,-3,0,0,0,-2,-5,-3,-20,0,-100.0,400.0,-430.0,,0,2.19,-1.480000,0,-1
2,940,876,177,66,26,0,7,1,3,3,2,0,0,1,3,1,25,0,0,-5,-5.08,0.00,-2,-0.850000,-1.600000,-3.330000,1,1,0,4,31,3.0,900.0,-240.0,-33.076923,0,-0.080000,0.0,0,2,6,0,0,0,1,3,1,25,0,-275.0,1125.0,-415.0,,-20,-0.85,-3.330000,0,-5
3,48,1077,177,66,26,0,6,1,3,2,2,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.250000,0.000000,-1.570000,1,1,0,1,24,1.0,80.0,1150.0,460.185185,0,-0.020000,0.0,0,0,-2,0,0,0,-2,-4,-2,-12,0,725.0,3000.0,990.0,,0,0.25,-1.570000,-2,0
4,615,645,177,66,26,0,6,1,3,2,2,0,-2,-2,1,3,11,0,1,-2,0.00,-5.08,5,2.580000,-0.600000,-0.310000,1,1,0,4,31,3.0,900.0,250.0,40.925926,0,-0.060000,0.0,0,0,2,0,0,-2,-2,1,3,11,0,35.0,800.0,-300.0,,-10,2.58,-0.310000,1,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,373,369,194,26,21,0,6,1,3,2,2,1,0,-2,-2,1,-5,0,-1,0,0.00,2.54,6,-13.666667,0.000000,0.000000,0,1,0,1,31,1.0,44.0,290.0,70.483871,0,,,0,0,-1,0,-1,0,-2,-2,-1,-5,0,,,,,-25,,,-1,0
4892,626,349,194,26,21,0,8,1,3,3,2,0,-3,-3,-3,0,-9,0,-1,0,7.62,7.62,-2,-18.000000,-1.000000,-4.666667,0,1,0,1,24,1.0,121.0,385.0,127.380952,0,,,0,-2,0,0,0,-3,-3,-3,0,-9,0,,,,,0,,,-1,0
4893,163,248,194,26,21,0,4,1,3,2,2,0,0,0,0,0,0,-1,0,0,-2.54,-2.12,0,-4.000000,1.000000,1.000000,0,1,0,1,25,1.0,47.0,480.0,181.538462,0,0.090000,0.0,0,0,0,0,0,0,0,0,0,0,-1,,,,,20,-4.00,1.000000,0,0
4894,924,808,194,26,21,0,8,1,3,2,2,1,0,-1,-1,1,-6,0,0,0,10.16,7.62,-5,-40.500000,0.000000,-3.500000,0,1,0,6,31,3.0,900.0,755.0,311.190476,0,,,0,0,-1,0,-1,0,-1,-1,-1,-6,0,,,,,0,,,0,0


Splitting the data into training and testing set

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(df, label, test_size = 0.3, random_state=2)
print ('Training cases: %d\nTest cases: %d' % (X_train.shape[0], X_valid.shape[0]))


# We impute the missing value with the mean values for the columns
impute = SimpleImputer(strategy = 'mean')
impute.fit(X_train)
X_train = impute.transform(X_train)
X_valid = impute.transform(X_valid)

Training cases: 3427
Test cases: 1469


#### Model: RandomForrest Classifier

In [16]:
RF_model = RandomForestClassifier(random_state=2)
RF_model.fit(X_train, y_train)
print(RF_model)

RandomForestClassifier(random_state=2)


In [17]:
preds = RF_model.predict(X_valid)
predictions = RF_model.predict(X_valid)
print('Accuracy: ', accuracy_score(y_valid, predictions))

Accuracy:  0.6548672566371682


In [18]:
RF_model = RandomForestClassifier(n_estimators=350, max_depth=12, random_state=2)
RF_model.fit(X_train, y_train)
predictions = RF_model.predict(X_valid)
print('Accuracy: ', accuracy_score(y_valid, predictions))


Accuracy:  0.6623553437712729


#### Model: Logistic Regression

In [19]:
# Set regularization rate
reg = 0.01

# train a logistic regression model on the training set
LR_model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)
print(LR_model)

LogisticRegression(C=100.0, solver='liblinear')


In [20]:
predictions = LR_model.predict(X_valid)
print('Accuracy: ', accuracy_score(y_valid, predictions))


Accuracy:  0.6555479918311776


We can see that with both a LogisticRegression model and a RandomForest model, we have an accuracy of 66% in our prediction.