In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
ufc_df = pd.read_csv('ufc_data.csv')
# Change columns name
ufc_df.rename(columns={
    "Fight Type": "fight_type",
    "Time:": "time",
    "Round:":"round",
    "Method:": "method",
    "Time format:": "time_format",
    "Referee:": "referee",
    "Details": "details",
    "Winner": "winner",
    "r_figher": "r_fighter",
    "b_figher": "r_fighter",
    "r_figher_sig_str": "r_fighter_ss",
    "b_figher_sig_str": "b_fighter_ss",
    "r_figher_sig_str %":"r_fighter_ss_pct",
    "b_figher_sig_str %": "b_fighter_ss_pct",
    "r_figher_total_str": "r_fighter_total_str",
    "b_figher_total_str": "b_fighter_total_str"
}, inplace=True)
ufc_df.head()

Unnamed: 0,fight_type,method,round,time,time_format,referee,details,winner,r_fighter,r_fighter.1,r_fighter_ss,b_fighter_ss,r_fighter_ss_pct,b_fighter_ss_pct,r_fighter_total_str,b_fighter_total_str
0,Light Heavyweight Bout,KO/TKO,2,4:29,5 Rnd (5-5-5-5-5),Herb Dean,Elbow to Head At Distance\n Spinning Back...,Jiri Prochazka,Dominick Reyes,Jiri Prochazka,63 of 108,77 of 136,58%,56%,68 of 114,78 of 137
1,Featherweight Bout,KO/TKO,1,1:03,3 Rnd (5-5-5),Jason Herzog,Kick to Body At Distance,Giga Chikadze,Giga Chikadze,Cub Swanson,12 of 18,4 of 6,66%,66%,12 of 18,4 of 6
2,Light Heavyweight Bout,Decision - Split,3,5:00,3 Rnd (5-5-5),Mark Smith,Dave Hagen28 - 29.Rick Winter29 - 28.Tony Week...,draw,Ion Cutelaba,Dustin Jacoby,71 of 152,84 of 148,46%,56%,111 of 201,100 of 164
3,Middleweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Herb Dean,Sal D'amato27 - 30.Junichiro Kamijo27 - 30.Ada...,Sean Strickland,Sean Strickland,Krzysztof Jotko,84 of 182,37 of 196,46%,18%,84 of 182,37 of 196
4,Bantamweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Chris Tognoni,Mike Bell27 - 30.Sal D'amato28 - 29.Rick Winte...,Merab Dvalishvili,Merab Dvalishvili,Cody Stamann,71 of 192,48 of 134,36%,35%,178 of 306,73 of 163


In [34]:
# Work on copied data
df = ufc_df.copy()

## Data cleaning and preprocessing
- convert b_fighter_sig_str % and b_fighter_sig_str % columns to int

In [35]:
# convert object to number
def to_int(number):
    if number == "---":
        return pd.to_numeric(0)
    else:
        return pd.to_numeric(number.replace("%", ""))

# convert first number of siginificant strike and total fight
def get_first_number(number):
    first_num = number.split('of')[0]
    return to_int(first_num)

In [36]:
# convert to int and datetime
df['b_fighter_ss_pct'] = df['b_fighter_ss_pct'].apply(lambda x: to_int(x))
df['r_fighter_ss_pct'] = df['r_fighter_ss_pct'].apply(lambda x: to_int(x))
df['time'] = pd.to_datetime(df['time']).dt.time

# keep only the first number of these columns and convert them to int
columns = ['r_fighter_ss', 'b_fighter_ss', 'r_fighter_total_str', 'b_fighter_total_str']
for column in columns:
    df[column] = df[column].apply(lambda x: get_first_number(x))
df.head()

Unnamed: 0,fight_type,method,round,time,time_format,referee,details,winner,r_fighter,r_fighter.1,r_fighter_ss,b_fighter_ss,r_fighter_ss_pct,b_fighter_ss_pct,r_fighter_total_str,b_fighter_total_str
0,Light Heavyweight Bout,KO/TKO,2,04:29:00,5 Rnd (5-5-5-5-5),Herb Dean,Elbow to Head At Distance\n Spinning Back...,Jiri Prochazka,Dominick Reyes,Jiri Prochazka,63,77,58,56,68,78
1,Featherweight Bout,KO/TKO,1,01:03:00,3 Rnd (5-5-5),Jason Herzog,Kick to Body At Distance,Giga Chikadze,Giga Chikadze,Cub Swanson,12,4,66,66,12,4
2,Light Heavyweight Bout,Decision - Split,3,05:00:00,3 Rnd (5-5-5),Mark Smith,Dave Hagen28 - 29.Rick Winter29 - 28.Tony Week...,draw,Ion Cutelaba,Dustin Jacoby,71,84,46,56,111,100
3,Middleweight Bout,Decision - Unanimous,3,05:00:00,3 Rnd (5-5-5),Herb Dean,Sal D'amato27 - 30.Junichiro Kamijo27 - 30.Ada...,Sean Strickland,Sean Strickland,Krzysztof Jotko,84,37,46,18,84,37
4,Bantamweight Bout,Decision - Unanimous,3,05:00:00,3 Rnd (5-5-5),Chris Tognoni,Mike Bell27 - 30.Sal D'amato28 - 29.Rick Winte...,Merab Dvalishvili,Merab Dvalishvili,Cody Stamann,71,48,36,35,178,73


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   fight_type           571 non-null    object
 1   method               571 non-null    object
 2   round                571 non-null    int64 
 3   time                 571 non-null    object
 4   time_format          571 non-null    object
 5   referee              564 non-null    object
 6   details              568 non-null    object
 7   winner               571 non-null    object
 8   r_fighter            571 non-null    object
 9   r_fighter            571 non-null    object
 10  r_fighter_ss         571 non-null    int64 
 11  b_fighter_ss         571 non-null    int64 
 12  r_fighter_ss_pct     571 non-null    int64 
 13  b_fighter_ss_pct     571 non-null    int64 
 14  r_fighter_total_str  571 non-null    int64 
 15  b_fighter_total_str  571 non-null    int64 
dtypes: int64

In [40]:
df.describe()

Unnamed: 0,round,r_fighter_ss,b_fighter_ss,r_fighter_ss_pct,b_fighter_ss_pct,r_fighter_total_str,b_fighter_total_str
count,571.0,571.0,571.0,571.0,571.0,571.0,571.0
mean,2.458844,46.588441,42.120841,49.812609,47.93345,65.751313,58.544658
std,1.057789,39.986232,34.686179,15.007422,16.170336,51.987831,45.053684
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,17.0,16.0,41.0,38.0,25.5,23.0
50%,3.0,39.0,35.0,49.0,48.0,58.0,50.0
75%,3.0,66.0,58.0,58.0,56.0,93.0,85.5
max,5.0,445.0,189.0,100.0,100.0,447.0,215.0
