# Pre-processing fighter statistics

In [1]:
# import

import numpy as np
from numpy import NaN
import pandas as pd
import datetime as DT
import io
import re
from distutils.log import error


In [2]:
# load csv file

df = pd.read_csv('fighter_scraper/fighterstemp.csv')

In [3]:
df=df.drop_duplicates()

In [4]:
df.dtypes

name           object
win             int64
loss            int64
tie            object
height_ft      object
weight_lbs     object
reach_inch     object
stance         object
dob            object
slpm          float64
str_acc        object
sapm          float64
str_def        object
tdavg         float64
tdacc          object
tddef          object
subavg        float64
dtype: object

In [5]:
df.isna().sum()

name            0
win             0
loss            0
tie             0
height_ft       0
weight_lbs      0
reach_inch      0
stance        306
dob             0
slpm            0
str_acc         0
sapm            0
str_def         0
tdavg           0
tdacc           0
tddef           0
subavg          0
dtype: int64

In [6]:
# feature engineering

df['w/l_ratio'] = (df['win'] / (df['loss'] + df['win'])).round(2)

In [7]:
#get age

now = pd.Timestamp('now')

df['dob'] = [x.replace(",", "") for x in df['dob']]

df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

df['dob'] = df['dob'].where(df['dob'] < now, df['dob'] -  np.timedelta64(100, 'Y'))
df['age'] = (now - df['dob']).astype('<m8[Y]')
df=df.drop(['dob'], axis=1)


In [8]:
#feet to inches

# the algorithm I will use late will not understand the concept of feet and inches need to convert to incremental data format
r = re.compile(r"([0-9]+)' ([0-9]*\.?[0-9]+)\"")
def get_inches(el):
    m = r.match(el)
    if m == None:
        return float('NaN')
    else:
        return int(m.group(1))*12 + float(m.group(2))
    

#run function to create new column
df['inches'] = df['height_ft'].apply(get_inches)

#drop old column

df = df.drop(['height_ft'], axis=1)

In [9]:
# get rid of % 
# do this is scraping if can 

df['str_acc'] = [x.replace("%", "") for x in df['str_acc']]
df['str_def'] = [x.replace("%", "") for x in df['str_def']]
df['tdacc'] = [x.replace("%", "") for x in df['tdacc']]
df['tddef'] = [x.replace("%", "") for x in df['tddef']]

In [10]:
# converting to proper data types

df = df.convert_dtypes()

df.dtypes # the way the data was scraped some nulls were scraped as strings

name           string
win             Int64
loss            Int64
tie            string
weight_lbs     string
reach_inch     string
stance         string
slpm          Float64
str_acc        string
sapm          Float64
str_def        string
tdavg         Float64
tdacc          string
tddef          string
subavg        Float64
w/l_ratio     Float64
age             Int64
inches          Int64
dtype: object

In [11]:
# replace scraped nulls with NaN

df=df.apply(lambda x: x.replace('--', NaN))

In [13]:
# function converts string data types into numeric data types

def numeric_func(x):
    for col in x.columns:
        try:
            x[col] = pd.to_numeric(x[col])
        except:
            pass

In [14]:
numeric_func(df)

In [15]:
df.dtypes

name           string
win             Int64
loss            Int64
tie            string
weight_lbs    float64
reach_inch    float64
stance         string
slpm          Float64
str_acc         int64
sapm          Float64
str_def         int64
tdavg         Float64
tdacc           int64
tddef           int64
subavg        Float64
w/l_ratio     Float64
age             Int64
inches          Int64
dtype: object

In [16]:

df['weight_lbs'] = pd.to_numeric(df['weight_lbs'])
df['reach_inch'] = pd.to_numeric(df['reach_inch'])



In [17]:
#get dummies for categorical variables

#might need to do this eventually if I want to use a type of machine learnng algorithm

pd.get_dummies(df['stance'], prefix='stance')

Unnamed: 0,stance_Open Stance,stance_Orthodox,stance_Sideways,stance_Southpaw,stance_Switch
0,0,1,0,0,0
18,0,1,0,0,0
21,0,1,0,0,0
36,0,1,0,0,0
48,0,1,0,0,0
...,...,...,...,...,...
53585,0,1,0,0,0
53618,0,1,0,0,0
53627,0,1,0,0,0
53642,0,1,0,0,0


In [33]:
#counting the 'ties'

df.groupby('tie').count()

Unnamed: 0_level_0,name,win,loss,weight_lbs,reach_inch,stance,slpm,str_acc,sapm,str_def,tdavg,tdacc,tddef,subavg,w/l_ratio,age,inches
tie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,2251,2251,2251,2219,1324,2005,2251,2251,2251,2251,2251,2251,2251,2251,2250,1919,2139
0 (1 NC),277,277,277,277,193,265,277,277,277,277,277,277,277,277,277,260,271
0 (2 NC),27,27,27,27,19,26,27,27,27,27,27,27,27,27,27,26,27
0 (3 NC),4,4,4,4,2,4,4,4,4,4,4,4,4,4,4,4,4
0 (4 NC),1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
1,321,321,321,319,157,295,321,321,321,321,321,321,321,321,321,284,315
1 (1 NC),61,61,61,61,33,53,61,61,61,61,61,61,61,61,61,46,60
1 (2 NC),14,14,14,14,8,13,14,14,14,14,14,14,14,14,14,12,14
10,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2
10 (1 NC),1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1


In [37]:
# a tie is a true tie, an NC is a no contest and the fight is stopped

def no_no_contest(x):
    rgx='(\d+)'
    return re.findall(rgx, x)[0]

In [21]:
text = '4 (1 NC)'

In [28]:
import re

In [36]:
rgx='(\d+)'

re.findall(rgx, text)[0]

'4'

In [None]:
data.to_csv('fighters--cleaned.csv', index=False)

NameError: name 'df' is not defined