# Import libraries

In [2]:
import numpy as np
import pandas as pd

import requests

from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from tqdm import tqdm
from time import sleep

import random
import pickle
import datetime

import psycopg2 as pg2

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

We want an efficient way to update and ultimately automate appending new data into our table.

Ufc events usually happen every week.

A way to check for new athletes and fights that happened and update our table is to check if an event happaned,

Then if it did scrape tha names of the athletes participated.

if some of those names are not in our list we should update that list, and scrape statistics.

Then for the rest of the athletes we should scrape for only those fights

# Find the last events scraped so far

In [9]:
conn = pg2.connect(host="localhost", database='PostgreSQL_mma', user='postgres', password='password')


df = pd.read_sql_query("""
SELECT DISTINCT date, event
FROM mma
ORDER BY date DESC
LIMIT 5

""", conn)

conn.close()

df

Unnamed: 0,date,event
0,2023-02-04,UFC Fight Night
1,2023-01-21,UFC 283
2,2023-01-14,UFC Fight Night
3,2022-12-17,UFC Fight Night
4,2022-12-10,UFC 282


We want to scrape from UFC 284 MAKHACHEV VS VOLKANOVSKI Sun, Feb 12 and after (right now 7 events total)

Our way will be for the latest event so we can automate it later without much change

When thats done we can scrape the other fights too

In [10]:
'''
We want to open the page, deny cookies, click past events and get what events we have,
We dont need to interact with the load more button.

'''

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # Open chrome

driver.get("https://www.ufc.com/events") # Go to the webpage

driver.find_element(By.XPATH, '//*[@id="onetrust-reject-all-handler"]').click() # Deny cookies

sleep(1 + random.random())

driver.find_element(By.XPATH, '//*[@id="block-mainpagecontent"]/div/div/div[5]/div/ul/li[2]/a/strong').click() # Past events

# Get the page source and create a BeautifulSoup object
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")

# Find all anchor tags with a "Recap" button
recap_buttons = soup.find_all("a", class_="e-button--white")
recap_links = [button.get("href") for button in recap_buttons if "Recap" in button.get_text()]
recap_links = ['https://www.ufc.com' + link for link in recap_links]

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.79M/6.79M [00:04<00:00, 1.59MB/s]


In [11]:
recap_links

['https://www.ufc.com/event/ufc-fight-night-march-25-2023#1133',
 'https://www.ufc.com/event/ufc-286#1132',
 'https://www.ufc.com/event/ufc-fight-night-march-11-2023#1131',
 'https://www.ufc.com/event/ufc-285#1130',
 'https://www.ufc.com/event/ufc-fight-night-february-25-2023#1129',
 'https://www.ufc.com/event/ufc-fight-night-february-18-2023#1128',
 'https://www.ufc.com/event/ufc-284#1126',
 'https://www.ufc.com/event/ufc-fight-night-february-04-2023#1134']

## For multiple events we run this

#### We care for athletes names and from them we can create their ulrs as there are some mistakes on the webpage on the links

In [15]:
fought_recently = []

for rec_link in recap_links:

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # Open chrome

    driver.get(rec_link) # Go to the webpage

    driver.find_element(By.XPATH, '//*[@id="onetrust-reject-all-handler"]').click() # Deny cookies

    sleep(1 + random.random())

    # Get the page source and create a BeautifulSoup object
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")


    # Returns a elements which have '/athlete/' in them and also span elements for given and family name
    a_elements = soup.select('a[href*="/athlete/"] span.c-listing-fight__corner-given-name, a[href*="/athlete/"] span.c-listing-fight__corner-family-name')

    # Given and family name span elements
    temp_list = [elem.text for elem in a_elements]



    '''
    The athletes which have more than one family or given name have them in the same span
    So we can use the line below to get pairs of Given and Family name

    '''
    # Use a list comprehension to concatenate each pair of elements into a single string
    temp_list = [temp_list[i] + ' ' + temp_list[i+1] for i in range(0, len(temp_list), 2)]
    
    # Append those names to our final list
    fought_recently += temp_list
    
    sleep(3 + random.random())
    
fought_recently = list(set(fought_recently))

driver.close()

In [16]:
fought_recently

['Yana Santos',
 'Blagoy Ivanov',
 'Trevin Giles',
 'Mana Martinez',
 'Trevor Peek',
 'Jared Gooden',
 'Geoff Neal',
 'CJ Vergara',
 'Marc-Andre Barriault',
 'Francisco Prado',
 'AJ Fletcher',
 'Toshiomi Kazama',
 'Farid Basharat',
 'Jordan Leavitt',
 'Nurullo Aliev',
 'Jack Shore',
 "Ode' Osbourne",
 'Steven Peterson',
 'Elves Brener',
 'Serghei Spivac',
 'Jamie Pickett',
 'Holly Holm',
 'Said Nurmagomedov',
 'Lucas Alexander',
 'Valentina Shevchenko',
 'Joanne Wood',
 'Kyle Nelson',
 'Cameron Saaiman',
 'Trevin Jones',
 "Da'Mon Blackshear",
 'Erin Blanchfield',
 'Randy Brown',
 'JeongYeong Lee',
 'Vitor Petrino',
 'Tony Gravely',
 'Yusaku Kinoshita',
 'Vinicius Salvador',
 'Guido Cannetti',
 'Jennifer Maia',
 'Jasmine Jasudavicius',
 'Dooho Choi',
 'Lerone Murphy',
 'Alexander Volkov',
 "Don'Tale Mayes",
 'Lina Lansberg',
 'Justin Tafa',
 'Nate Landwehr',
 'Jeka Saragih',
 'Marcin Prachnio',
 'Josh Fremd',
 'Nikita Krylov',
 'Jafel Filho',
 'Julian Marquez',
 'Jordan Wright',
 'Clayt

# Write a script for new additions

### Now we can check if someone is new here based on our first list we scraped in the previews notebook

In [13]:
with open("D:/data_projects/mma/updates/athletes_names_list.pickle", "rb") as file:
    names = pickle.load(file)
    
    
# Names that are in the fought recently list but not in the fought past list

new_names = list(set(fought_recently).difference(names))

In [3]:
if len(new_names) != 0: # If the list is not empty
    names += new_names # Append the new names
    
    # Overwrite the updated names list to the updates folder
    with open("D:/data_projects/mma/updates/athletes_names_list.pickle", "wb") as file:
        pickle.dump(names, file)
     
    ######################################################################################################################
  
    ######################################################################################################################
    
    # Espn  
    try:
        new_fighters_espn_urls = []

        for name in tqdm(new_names):

            # Go to the url based on alphabet letter
            main_link = 'http://www.espn.com/mma/fighters?search=' + name.split()[-1].lower()[0]
            headers = {'User-agent': ''}
            response = requests.get(main_link, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')


            temp_list = [url for url in soup.find_all('a') if url is not None]
            temp_list = ['https://www.espn.com' + url.get('href') for url in temp_list if '/mma/fighter/_/id/' in url.get('href')]

            temp_names = ['-'.join(name.lower().split()) for name in new_names]

            temp_links = [link for link in temp_list if any(name in link for name in temp_names)]

            new_fighters_espn_urls += temp_links
    
        with open("D:/data_projects/mma/updates/athletes_espn_url.pickle", "rb") as file:
            temp_espn_url = pickle.load(file)
            
        temp_espn_url += new_fighters_espn_urls
        
        with open("D:/data_projects/mma/updates/athletes_espn_url.pickle", "wb") as file:
            pickle.dump(temp_espn_url, file)
    except:
        pass 

## Create links for those who fought recently

In [37]:
fought_recently

['Yana Santos',
 'Blagoy Ivanov',
 'Trevin Giles',
 'Mana Martinez',
 'Trevor Peek',
 'Jared Gooden',
 'Geoff Neal',
 'CJ Vergara',
 'Marc-Andre Barriault',
 'Francisco Prado',
 'AJ Fletcher',
 'Toshiomi Kazama',
 'Farid Basharat',
 'Jordan Leavitt',
 'Nurullo Aliev',
 'Jack Shore',
 "Ode' Osbourne",
 'Steven Peterson',
 'Elves Brener',
 'Serghei Spivac',
 'Jamie Pickett',
 'Holly Holm',
 'Said Nurmagomedov',
 'Lucas Alexander',
 'Valentina Shevchenko',
 'Joanne Wood',
 'Kyle Nelson',
 'Cameron Saaiman',
 'Trevin Jones',
 "Da'Mon Blackshear",
 'Erin Blanchfield',
 'Randy Brown',
 'JeongYeong Lee',
 'Vitor Petrino',
 'Tony Gravely',
 'Yusaku Kinoshita',
 'Vinicius Salvador',
 'Guido Cannetti',
 'Jennifer Maia',
 'Jasmine Jasudavicius',
 'Dooho Choi',
 'Lerone Murphy',
 'Alexander Volkov',
 "Don'Tale Mayes",
 'Lina Lansberg',
 'Justin Tafa',
 'Nate Landwehr',
 'Jeka Saragih',
 'Marcin Prachnio',
 'Josh Fremd',
 'Nikita Krylov',
 'Jafel Filho',
 'Julian Marquez',
 'Jordan Wright',
 'Clayt

In [75]:
# Espn has the non static data which are the ones we will scrape frequently

with open("D:/data_projects/mma/updates/athletes_espn_url.pickle", "rb") as file:
    espn_url = pickle.load(file)
    
temp_list = ['-'.join(name.lower().split()) for name in fought_recently]

recent_espn_url = [url for url in espn_url if any(name in url for name in temp_list)]

In [76]:
from UFC_functions import get_fighters_df

In [77]:
espn_df = pd.concat(list(map(get_fighters_df, tqdm(recent_espn_url))))

100%|████████████████████████████████████████████████████████████████████████████████| 176/176 [25:27<00:00,  8.68s/it]


In [78]:
espn_df = espn_df[espn_df['Event'].str.contains('UFC')]

### Lets do a test

In [79]:
# Like before there might be generated some columns like 'SDBL/A_x', 'SDBL/A_y' etc
# We want to find them if they exist, so i'll do a test for 'TSL-TSA' and then apply it for those we want

[col.split('-') for col in espn_df.columns if len(col.split('-')) > 1]

[['TSL', 'TSA'], ['TSL', 'TSA_x'], ['TSL', 'TSA_y']]

In [80]:
[col for col in espn_df.columns if (len(col.split('_')) > 1)]

['birth_date',
 'SDBL/A_x',
 'SDHL/A_x',
 'SDLL/A_x',
 'TSL_x',
 'TSA_x',
 'SSL_x',
 'SSA_x',
 'TSL-TSA_x',
 'KD_x',
 '%BODY_x',
 '%HEAD_x',
 '%LEG_x',
 'SDBL/A_y',
 'SDHL/A_y',
 'SDLL/A_y',
 'TSL_y',
 'TSA_y',
 'SSL_y',
 'SSA_y',
 'TSL-TSA_y',
 'KD_y',
 '%BODY_y',
 '%HEAD_y',
 '%LEG_y']

In [81]:
# Exclude birth_date

[col for col in espn_df.columns if (len(col.split('_')) > 1) and (len(col.split('_')[1]) == 1)]

['SDBL/A_x',
 'SDHL/A_x',
 'SDLL/A_x',
 'TSL_x',
 'TSA_x',
 'SSL_x',
 'SSA_x',
 'TSL-TSA_x',
 'KD_x',
 '%BODY_x',
 '%HEAD_x',
 '%LEG_x',
 'SDBL/A_y',
 'SDHL/A_y',
 'SDLL/A_y',
 'TSL_y',
 'TSA_y',
 'SSL_y',
 'SSA_y',
 'TSL-TSA_y',
 'KD_y',
 '%BODY_y',
 '%HEAD_y',
 '%LEG_y']

In [82]:
# We can write the same for those we want with the underscore and exclude the 'birth_date'

excessive_columns = [col for col in espn_df.columns if (len(col.split('_')) > 1) and (len(col.split('_')[1]) == 1)]

In [83]:
espn_df.columns

Index(['name', 'country', 'division', 'height', 'weight', 'birth_date',
       'stance', 'reach', 'Date', 'Opponent', 'Event', 'Res.', 'SDBL/A',
       'SDHL/A', 'SDLL/A', 'TSL', 'TSA', 'SSL', 'SSA', 'TSL-TSA', 'KD',
       '%BODY', '%HEAD', '%LEG', 'SCBL', 'SCBA', 'SCHL', 'SCHA', 'SCLL',
       'SCLA', 'RV', 'SR', 'TDL', 'TDA', 'TDS', 'TK ACC', 'SGBL', 'SGBA',
       'SGHL', 'SGHA', 'SGLL', 'SGLA', 'AD', 'ADTB', 'ADHG', 'ADTM', 'ADTS',
       'SM', 'Decision', 'Rnd', 'Time', 'SDBL/A_x', 'SDHL/A_x', 'SDLL/A_x',
       'TSL_x', 'TSA_x', 'SSL_x', 'SSA_x', 'TSL-TSA_x', 'KD_x', '%BODY_x',
       '%HEAD_x', '%LEG_x', 'SDBL/A_y', 'SDHL/A_y', 'SDLL/A_y', 'TSL_y',
       'TSA_y', 'SSL_y', 'SSA_y', 'TSL-TSA_y', 'KD_y', '%BODY_y', '%HEAD_y',
       '%LEG_y'],
      dtype='object')

In [84]:
# If more columns got generated than we wanted
if len(excessive_columns) != 0 :
    
    from UFC_functions import fixing_collumns
    
    for col_to_fix in excessive_columns:
    
        espn_df[col_to_fix.split('_')[0]] = fixing_collumns(col_to_fix.split('_')[0], col_to_fix, espn_df)

    # Drop the columns we dont need anymore
    espn_df.drop(excessive_columns, axis=1, inplace = True)

# Links and dataframe from official ufc site

In [85]:
ufc_url_list = ['https://www.ufc.com/athlete/' + '-'.join(name.lower().split()) for name in fought_recently]

In [86]:
from UFC_functions import get_stats_ufc

ufc_df = pd.concat(list(map(get_stats_ufc, tqdm(ufc_url_list))), ignore_index = True)

100%|████████████████████████████████████████████████████████████████████████████████| 175/175 [11:19<00:00,  3.89s/it]


In [87]:
ufc_df

Unnamed: 0,name,hometown,fighting_style,height,weight,reach,leg_reach
0,Yana Santos,Russia,MMA,66.00,146.00,68.50,40.00
1,Blagoy Ivanov,"Sofia, Bulgaria",MMA,71.00,256.50,73.00,42.00
2,Trevin Giles,"San Antonio, United States",Freestyle,72.00,185.00,74.00,41.00
3,Mana Martinez,"Houston, United States",Karate,70.00,150.20,70.00,41.00
4,Trevor Peek,"Scottsboro, United States",,69.00,155.50,70.00,41.50
...,...,...,...,...,...,...,...
170,Nazim Sadykhov,Russia,,70.00,155.50,69.00,41.00
171,Juancamilo Ronderos,"Bogota, Colombia",MMA,63.00,125.00,64.50,34.00
172,Kamaru Usman,"Auchi, Nigeria",Freestyle,72.00,190.00,76.00,41.00
173,Mike Malott,"Cleveland, United States",MMA,73.00,171.00,73.00,41.00


# Links and dataframe from ufcstats

In [88]:
# Get first character from last name for the first fighter
fought_recently[0].split()[-1].lower()[0]

's'

In [89]:
# Link for fighters which their lats names start with v
'http://ufcstats.com/statistics/fighters?char=' + fought_recently[0].split()[-1].lower()[0] + '&page=all'

'http://ufcstats.com/statistics/fighters?char=s&page=all'

### We use this to get their ulrs

In [90]:
ufcstats_list = []

for name in tqdm(fought_recently):

    # Go to the url based on alphabet letter
    main_link = 'http://ufcstats.com/statistics/fighters?char=' + name.split()[-1].lower()[0] + '&page=all'
    headers = {'User-agent': ''}
    response = requests.get(main_link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    try:
        # Find the rows which have names and links
        temp_list = ([row.find_all('a', {'class': 'b-link b-link_style_black'}) 
                      for row in soup.find_all('tr', {'class': 'b-statistics__table-row'})])
        # And are not empty
        temp_list = [sublist for sublist in temp_list if len(sublist) > 0]

        # Match the name we are looking for with its link
        html = [sublist for sublist in temp_list if sublist[0].text + ' ' + sublist[1].text == name][0][0]

        # Get the link and append it to a list
        soup = BeautifulSoup(str(html), 'html.parser')
        link = soup.a.get('href')
        ufcstats_list.append(link)
    except:
        continue
    
    sleep(3 + random.random())

100%|████████████████████████████████████████████████████████████████████████████████| 175/175 [12:43<00:00,  4.36s/it]


### Now that we got our links we can scrape the data

In [91]:
from UFC_functions import get_stats_ufcstats

ufcstats_df = pd.concat(list(map(get_stats_ufcstats, tqdm(ufcstats_list))), ignore_index = True)

100%|████████████████████████████████████████████████████████████████████████████████| 167/167 [11:14<00:00,  4.04s/it]


In [92]:
ufcstats_df

Unnamed: 0,name,height,weight,reach,stance,date_of_birth
0,Yana Santos,"5' 6""",135 lbs.,"68""",Orthodox,"Nov 11, 1989"
1,Blagoy Ivanov,"5' 11""",250 lbs.,"73""",Southpaw,"Oct 09, 1986"
2,Trevin Giles,"6' 0""",170 lbs.,"74""",Orthodox,"Aug 06, 1992"
3,Mana Martinez,"5' 10""",135 lbs.,"70""",Orthodox,"Mar 25, 1996"
4,Trevor Peek,"5' 9""",155 lbs.,"70""",Switch,"Jan 09, 1995"
...,...,...,...,...,...,...
162,Nazim Sadykhov,"5' 10""",155 lbs.,"69""",Southpaw,"May 16, 1994"
163,Juancamilo Ronderos,"5' 3""",125 lbs.,"64""",Southpaw,"Feb 17, 1995"
164,Kamaru Usman,"6' 0""",170 lbs.,"76""",Switch,"May 11, 1987"
165,Mike Malott,"6' 1""",170 lbs.,"73""",Orthodox,"Nov 07, 1991"


# Now we can combine our 3 dataframes

In [93]:
# We do outer merge to keep all information and combine the same columns

df = pd.merge(
        espn_df, pd.merge(ufc_df, ufcstats_df, how='outer', on='name'),
            how='outer', on='name'
        )

In [94]:
df

Unnamed: 0,name,country,division,height,weight,birth_date,stance_x,reach,Date,Opponent,Event,Res.,SDBL/A,SDHL/A,SDLL/A,TSL,TSA,SSL,SSA,TSL-TSA,KD,%BODY,%HEAD,%LEG,SCBL,SCBA,SCHL,SCHA,SCLL,SCLA,RV,SR,TDL,TDA,TDS,TK ACC,SGBL,SGBA,SGHL,SGHA,SGLL,SGLA,AD,ADTB,ADHG,ADTM,ADTS,SM,Decision,Rnd,Time,hometown,fighting_style,height_x,weight_x,reach_x,leg_reach,height_y,weight_y,reach_y,stance_y,date_of_birth
0,Kyle Nelson,Canada,Featherweight,"5' 11""",145 lbs,4/20/1991 (31),Switch,,"Feb 4, 2023",Dooho Choi,UFC Fight Night,D,6/9,8/38,3/7,39,80,20,58,48.75%,0,70%,24%,43%,1,1,2,3,0,0,0,0.000,5,10,0,50%,0,0,0,0,0,0,0,0,0,0,0,0,Draw,3,5:00,"Huntsville, Canada",Jiu-Jitsu,71.00,145.50,71.00,41.00,"5' 11""",145 lbs.,"71""",Switch,"Apr 20, 1991"
1,Kyle Nelson,Canada,Featherweight,"5' 11""",145 lbs,4/20/1991 (31),Switch,,"Jul 23, 2022",Jai Herbert,UFC Fight Night,L,7/9,6/23,15/15,55,77,41,62,71.43%,0,85%,44%,100%,4,4,9,11,0,0,0,0.000,0,4,0,0%,0,0,0,0,0,0,0,0,0,0,0,0,Decision - Unanimous,3,5:00,"Huntsville, Canada",Jiu-Jitsu,71.00,145.50,71.00,41.00,"5' 11""",145 lbs.,"71""",Switch,"Apr 20, 1991"
2,Kyle Nelson,Canada,Featherweight,"5' 11""",145 lbs,4/20/1991 (31),Switch,,"Sep 12, 2020",Billy Quarantillo,UFC Fight Night,L,10/17,30/75,3/3,62,120,56,114,51.67%,0,65%,44%,100%,3,3,7,13,0,0,0,0.000,0,4,0,0%,0,0,3,3,0,0,0,0,0,0,0,0,KO/TKO,3,0:07,"Huntsville, Canada",Jiu-Jitsu,71.00,145.50,71.00,41.00,"5' 11""",145 lbs.,"71""",Switch,"Apr 20, 1991"
3,Kyle Nelson,Canada,Featherweight,"5' 11""",145 lbs,4/20/1991 (31),Switch,,"Sep 21, 2019",Polo Reyes,UFC Fight Night,W,1/1,1/2,0/0,12,16,12,16,75.00%,0,100%,67%,0%,3,3,7,10,0,0,0,0.000,0,2,0,0%,0,0,0,0,0,0,0,0,0,0,0,0,KO/TKO,1,1:36,"Huntsville, Canada",Jiu-Jitsu,71.00,145.50,71.00,41.00,"5' 11""",145 lbs.,"71""",Switch,"Apr 20, 1991"
4,Kyle Nelson,Canada,Featherweight,"5' 11""",145 lbs,4/20/1991 (31),Switch,,"May 4, 2019",Matt Sayles,UFC Fight Night,L,8/9,16/77,0/0,32,103,26,93,31.07%,0,73%,22%,100%,0,2,2,5,0,0,0,0.000,2,10,0,20%,0,0,0,0,0,0,1,1,0,0,0,4,Submission,3,3:16,"Huntsville, Canada",Jiu-Jitsu,71.00,145.50,71.00,41.00,"5' 11""",145 lbs.,"71""",Switch,"Apr 20, 1991"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,Gabriel Santos,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"State of Ceará, Brazil",MMA,69.00,145.50,70.00,,"5' 9""",145 lbs.,"70""",Orthodox,"Nov 28, 1996"
1188,Yi Zha,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Aba Sichuan, China",Striker,67.00,145.50,70.50,36.50,,,,,
1189,Shannon Ross,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Sydney, Australia",Freestyle,66.00,125.50,-0.02,,"5' 6""",125 lbs.,"66""",Switch,"May 12, 1989"
1190,Loik Radzhabov,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Tajikistan,Boxing,71.00,155.50,69.00,39.00,"5' 11""",155 lbs.,"69""",Orthodox,"Sep 17, 1990"


In [95]:
df.isnull().sum()

name                0
country            20
division           20
height             20
weight             20
birth_date         20
stance_x           20
reach             962
Date               20
Opponent           20
Event              20
Res.               20
SDBL/A             20
SDHL/A             20
SDLL/A             20
TSL                20
TSA                20
SSL                20
SSA                20
TSL-TSA            20
KD                 20
%BODY              20
%HEAD              20
%LEG               20
SCBL               20
SCBA               20
SCHL               20
SCHA               20
SCLL               20
SCLA               20
RV                 20
SR                 20
TDL                20
TDA                20
TDS                20
TK ACC             20
SGBL               22
SGBA               22
SGHL               22
SGHA               22
SGLL               22
SGLA               22
AD                 22
ADTB               22
ADHG               22
ADTM      

In [96]:
with open("D:/data_projects/mma/updates/not_clean_df.pickle", "wb") as file:
    pickle.dump(df, file)

In [97]:
with open("D:/data_projects/mma/updates/not_clean_df.pickle", "rb") as file:
    df = pickle.load(file)

# Data cleaning exactly like in the first notebook

In [98]:
# There are some '-', '--' values in our dataframe as there were on the sites which where scraped, we'll replace them with nan

df.replace('-', np.nan, inplace=True)
df.replace('--', np.nan, inplace=True)

In [99]:
df = df[~df['division'].isnull()] # Remove the one row with no information

# Change height from feet and inches to meters
df['height'] = (df['height'].str.replace('\'', '').str.split(' ').str[0].astype(float) * 0.3048 + 
                df['height'].str.replace('"', '').str.split(' ').str[1].astype(float) * 0.0254)

df['weight'] = df['weight'].str.split(' ').str[1] # Remove lbs from weight

# Keep only the birth year
df['birth_date'] = df['birth_date'].str.split('/').str[-1].str.split(' ').str[0].astype(float).round().astype(pd.Int64Dtype())

df['reach'] = df['reach'].str[:-1].astype(float) * 0.0254 # Convert reach from inches to meters

df['Date'] = pd.to_datetime(df['Date'], format='%m %d, %Y', infer_datetime_format=True) # Convert to year-month-day

In [100]:
df['TSL-TSA'] = df['TSL-TSA'].str[:-1] # Remove %
df.rename(columns={"TSL-TSA": "%TSL-TSA"}, inplace=True) # Rename the column

df['TK ACC'] = df['TK ACC'].str[:-1] # Remove %
df.rename(columns={"TK ACC": "%TK ACC"}, inplace=True) # Rename the column

df['%BODY'] = df['%BODY'].str[:-1] # Remove %
df['%HEAD'] = df['%HEAD'].str[:-1] # Remove %
df['%LEG'] = df['%LEG'].str[:-1] # Remove %

# Time to seconds
df['Time'] = df['Time'].str.split(':').str[0].astype(float) * 60 + df['Time'].str.split(':').str[1].astype(float) 

df['height_x'] = df['height_x'].astype(float) * 0.0254 # Inches to meters
df['reach_x'] = df['reach_x'].astype(float) * 0.0254 # Inches to meters
df['leg_reach'] = df['leg_reach'].astype(float) * 0.0254 # Inches to meters

# Change height to meters
df['height_y'] = (df['height_y'].str.replace('\'', '').str.split(' ').str[0].astype(float) * 0.3048 + 
                df['height_y'].str.replace('"', '').str.split(' ').str[1].astype(float) * 0.0254)

df['weight_y'] = df['weight_y'].str.split().str[0].astype(float) # Remove lbs.
df['reach_y'] = df['reach_y'].str.replace('"', '').astype(float) * 0.0254 # Remove '"' and convert to meters

 # Convert to year-month-day
df['date_of_birth'] = df['date_of_birth'].str.split().str[-1].astype(float).round().astype(pd.Int64Dtype())

In [101]:
# Fix the division column

weight_classes = [
            "Light Heavyweight", "Featherweight", "Bantamweight", "Flyweight",
            "Welterweight", "Women's Strawweight", "Women's Bantamweight", "Catchweight"
            "Middleweight", "Lightweight", "Heavyweight", "Women's Featherweight", "Women's Flyweight"]

df['division'] = df['division'].where(df['division'].isin(weight_classes), other=np.nan)

In [102]:
# Use combine_first to update null elements with values in the same location

df['height'] = (df['height'].combine_first(df['height_x'])
                            .combine_first(df['height_y'])) # Combine values of height, height_x, height_y

df['weight'] = (df['weight'].combine_first(df['weight_x'])
                            .combine_first(df['weight_y'])) # Combine values of weight, weight_x, weight_y

df['birth_date'] = df['birth_date'].combine_first(df['date_of_birth']) # Combine birth dates

df['stance_x'] = df['stance_x'].combine_first(df['stance_y']) # Combine stances
df.rename(columns={'stance_x': 'stance'}, inplace=True) # Rename the column

df['reach'] = (df['reach'].combine_first(df['reach_x'])
                          .combine_first(df['reach_y'])) # Combine reaches



# Drop the collumns which we dont need after combining them

df.drop(['height_x', 'height_y', 'weight_x', 'weight_y', 
         'date_of_birth', 'stance_y', 'reach_x', 'reach_y'], axis=1, inplace=True)

In [103]:
# Look at the documentation for explanation

df['SDBL'] = df['SDBL/A'].str.split('/').str[0].astype(float).round().astype(pd.Int64Dtype())
df['SDBA'] = df['SDBL/A'].str.split('/').str[1].astype(float).round().astype(pd.Int64Dtype())
df['SDBL/A'] = df['SDBL'] / df['SDBA']

df['SDHL'] = df['SDHL/A'].str.split('/').str[0].astype(float).round().astype(pd.Int64Dtype())
df['SDHA'] = df['SDHL/A'].str.split('/').str[1].astype(float).round().astype(pd.Int64Dtype())
df['SDHL/A'] = df['SDHL'] / df['SDHA']

df['SDLL'] = df['SDLL/A'].str.split('/').str[0].astype(float).round().astype(pd.Int64Dtype())
df['SDLA'] = df['SDLL/A'].str.split('/').str[1].astype(float).round().astype(pd.Int64Dtype())
df['SDLL/A'] = df['SDLL'] / df['SDLA'] 

In [104]:
list_floats = ['weight', '%TSL-TSA', '%BODY' , '%HEAD', '%LEG', '%TK ACC']

list_integers = ['TSL', 'TSA', 'SSL', 'SSA', 'KD', 'SCBL', 'SCBA', 'SCHL', 'SCHA', 'SCLL',
                 'SCLA', 'RV', 'SR', 'TDL', 'TDA', 'TDS', 'SGBL', 'SGBA', 'SGHL', 'SGHA',
                 'SGLL', 'SGLA', 'AD', 'ADTB', 'ADHG', 'ADTM', 'ADTS', 'SM', 'Rnd']

In [105]:
# Convert floats
df[list_floats] = df[list_floats].astype(float)

# Convert integers
df[list_integers] = df[list_integers].astype(float).round().astype(pd.Int64Dtype())

In [106]:
df.isnull().sum()

name                0
country             0
division           96
height              0
weight              0
birth_date          0
stance              0
reach               0
Date                0
Opponent            0
Event               0
Res.                0
SDBL/A             13
SDHL/A             13
SDLL/A             13
TSL                13
TSA                13
SSL                13
SSA                13
%TSL-TSA           22
KD                 13
%BODY              13
%HEAD              13
%LEG               13
SCBL               13
SCBA               13
SCHL               13
SCHA               13
SCLL               13
SCLA               13
RV                 13
SR                 13
TDL                13
TDA                13
TDS                13
%TK ACC            13
SGBL               15
SGBA               15
SGHL               15
SGHA               15
SGLL               15
SGLA               15
AD                 15
ADTB               15
ADHG               15
ADTM      

In [107]:
with open("D:/data_projects/mma/updates/clean_df.pickle", "wb") as file:
    pickle.dump(df, file)

In [3]:
with open("D:/data_projects/mma/updates/clean_df.pickle", "rb") as file:
    df = pickle.load(file)

In [4]:
# Create an unique ID
df['ID'] = df['Date'].astype(str) + '__' + df['name'] +  '__' + df['Opponent']

# We want the ID to be the first column in our dataframe
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]

df = df[cols]

# Replace nans with None
df = df.replace({pd.NaT: None})

# Drop duplicates in ID
df.drop_duplicates(subset='ID', keep="first", inplace=True)

In [5]:
df

Unnamed: 0,ID,name,country,division,height,weight,birth_date,stance,reach,Date,Opponent,Event,Res.,SDBL/A,SDHL/A,SDLL/A,TSL,TSA,SSL,SSA,%TSL-TSA,KD,%BODY,%HEAD,%LEG,SCBL,SCBA,SCHL,SCHA,SCLL,SCLA,RV,SR,TDL,TDA,TDS,%TK ACC,SGBL,SGBA,SGHL,SGHA,SGLL,SGLA,AD,ADTB,ADHG,ADTM,ADTS,SM,Decision,Rnd,Time,hometown,fighting_style,leg_reach,SDBL,SDBA,SDHL,SDHA,SDLL,SDLA
0,2023-02-04__Kyle Nelson__Dooho Choi,Kyle Nelson,Canada,Featherweight,1.8034,145.0,1991,Switch,1.8034,2023-02-04,Dooho Choi,UFC Fight Night,D,0.666667,0.210526,0.428571,39,80,20,58,48.75,0,70.0,24.0,43.0,1,1,2,3,0,0,0,0,5,10,0,50.0,0,0,0,0,0,0,0,0,0,0,0,0,Draw,3,300.0,"Huntsville, Canada",Jiu-Jitsu,1.0414,6,9,8,38,3,7
1,2022-07-23__Kyle Nelson__Jai Herbert,Kyle Nelson,Canada,Featherweight,1.8034,145.0,1991,Switch,1.8034,2022-07-23,Jai Herbert,UFC Fight Night,L,0.777778,0.26087,1.0,55,77,41,62,71.43,0,85.0,44.0,100.0,4,4,9,11,0,0,0,0,0,4,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,Decision - Unanimous,3,300.0,"Huntsville, Canada",Jiu-Jitsu,1.0414,7,9,6,23,15,15
2,2020-09-12__Kyle Nelson__Billy Quarantillo,Kyle Nelson,Canada,Featherweight,1.8034,145.0,1991,Switch,1.8034,2020-09-12,Billy Quarantillo,UFC Fight Night,L,0.588235,0.4,1.0,62,120,56,114,51.67,0,65.0,44.0,100.0,3,3,7,13,0,0,0,0,0,4,0,0.0,0,0,3,3,0,0,0,0,0,0,0,0,KO/TKO,3,7.0,"Huntsville, Canada",Jiu-Jitsu,1.0414,10,17,30,75,3,3
3,2019-09-21__Kyle Nelson__Polo Reyes,Kyle Nelson,Canada,Featherweight,1.8034,145.0,1991,Switch,1.8034,2019-09-21,Polo Reyes,UFC Fight Night,W,1.0,0.5,,12,16,12,16,75.00,0,100.0,67.0,0.0,3,3,7,10,0,0,0,0,0,2,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,KO/TKO,1,96.0,"Huntsville, Canada",Jiu-Jitsu,1.0414,1,1,1,2,0,0
4,2019-05-04__Kyle Nelson__Matt Sayles,Kyle Nelson,Canada,Featherweight,1.8034,145.0,1991,Switch,1.8034,2019-05-04,Matt Sayles,UFC Fight Night,L,0.888889,0.207792,,32,103,26,93,31.07,0,73.0,22.0,100.0,0,2,2,5,0,0,0,0,2,10,0,20.0,0,0,0,0,0,0,1,1,0,0,0,4,Submission,3,196.0,"Huntsville, Canada",Jiu-Jitsu,1.0414,8,9,16,77,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167,2020-09-19__Mayra Bueno Silva__Mara Romero Bor...,Mayra Bueno Silva,Brazil,Women's Flyweight,1.6764,136.0,1991,Orthodox,1.6891,2020-09-19,Mara Romero Borella,UFC Fight Night,W,0.0,0.2,1.0,3,8,2,7,37.50,0,0.0,20.0,100.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,Submission,1,149.0,"Uberlândia, Brazil",Striker,0.9906,0,1,1,5,1,1
1168,2020-03-14__Mayra Bueno Silva__Maryna Moroz,Mayra Bueno Silva,Brazil,Women's Flyweight,1.6764,136.0,1991,Orthodox,1.6891,2020-03-14,Maryna Moroz,UFC Fight Night,L,0.888889,0.425,0.71875,92,150,88,146,61.33,0,91.0,49.0,72.0,4,4,11,12,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,Decision - Unanimous,3,300.0,"Uberlândia, Brazil",Striker,0.9906,16,18,34,80,23,32
1169,2018-09-22__Mayra Bueno Silva__Gillian Robertson,Mayra Bueno Silva,Brazil,Women's Flyweight,1.6764,136.0,1991,Orthodox,1.6891,2018-09-22,Gillian Robertson,UFC Fight Night,W,1.0,0.15,,12,33,11,32,36.36,0,100.0,30.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0.0,0,0,6,9,0,0,0,0,0,0,0,1,Submission,1,295.0,"Uberlândia, Brazil",Striker,0.9906,2,2,3,20,0,0
1170,2023-02-04__Toshiomi Kazama__Rinya Nakamura,Toshiomi Kazama,Japan,Bantamweight,1.7018,136.0,1997,Orthodox,1.7526,2023-02-04,Rinya Nakamura,UFC Fight Night,L,,0.4,,4,10,4,10,40.00,0,0.0,40.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,KO/TKO,1,33.0,Japan,Brazilian Jiu-Jitsu,0.9779,0,0,4,10,0,0


In [6]:
# 3 months ago date

past_date = datetime.datetime.now() - datetime.timedelta(3 * 30)
past_date = past_date.strftime('%Y-%m-%d')

In [7]:
# Query data from the past 3 months

conn = pg2.connect(host="localhost", database='PostgreSQL_mma', user='postgres', password='password')


data = pd.read_sql_query(f"""
SELECT *
FROM mma
WHERE Date > '{past_date}'

""", conn)

conn.close()

data

Unnamed: 0,id,name,country,division,height,weight,birth_date,stance,reach,date,opponent,event,res,sdbl_a,sdhl_a,sdll_a,tsl,tsa,ssl,ssa,tsl_tsa_percent,kd,body_percent,head_percent,leg_percent,scbl,scba,schl,scha,scll,scla,rv,sr,tdl,tda,tds,tk_acc_percent,sgbl,sgba,sghl,sgha,sgll,sgla,ad,adtb,adhg,adtm,adts,sm,decision,rnd,time,hometown,fighting_style,leg_reach,sdbl,sdba,sdhl,sdha,sdll,sdla
0,2023-02-04__Kyle Nelson__Dooho Choi,Kyle Nelson,Canada,Featherweight,1.8034,145.0,1991,Switch,1.8034,2023-02-04,Dooho Choi,UFC Fight Night,D,0.666667,0.210526,0.428571,39,80,20,58,48.75,0,70.0,24.0,43.0,1,1,2,3,0,0,0,0,5,10,0,50.0,0,0,0,0,0,0,0,0,0,0,0,0,Draw,3,300.0,"Huntsville, Canada",Jiu-Jitsu,1.0414,6,9,8,38,3,7
1,2023-01-14__Raoni Barcelos__Umar Nurmagomedov,Raoni Barcelos,Brazil,Bantamweight,1.7018,135.0,1987,Orthodox,1.7018,2023-01-14,Umar Nurmagomedov,UFC Fight Night,L,1.0,0.214286,1.0,14,25,13,24,56.0,0,100.0,35.0,100.0,0,0,3,3,0,0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,KO/TKO,1,280.0,"State of Rio de Janeiro, Brazil",Jiu-Jitsu,0.9906,5,5,3,14,2,2
2,2023-01-14__Raquel Pennington__Ketlen Vieira,Raquel Pennington,USA,Women's Bantamweight,1.7018,135.0,1988,Orthodox,1.7145,2023-01-14,Ketlen Vieira,UFC Fight Night,W,0.916667,0.330275,0.923077,104,184,81,159,56.52,0,91.0,33.0,92.0,21,23,1,2,0,0,0,0,0,2,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,Decision - Split,3,300.0,"Colorado Springs, United States",Freestyle,0.9398,11,12,36,109,12,13
3,2023-01-21__Gabriel Bonfim__Mounir Lazzez,Gabriel Bonfim,Brazil,Welterweight,1.8542,170.0,1997,Orthodox,1.8415,2023-01-21,Mounir Lazzez,UFC 283,W,1.0,0.333333,0.5,9,22,9,22,40.91,0,100.0,33.0,50.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,Submission,1,49.0,"Brasilia, Brazil",Striker,1.0287,2,2,6,18,1,2
4,2023-02-04__JeongYeong Lee__Zha Yi,JeongYeong Lee,Korea,Featherweight,1.778,145.0,1995,Orthodox,1.8542,2023-02-04,Zha Yi,UFC Fight Night,W,1.0,0.295455,,66,105,25,57,62.86,0,100.0,36.0,0.0,2,2,5,6,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,Decision - Split,3,300.0,,,,5,5,13,44,0,0
5,2023-01-21__Jessica Andrade__Lauren Murphy,Jessica Andrade,Brazil,Women's Strawweight,1.5494,125.0,1991,Orthodox,1.5748,2023-01-21,Lauren Murphy,UFC 283,W,0.809524,0.539326,0.94,237,375,231,369,63.2,0,85.0,55.0,94.0,5,5,17,25,0,0,0,0,1,1,0,100.0,1,1,0,0,0,0,0,0,0,0,0,0,Decision - Unanimous,3,300.0,"Umuarama, Brazil",Muay Thai,0.889,17,21,144,267,47,50
6,2023-01-21__Ihor Potieria__Mauricio Rua,Ihor Potieria,Ukraine,Light Heavyweight,1.905,204.0,1996,Southpaw,1.905,2023-01-21,Mauricio Rua,UFC 283,W,0.666667,0.346154,1.0,27,47,21,41,57.45,1,75.0,46.0,100.0,1,1,4,5,0,0,0,0,0,2,0,0.0,0,0,3,4,0,0,0,0,0,0,0,0,KO/TKO,1,245.0,"Kyiv, Ukraine",MMA,1.1176,2,3,9,26,2,2
7,2023-01-14__Sean Strickland__Nassourdine Imavov,Sean Strickland,USA,,1.8542,204.0,1991,Orthodox,1.9304,2023-01-14,Nassourdine Imavov,UFC Fight Night,W,0.787234,0.377581,1.0,194,420,182,405,46.19,0,79.0,39.0,100.0,1,1,6,8,0,0,0,0,1,1,0,100.0,0,0,0,0,0,0,0,0,0,0,0,0,Decision - Unanimous,5,300.0,United States,MMA,1.0541,37,47,128,339,10,10
8,2023-01-21__Melquizael Costa__Thiago Moises,Melquizael Costa,Brazil,Lightweight,1.778,155.0,1996,Southpaw,1.8034,2023-01-21,Thiago Moises,UFC 283,L,0.533333,0.117647,0.846154,43,71,23,50,60.56,0,56.0,10.0,85.0,2,3,0,2,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,Submission,2,245.0,"Porto de Moz, Brazil",MMA,1.0287,8,15,2,17,11,13
9,2023-01-14__Nick Fiore__Mateusz Rebecki,Nick Fiore,USA,Lightweight,1.8034,155.0,1997,Orthodox,1.8288,2023-01-14,Mateusz Rebecki,UFC Fight Night,L,0.454545,0.308824,0.5,52,113,34,93,46.02,0,54.0,33.0,50.0,2,2,3,7,0,0,0,0,1,3,0,33.0,0,0,1,1,0,0,0,0,0,0,0,0,Decision - Unanimous,3,300.0,"Lowell, United States",Brazilian Jiu-Jitsu,,5,11,21,68,2,4


In [None]:
# We want the ID to not be in the database (new fight), but because we queried the past 3 months,
# we need to exclude older fights from the dataframe below
# So in the past 3 months these are the fights done that are not in the database
# Keep in mind the data we want are about 2 months old right now

df = df[(~df['ID'].isin(data['id'])) & (df['Date'] > past_date)]

### Now we want to insert our new data in the database, the code below works even for empty dataframes

In [32]:
from UFC_functions import insert_data

# Convert the DataFrame to a list of tuples
data_to_db = [tuple(x) for x in df.to_numpy()]

# %s ,%s ,%s ,%s , etc.. we use this in our query below
values = ('%s ,'*len(df.columns))[:-2] 


# Connect to the PostgreSQL database
conn = pg2.connect(host="localhost", database='PostgreSQL_mma', user='postgres', password='password')

# Create a cursor object to perform database operations
cur = conn.cursor()

# Insert data to the table
insert_data(cur=cur, data=data_to_db, values=values)

# Commit the transaction
conn.commit()

# Close the cursor and database connection
cur.close()
conn.close()

In [33]:
# Check if everything worked

conn = pg2.connect(host="localhost", database='PostgreSQL_mma', user='postgres', password='password')


df = pd.read_sql_query(f"""
SELECT DISTINCT DATE
FROM mma
WHERE Date > '{past_date}'
ORDER BY DATE DESC
LIMIT 5

""", conn)

conn.close()

df

Unnamed: 0,date
0,2023-03-25
1,2023-03-18
2,2023-03-11
3,2023-03-04
4,2023-02-25


## Next based on this notebook we should write a .py file which we run on once per week