In [1]:
import numpy as np
import pandas as pd

import requests

from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from tqdm import tqdm
from time import sleep

import random
import pickle
import datetime

import psycopg2 as pg2

from UFC_functions import get_fighters_df
from UFC_functions import fixing_collumns
from UFC_functions import get_stats_ufc
from UFC_functions import get_stats_ufcstats
from UFC_functions import insert_data

#############################################################################################################################

# Open the UFC events page, reject the cookie banner, switch to the past
# events tab and collect the "Recap" links rendered on the page.
# The "load more" button is not needed for this.

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # Open chrome

try:
    driver.get("https://www.ufc.com/events") # Go to the webpage

    driver.find_element(By.XPATH, '//*[@id="onetrust-reject-all-handler"]').click() # Deny cookies

    sleep(1 + random.random())

    driver.find_element(By.XPATH, '//*[@id="block-mainpagecontent"]/div/div/div[5]/div/ul/li[2]/a/strong').click() # Past events

    # Keep only the rendered HTML; the browser is not needed afterwards
    page_source = driver.page_source
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally clause makes sure this also happens when a step above fails
    driver.quit()

soup = BeautifulSoup(page_source, "html.parser")

# Find all anchor tags with a "Recap" button
recap_buttons = soup.find_all("a", class_="e-button--white")

# Anchors without an href are skipped, they cannot be turned into a link
recap_links = [
    'https://www.ufc.com' + button.get("href")
    for button in recap_buttons
    if "Recap" in button.get_text() and button.get("href")
]

#############################################################################################################################


# Open the first recap link and read the names of the athletes listed on it.

if not recap_links:
    # Fail with a readable message instead of an IndexError on recap_links[0]
    raise RuntimeError("No recap links were found on the UFC events page")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # Open chrome

try:
    driver.get(recap_links[0]) # Go to the webpage
    driver.find_element(By.XPATH, '//*[@id="onetrust-reject-all-handler"]').click() # Deny cookies
    sleep(1 + random.random())

    # Keep only the rendered HTML; the browser is not needed afterwards
    page_source = driver.page_source
finally:
    # End the WebDriver session even when one of the steps above raised
    driver.quit()

soup = BeautifulSoup(page_source, "html.parser")

# Given name and family name spans that sit inside an anchor whose href contains '/athlete/'
a_elements = soup.select('a[href*="/athlete/"] span.c-listing-fight__corner-given-name, a[href*="/athlete/"] span.c-listing-fight__corner-family-name')

# Text of the given and family name span elements, in document order
temp_list = [elem.text for elem in a_elements]

# Athletes with more than one given or family name have them in the same span,
# so consecutive elements are treated as (given name, family name) pairs.
# zip() stops at the last complete pair, so an odd number of spans no longer
# raises an IndexError.
fought_recently = [
    given_name + ' ' + family_name
    for given_name, family_name in zip(temp_list[::2], temp_list[1::2])
]

#############################################################################################################################

# Load the stored athlete names, find the athletes of the recent card that are
# not stored yet and look up their ESPN profile urls.
# NOTE: the pickle files are local project files; pickle must not be used on
# files from an untrusted source.

with open("D:/data_projects/mma/updates/athletes_names_list.pickle", "rb") as file:
    names = pickle.load(file)

# Names that are in the fought recently list but not in the stored list
new_names = list(set(fought_recently).difference(names))

if new_names: # If the list is not empty
    names += new_names # Append the new names

    # Espn
    # Name written the way it appears in an ESPN fighter url (lower case, dash separated).
    # Empty slugs are dropped because '' is a substring of every link.
    temp_names = ['-'.join(name.lower().split()) for name in new_names]
    temp_names = [slug for slug in temp_names if slug]

    headers = {'User-agent': ''}
    new_fighters_espn_urls = []
    searched_letters = set()

    try:
        for name in tqdm(new_names):

            name_parts = name.split()
            if not name_parts:
                continue

            # The listing is searched by the first letter of the last name.
            # Every page is matched against all new names, so one request per letter is enough.
            letter = name_parts[-1].lower()[0]
            if letter in searched_letters:
                continue
            searched_letters.add(letter)

            main_link = 'http://www.espn.com/mma/fighters?search=' + letter
            response = requests.get(main_link, headers=headers, timeout=30)
            soup = BeautifulSoup(response.content, 'html.parser')

            # href=True skips anchors without an href; they used to raise a TypeError
            # that the old bare except silently swallowed, losing the whole update
            temp_list = ['https://www.espn.com' + anchor['href']
                         for anchor in soup.find_all('a', href=True)
                         if '/mma/fighter/_/id/' in anchor['href']]

            temp_links = [link for link in temp_list if any(slug in link for slug in temp_names)]

            new_fighters_espn_urls += temp_links

        with open("D:/data_projects/mma/updates/athletes_espn_url.pickle", "rb") as file:
            temp_espn_url = pickle.load(file)

        # Append every new url once, skipping the ones that are already stored
        known_urls = set(temp_espn_url)
        temp_espn_url += [url for url in dict.fromkeys(new_fighters_espn_urls) if url not in known_urls]

        with open("D:/data_projects/mma/updates/athletes_espn_url.pickle", "wb") as file:
            pickle.dump(temp_espn_url, file)

        # The names are stored only after the ESPN lookup went through. When the lookup
        # fails the athletes stay "new" and the lookup is retried on the next run.
        with open("D:/data_projects/mma/updates/athletes_names_list.pickle", "wb") as file:
            pickle.dump(names, file)

    except (requests.RequestException, OSError, pickle.PickleError, EOFError) as exc:
        # Still best effort, but the failure is reported instead of being hidden
        print(f"Could not update the ESPN urls of the new athletes: {exc!r}")
    
#############################################################################################################################

# Create links

# Espn has the non static data which are the ones we will scrape frequently

with open("D:/data_projects/mma/updates/athletes_espn_url.pickle", "rb") as file:
    espn_url = pickle.load(file)

# Names written the way they appear in an ESPN url (lower case, dash separated).
# Empty slugs are dropped because '' is a substring of every url.
temp_list = ['-'.join(name.lower().split()) for name in fought_recently]
temp_list = [slug for slug in temp_list if slug]

# Stored urls of the athletes that fought recently; dict.fromkeys drops repeated
# urls (keeping their order) so that no athlete is scraped twice
recent_espn_url = list(dict.fromkeys(
    url for url in espn_url if any(name in url for name in temp_list)
))

espn_df = pd.concat(list(map(get_fighters_df, recent_espn_url)))

# Keep the UFC events only. na=False treats a missing event as "not UFC";
# without it a NaN in the mask makes the boolean indexing fail.
espn_df = espn_df[espn_df['Event'].str.contains('UFC', na=False)]

#############################################################################################################################

# Fix excessive columns


def _is_excessive(column_name):
    """Tell whether the piece after the first underscore is exactly one character long."""
    pieces = column_name.split('_')
    return len(pieces) > 1 and len(pieces[1]) == 1


excessive_columns = [col for col in espn_df.columns if _is_excessive(col)]

# Only needed when more columns got generated than we wanted
if excessive_columns:

    for col_to_fix in excessive_columns:
        # The part in front of the underscore is the column that receives the fixed values
        base_column = col_to_fix.split('_')[0]
        espn_df[base_column] = fixing_collumns(base_column, col_to_fix, espn_df)

    # The excessive columns are not needed anymore
    espn_df.drop(columns=excessive_columns, inplace=True)
    
#############################################################################################################################

# UFC site

# The athlete url ends with the lower case name, its words joined by dashes
ufc_url_list = [f"https://www.ufc.com/athlete/{'-'.join(name.lower().split())}" for name in fought_recently]

# One dataframe per athlete url, stacked under a fresh index
ufc_df = pd.concat([get_stats_ufc(url) for url in tqdm(ufc_url_list)], ignore_index=True)

#############################################################################################################################

# UFCstats site

# Collect the UFCstats profile link of every athlete that fought recently.

ufcstats_list = []

# Link rows of the listing pages that were already downloaded, keyed by letter,
# so that every page is requested only once
rows_by_letter = {}

for name in fought_recently:

    name_parts = name.split()
    if not name_parts:
        continue

    # The listing is organised by the first letter of the last name
    letter = name_parts[-1].lower()[0]

    if letter not in rows_by_letter:
        main_link = 'http://ufcstats.com/statistics/fighters?char=' + letter + '&page=all'
        headers = {'User-agent': ''}

        try:
            response = requests.get(main_link, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # Skip this athlete; the page is not cached, so the next athlete
            # with the same letter tries again
            print(f"Could not load {main_link}: {exc!r}")
            continue
        finally:
            # Pause after every request, successful or not
            sleep(3 + random.random())

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the rows which have names and links
        link_rows = [row.find_all('a', {'class': 'b-link b-link_style_black'})
                     for row in soup.find_all('tr', {'class': 'b-statistics__table-row'})]

        # Keep the rows with at least two links, both are needed to build the name
        rows_by_letter[letter] = [links for links in link_rows if len(links) > 1]

    # Match the name we are looking for with its link (first match wins)
    matching_anchor = next(
        (links[0] for links in rows_by_letter[letter]
         if links[0].text + ' ' + links[1].text == name),
        None,
    )

    if matching_anchor is None:
        continue # The athlete is not listed under this name

    link = matching_anchor.get('href')
    if link:
        ufcstats_list.append(link)


ufcstats_df = pd.concat(list(map(get_stats_ufcstats, ufcstats_list)), ignore_index = True)

#############################################################################################################################

# Combine dataframes

# Both merges are outer merges on 'name', so no row of any source gets lost.
# First the two statistics sites are joined, then the ESPN data is joined with the result.
site_stats_df = ufc_df.merge(ufcstats_df, how='outer', on='name')
df = espn_df.merge(site_stats_df, how='outer', on='name')

#############################################################################################################################

# Cleaning


def _feet_inches_to_meters(col):
    """Convert a string column holding feet and inches to meters.

    The value is split on a space. The apostrophe is removed from the first
    piece (feet) and the double quote from the second piece (inches) before
    both are cast to float. Missing values stay missing.
    """
    pieces = col.str.split(' ')
    feet = pieces.str[0].str.replace('\'', '').astype(float)
    inches = pieces.str[1].str.replace('"', '').astype(float)
    return feet * 0.3048 + inches * 0.0254


# There are some '-', '--' values in our dataframe as there were on the sites which where scraped, we'll replace them with nan

df.replace('-', np.nan, inplace=True)
df.replace('--', np.nan, inplace=True)

# Remove the rows without a division.
# copy() gives the columns assigned below an independent frame instead of a
# slice of the merged one (avoids chained assignment / SettingWithCopyWarning).
df = df[~df['division'].isnull()].copy()

# Change height from feet and inches to meters
df['height'] = _feet_inches_to_meters(df['height'])

df['weight'] = df['weight'].str.split(' ').str[1] # Keep the second space separated piece of weight

# Keep only the birth year
df['birth_date'] = df['birth_date'].str.split('/').str[-1].str.split(' ').str[0].astype(float).round().astype(pd.Int64Dtype())

df['reach'] = df['reach'].str[:-1].astype(float) * 0.0254 # Convert reach from inches to meters

# NOTE(review): format='%m %d, %Y' expects a numeric month while infer_datetime_format asks
# pandas to guess the format - confirm against the scraped values which of the two is intended
df['Date'] = pd.to_datetime(df['Date'], format='%m %d, %Y', infer_datetime_format=True) # Convert to year-month-day


# Remove the trailing % from the percentage columns
for percent_column in ['TSL-TSA', 'TK ACC', '%BODY', '%HEAD', '%LEG']:
    df[percent_column] = df[percent_column].str[:-1]

# Mark the two columns that had no % in their name as percentages
df.rename(columns={"TSL-TSA": "%TSL-TSA", "TK ACC": "%TK ACC"}, inplace=True)

# Time (minutes:seconds) to seconds
time_pieces = df['Time'].str.split(':')
df['Time'] = time_pieces.str[0].astype(float) * 60 + time_pieces.str[1].astype(float)

df['height_x'] = df['height_x'].astype(float) * 0.0254 # Inches to meters
df['reach_x'] = df['reach_x'].astype(float) * 0.0254 # Inches to meters
df['leg_reach'] = df['leg_reach'].astype(float) * 0.0254 # Inches to meters

# Change height to meters
df['height_y'] = _feet_inches_to_meters(df['height_y'])

df['weight_y'] = df['weight_y'].str.split().str[0].astype(float) # Remove lbs.
df['reach_y'] = df['reach_y'].str.replace('"', '').astype(float) * 0.0254 # Remove '"' and convert to meters

# Keep only the birth year (the last whitespace separated piece)
df['date_of_birth'] = df['date_of_birth'].str.split().str[-1].astype(float).round().astype(pd.Int64Dtype())

# Fix the division column

# Every value that is not one of these weight classes is replaced with nan.
# A comma was missing after "Catchweight": Python then glued it to the next
# string ("CatchweightMiddleweight"), so neither class was recognised.
weight_classes = [
            "Light Heavyweight", "Featherweight", "Bantamweight", "Flyweight",
            "Welterweight", "Women's Strawweight", "Women's Bantamweight", "Catchweight",
            "Middleweight", "Lightweight", "Heavyweight", "Women's Featherweight", "Women's Flyweight",
]

df['division'] = df['division'].where(df['division'].isin(weight_classes), other=np.nan)


# Fill the null elements of a column with the values found at the same
# location in its fallback columns (combine_first), one fallback after the other.

fallback_columns = {
    'height': ['height_x', 'height_y'],
    'weight': ['weight_x', 'weight_y'],
    'birth_date': ['date_of_birth'],
    'stance_x': ['stance_y'],
    'reach': ['reach_x', 'reach_y'],
}

for target_column, source_columns in fallback_columns.items():
    filled = df[target_column]
    for source_column in source_columns:
        filled = filled.combine_first(df[source_column])
    df[target_column] = filled

# The combined stances live in stance_x, give the column its final name
df.rename(columns={'stance_x': 'stance'}, inplace=True)


# The fallback columns are not needed after combining them

df.drop(columns=['height_x', 'height_y', 'weight_x', 'weight_y',
                 'date_of_birth', 'stance_y', 'reach_x', 'reach_y'], inplace=True)


# Look at the documentation for explanation of the column names

# Each 'SD?L/A' column holds two numbers separated by '/'. They are stored as
# nullable integers in 'SD?L' and 'SD?A', and the original column is replaced
# with their quotient.
for zone in ('B', 'H', 'L'):
    ratio_column = f'SD{zone}L/A'
    numerator_column = f'SD{zone}L'
    denominator_column = f'SD{zone}A'

    slash_pieces = df[ratio_column].str.split('/')

    df[numerator_column] = slash_pieces.str[0].astype(float).round().astype(pd.Int64Dtype())
    df[denominator_column] = slash_pieces.str[1].astype(float).round().astype(pd.Int64Dtype())
    df[ratio_column] = df[numerator_column] / df[denominator_column]


# Columns that are cast to float
list_floats = ['weight', '%TSL-TSA', '%BODY' , '%HEAD', '%LEG', '%TK ACC']

# Columns that are cast to nullable integers
list_integers = ['TSL', 'TSA', 'SSL', 'SSA', 'KD', 'SCBL', 'SCBA', 'SCHL', 'SCHA', 'SCLL',
                 'SCLA', 'RV', 'SR', 'TDL', 'TDA', 'TDS', 'SGBL', 'SGBA', 'SGHL', 'SGHA',
                 'SGLL', 'SGLA', 'AD', 'ADTB', 'ADHG', 'ADTM', 'ADTS', 'SM', 'Rnd']

# Convert floats
df = df.astype({column: float for column in list_floats})

# Convert integers: go through float and round first, then switch to the nullable integer type
rounded_values = df[list_integers].astype(float).round()
df[list_integers] = rounded_values.astype(pd.Int64Dtype())

#############################################################################################################################

# Build the ID of a row from its date, the athlete name and the opponent,
# separated by double underscores
date_text = df['Date'].astype(str)
df['ID'] = date_text + '__' + df['name'] +  '__' + df['Opponent']

# Move the last column (the ID that was just added) to the front
cols = df.columns.tolist()
cols.insert(0, cols.pop())

# Reorder the columns, replace NaT with None and keep the first row of every ID
df = (
    df[cols]
    .replace({pd.NaT: None})
    .drop_duplicates(subset='ID', keep="first")
)

#############################################################################################################################

# 3 months ago date (3 * 30 days), formatted as year-month-day

past_date = datetime.datetime.now() - datetime.timedelta(days=3 * 30)
past_date = past_date.strftime('%Y-%m-%d')


# Query data from the past 3 months

# NOTE(review): the credentials are hard coded here; consider reading them from the environment.
conn = pg2.connect(host="localhost", database='PostgreSQL_mma', user='postgres', password='password')

try:
    # The date is handed to the driver as a query parameter instead of being
    # pasted into the SQL text
    data = pd.read_sql_query(
        """
        SELECT *
        FROM mma
        WHERE Date > %s
        """,
        conn,
        params=(past_date,),
    )
finally:
    # Close the connection even when the query fails
    conn.close()


# We want the ID to not be in the database (new fight), but because we queried the past 3 months,
# we need to exclude older fights from the dataframe below
# So in the past 3 months these are the fights done that are not in the database
# Keep in mind the data we want are about 2 months old right now

# Compare real timestamps: to_datetime turns None / NaT into NaT (which never passes
# the filter), so the comparison also works when 'Date' is stored as an object column
is_new_fight = ~df['ID'].isin(data['id'])
is_recent_fight = pd.to_datetime(df['Date']) > pd.Timestamp(past_date)

df = df[is_new_fight & is_recent_fight]

#############################################################################################################################

# Insert data in the database

# NOTE(review): the recorded run of this cell failed with
# 'invalid input syntax for type integer: "Decision - Unanimous"'. That suggests the
# column order of df does not line up with the columns of the table - confirm the
# order insert_data expects before relying on this step.

# Convert the DataFrame to a list of tuples, one tuple per row
data_to_db = [tuple(row) for row in df.to_numpy()]

# %s ,%s ,%s ,%s , etc.. one placeholder per column, we use this in our query below
values = ' ,'.join(['%s'] * len(df.columns))


if data_to_db: # Nothing to insert when there are no new fights

    # Connect to the PostgreSQL database
    # NOTE(review): the credentials are hard coded here; consider reading them from the environment.
    conn = pg2.connect(host="localhost", database='PostgreSQL_mma', user='postgres', password='password')

    try:
        # Create a cursor object to perform database operations
        cur = conn.cursor()

        try:
            # Insert data to the table
            insert_data(cur=cur, data=data_to_db, values=values)

            # Commit the transaction
            conn.commit()
        except Exception:
            # Undo the partial insert, then let the error surface
            conn.rollback()
            raise
        finally:
            cur.close()
    finally:
        # The connection is closed on success and on failure
        conn.close()

#############################################################################################################################

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.80M/6.80M [00:04<00:00, 1.60MB/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.27s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [01:51<00:00,  4.27s/it]


InvalidTextRepresentation: invalid input syntax for type integer: "Decision - Unanimous"
LINE 64: ...ight' ,'L' ,0 ,0 ,1 ,1 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0.0 ,'Decision ...
                                                              ^
