# Data Wrangling

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw UFC CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook only runs
# where this exact file exists.
ufc_csv_path = r"C:\Everything related to Python\01 Python Project Datasets\Ultimate UFC Dataset (2010 - 2024) (TRIMMED) v2.csv"
ufc_dataset = pd.read_csv(ufc_csv_path)

## Data Transformation and Cleaning

Transform the dataset to show every row as a unique fighter.
Separate the dataset into two tables (one per corner), concatenate them, then rename the columns to clearer, more accessible names.

In [3]:
# Build one table per corner. Both tables must have the same 27 columns in the
# same order, because the following cell gives them identical headers and
# stacks them on top of each other.

# Columns that describe the bout itself and are shared by both corners.
shared_columns = ['Date', 'WeightClass', 'Gender']

# Per-fighter statistics; the source prefixes each with 'Red' or 'Blue'.
corner_stat_suffixes = [
    'CurrentLoseStreak', 'CurrentWinStreak', 'Draws',
    'AvgSigStrLanded', 'AvgSigStrPct', 'AvgSubAtt',
    'AvgTDLanded', 'AvgTDPct',
    'LongestWinStreak', 'Losses', 'TotalRoundsFought', 'TotalTitleBouts',
    'WinsByDecisionMajority', 'WinsByDecisionSplit', 'WinsByDecisionUnanimous',
    'WinsByKO', 'WinsBySubmission', 'WinsByTKODoctorStoppage', 'Wins',
    'Stance', 'HeightCms', 'ReachCms', 'Age',
]


def _corner_columns(corner):
    """Return the ordered source column names for one corner ('Red' or 'Blue')."""
    return (
        [f'{corner}Fighter']
        + shared_columns
        + [f'{corner}{suffix}' for suffix in corner_stat_suffixes]
    )


# Each variable holds the corner its name says: red_fighters <- Red* columns,
# blue_fighters <- Blue* columns. Generating both lists from one suffix list
# guarantees the two tables stay column-for-column parallel.
# .copy() makes each table independent of ufc_dataset before its headers are
# overwritten.
red_fighters = ufc_dataset[_corner_columns('Red')].copy()
blue_fighters = ufc_dataset[_corner_columns('Blue')].copy()

In [4]:
# Give both corner tables the same placeholder headers (1..27). With identical
# labels, concat aligns the two tables column-for-column.
common_columns = [position for position in range(1, 28)]

for corner_table in (red_fighters, blue_fighters):
    corner_table.columns = common_columns

# Stack the rows of the first table above the rows of the second.
# Each table keeps its own row labels, so labels repeat in the result.
fighters_raw_data = pd.concat([red_fighters, blue_fighters], axis='index')

In [5]:
# Descriptive snake_case names, listed in the same order as the numeric
# placeholder headers 1..27 they replace.
fighter_column_names = [
    'name',
    'date',
    'weight_class',
    'gender',
    'current_lose_streak',
    'current_win_streak',
    'draws',
    'avg_sig_str_landed',
    'avg_sig_str_pct',
    'avg_sub_att',
    'avg_td_landed',
    'avg_td_pct',
    'longest_win_streak',
    'losses',
    'total_rounds_fought',
    'total_title_bouts',
    'wins_by_decision_majority',
    'wins_by_decision_split',
    'wins_by_decision_unanimous',
    'wins_by_ko',
    'wins_by_submission',
    'wins_by_tko_doctor_stoppage',
    'wins',
    'stance',
    'height_cms',
    'reach_cms',
    'age',
]

# Map placeholder number -> name (1 -> 'name', 2 -> 'date', ..., 27 -> 'age').
placeholder_to_name = dict(enumerate(fighter_column_names, start=1))

fighters_raw_data = fighters_raw_data.rename(columns=placeholder_to_name)

Rearrange column order for ease of use

In [6]:
# Columns to pull forward so they sit directly after the first column.
moving_columns = ['age', 'stance', 'height_cms', 'reach_cms']

# Everything else keeps its current relative order.
non_moving_columns = list(
    filter(lambda col: col not in moving_columns, fighters_raw_data.columns)
)

# First remaining column, then the moved columns, then the rest.
new_order = [non_moving_columns[0], *moving_columns, *non_moving_columns[1:]]
fighters_raw_data = fighters_raw_data.loc[:, new_order]

Convert "date" column to conventional format and ensure data-type is datetime64

In [7]:
# Parse the 'date' column into pandas datetimes and put it back under the same
# name, so later cells can compare dates chronologically.
parsed_dates = pd.to_datetime(fighters_raw_data['date'])
fighters_raw_data = fighters_raw_data.assign(date=parsed_dates)

Reset the index, keep only one row per unique fighter, and make sure each fighter's values are up to date (their values before their most recent fight, as of 12/6/24).
The main dataset is now being renamed to "roster"

In [8]:
# Renumber the rows 0..n-1 so every row has a unique label; the label-based
# lookup below needs that to pick out exactly one row per fighter.
fighters_raw_data = fighters_raw_data.reset_index(drop=True)

# For each fighter name, the row label of the entry with the latest date.
dates_by_fighter = fighters_raw_data.groupby('name')['date']
newest_fighter_info = dates_by_fighter.idxmax()

# Keep only those rows: one row per fighter name.
roster = fighters_raw_data.loc[newest_fighter_info]

Create new columns 'wins_by_decision', 'total_bouts', and 'weight_class_lbs'

In [9]:
# Numeric value assigned to each division name found in 'weight_class'
# (stored in the new 'weight_class_lbs' column).
reassigned_weights = {
    'Flyweight': 125,
    'Bantamweight': 135,
    'Featherweight': 145,
    'Lightweight': 155,
    'Welterweight': 170,
    'Middleweight': 185,
    'Light Heavyweight': 205,
    'Heavyweight': 265,
    "Women's Strawweight": 115,
    "Women's Flyweight": 125,
    "Women's Bantamweight": 135,
    "Women's Featherweight": 145
}

# Add the three derived columns in one step:
#   wins_by_decision - majority + split + unanimous decision wins
#   total_bouts      - draws + losses + wins
#   weight_class_lbs - division name looked up in reassigned_weights; names
#                      missing from the dict come out as NaN
roster = roster.assign(
    wins_by_decision=(
        roster['wins_by_decision_majority']
        + roster['wins_by_decision_split']
        + roster['wins_by_decision_unanimous']
    ),
    total_bouts=roster['draws'] + roster['losses'] + roster['wins'],
    weight_class_lbs=roster['weight_class'].map(reassigned_weights),
)

Save the current dataset as a CSV to preserve it in case it is needed, then reload the newly created CSV into the notebook.

In [10]:
# Write the roster to disk without the row index.
# NOTE(review): this is an absolute path on one machine.
roster_csv_path = r"C:\Everything related to Python\01 Python Project Datasets\roster.csv"
roster.to_csv(roster_csv_path, index=False)

In [11]:
# Reload the saved roster from disk.
# CSV stores dates as plain text, so without parse_dates the 'date' column
# would come back as strings and lose the datetime64 dtype set earlier in the
# notebook.
roster = pd.read_csv(
    r"C:\Everything related to Python\01 Python Project Datasets\roster.csv",
    parse_dates=['date'],
)

Create a subset of roster called "superstars", which collects all recognized superstars into their own dataset for further analysis.

In [12]:
# Hand-picked fighter names to pull out of the roster. Matching is by exact
# string equality against roster['name'], so any name spelled differently in
# the data is left out without an error.
superstar_fighters = [
    'Conor McGregor',
    'Khabib Nurmagomedov',
    'Jon Jones',
    'Alex Pereira',
    'Israel Adesanya',
    "Sean O'Malley",
    'Max Holloway',
    'Ronda Rousey',
    'Islam Makhachev',
    'Charles Oliveira',
    'Jorge Masvidal',
    'Stipe Miocic',
    'Francis Ngannou',
    'Dustin Poirier',
    'Amanda Nunes',
    'Valentina Shevchenko',
    'Tony Ferguson',
    'Nate Diaz',
    'Ilia Topuria',
    'Khamzat Chimaev',
    'Alexander Volkanovski',
    'Paddy Pimblett',
    'Daniel Cormier',
]

# Keep only roster rows whose name is in the list, renumbered from 0.
is_superstar = roster['name'].isin(superstar_fighters)
superstars = roster[is_superstar].reset_index(drop=True)