In [1]:
import pandas as pd
from unidecode import unidecode

# reading the kaggle dataset
kaggle_data = pd.read_csv('../ufcdata/data.csv')

# reading the elevations dataset(scraped)
elevations_data = pd.read_csv('../elevations/locations_elevation_updated.csv')

In [2]:
# fixing one location entry in the kaggle dataset
kaggle_data.loc[kaggle_data['location'] == 'Singapore, Singapore'] = 'Marina Bay, Singapore'

def cleanLocationsForJoin(strs):
    words = strs.split(',')
    return (', '.join([unidecode(words[0].strip()), unidecode(words[-1].strip())]))

# preparing data for joining
kaggle_data['merge_location'] = kaggle_data['location'].apply(cleanLocationsForJoin)
elevations_data['merge_location'] = elevations_data['location'].apply(cleanLocationsForJoin)

#dropping the original location column from elevations dataset by keeping in kaggle dataset to use later
elevations_data.drop(columns=['location'], inplace=True)

# removing meter(m) abbr. from the elevation column and changing data type to float
elevations_data['location_elevation'] = elevations_data['location_elevation'].apply(lambda x: float(x.split()[0]))

In [3]:
# printing the basic statistics of both datasets
def printColumnStats(dataset):
    for column in dataset.columns:
        print('{:30s}'.format(column), dataset[column].dtype)

# shape of kaggle dataset
print('Kaggle dataset shape:', '{}\n'.format(kaggle_data.shape))
printColumnStats(kaggle_data)
print('\n\n')

# shape of elevations dataset
print('Elevations dataset shape:', '{}\n'.format(elevations_data.shape))
printColumnStats(elevations_data)

Kaggle dataset shape: (5144, 146)

R_fighter                      object
B_fighter                      object
Referee                        object
date                           object
location                       object
Winner                         object
title_bout                     object
weight_class                   object
no_of_rounds                   object
B_current_lose_streak          object
B_current_win_streak           object
B_draw                         object
B_avg_BODY_att                 object
B_avg_BODY_landed              object
B_avg_CLINCH_att               object
B_avg_CLINCH_landed            object
B_avg_DISTANCE_att             object
B_avg_DISTANCE_landed          object
B_avg_GROUND_att               object
B_avg_GROUND_landed            object
B_avg_HEAD_att                 object
B_avg_HEAD_landed              object
B_avg_KD                       object
B_avg_LEG_att                  object
B_avg_LEG_landed               object
B_avg_PASS     

In [4]:
# join the two columns to add the elevations for locations
joined_data = kaggle_data.set_index('merge_location').join(elevations_data.set_index('merge_location')).reset_index()

#dropping the merge location column from joined dataset, the location is under location column
joined_data.drop(columns=['merge_location'], inplace=True)

# printing the stats of the newly joined dataset
print('Joined dataset shape:', '{}\n'.format(joined_data.shape))
printColumnStats(joined_data)

Joined dataset shape: (5144, 146)

R_fighter                      object
B_fighter                      object
Referee                        object
date                           object
location                       object
Winner                         object
title_bout                     object
weight_class                   object
no_of_rounds                   object
B_current_lose_streak          object
B_current_win_streak           object
B_draw                         object
B_avg_BODY_att                 object
B_avg_BODY_landed              object
B_avg_CLINCH_att               object
B_avg_CLINCH_landed            object
B_avg_DISTANCE_att             object
B_avg_DISTANCE_landed          object
B_avg_GROUND_att               object
B_avg_GROUND_landed            object
B_avg_HEAD_att                 object
B_avg_HEAD_landed              object
B_avg_KD                       object
B_avg_LEG_att                  object
B_avg_LEG_landed               object
B_avg_PASS     

In [5]:
# checking the integrity of the joined location elevations data
print('Empty Location Values:', joined_data['location_elevation'].isnull().any())

Empty Location Values: False


In [6]:
# creating a new csv
joined_data.to_csv('../ufcdata/data_with_location_elevation.csv')