In [None]:
### create school name matching dictionary

import pandas as pd
import numpy as np
import fuzzywuzzy
from fuzzywuzzy import process
import re # RegEx for string matching


In [None]:
# Read in the three raw tables used for school-name matching:
#   wiki_df - team table; its 'Team' column is cleaned and matched against below
#   game_df - game results; the 'Winner' / 'Loser' columns hold team names
#   conf_df - conference membership; the 'Institution' column holds school names
# Paths are relative, so they resolve against the notebook's working directory.

wiki_df = pd.read_csv('../data/cfb_d1_teams_with_coordinates.csv')
game_df = pd.read_csv('../data/cfb_scores_all_years.csv')
conf_df = pd.read_csv('../data/cfb_conference_members.csv')



In [None]:
# Quick look at the first rows of each table. A notebook cell only renders its
# last expression, so just game_df.head() is shown when this cell runs.
wiki_df.head()
conf_df.head()
game_df.head()
# len(wiki_names)
# len(game_names)

In [None]:
## Cleaning Game Result Data

## Extract the ranking from the team name and store as separate column
def extract_ranking(team_name):
    """Return the ranking embedded at the start of a team name.

    A ranked name starts with a one- or two-digit number in parentheses,
    e.g. ``'(12) Alabama'``.

    Parameters
    ----------
    team_name : str
        Raw team name. Non-string values (NaN / None from missing cells) are
        accepted and treated as unranked.

    Returns
    -------
    int or float
        The ranking as an ``int``, or ``np.nan`` when the name has no leading
        ``(N)`` prefix or is not a string.
    """
    # re.match raises TypeError on non-strings, so filter missing values out first.
    if not isinstance(team_name, str):
        return np.nan
    match = re.match(r'^\((\d{1,2})\)', team_name)
    return int(match.group(1)) if match else np.nan

# Leading "(N) " ranking prefix that is stripped from the team names.
rank_prefix = r'^\(\d{1,2}\)\s'

for side in ('Winner', 'Loser'):
    rank_col = side + '_Ranking'
    parsed = game_df[side].apply(extract_ranking)
    # This cell overwrites the CSV it was loaded from. On a re-run the names are
    # already clean, so `parsed` is all-NaN; fall back to the rankings stored by
    # the earlier run instead of wiping them.
    if rank_col in game_df.columns:
        parsed = parsed.combine_first(game_df[rank_col])
    game_df[rank_col] = parsed

    # Clean the names. regex=True must be explicit: from pandas 2.0 on,
    # str.replace treats the pattern as a literal string by default.
    game_df[side] = game_df[side].str.replace(rank_prefix, '', regex=True).str.strip()

# Save the updated game results to a new csv - overwriting the old one
game_df.to_csv('../data/cfb_scores_all_years.csv', index=False)

In [None]:
## Manual adjustments to the conf_df dataframe
# Change conf_name to match wiki_name - conf names are overly complicated and the
# game data is already matched to the wiki names.
#
# Each key appears exactly once: a dict literal silently keeps only the last value
# for a repeated key, which hid a conflicting UAB entry.
adjustments = {
    'University of Louisiana at Monroe': 'Louisiana-Monroe',
    'Middle Tennessee State University': 'Middle Tennessee',
    'North Carolina State University': 'NC State',
    'University of North Carolina at Charlotte': 'Charlotte (NC)',
    'University of Illinois Urbana–Champaign': 'Illinois',
    'University of Maryland, College Park': 'Maryland',
    'University of Wisconsin–Madison': 'Wisconsin',
    'Connecticut': 'UConn',
    # NOTE(review): this key was listed twice ('UAB' and 'Alabama-Birmingham (UAB)').
    # The later value is the one that took effect, so it is kept - confirm which
    # spelling the wiki table actually uses.
    'University of Alabama at Birmingham': 'Alabama-Birmingham (UAB)',
    'Arizona State Sun': 'Arizona State',
    'Brigham Young University': 'BYU',
    'Florida Atlantic University': 'FAU',
    'Florida International University': 'FIU',
    'Georgia Institute of Technology': 'Georgia Tech',
    'Indiana University Bloomington': 'Indiana',
    'University of California, Los Angeles': 'UCLA',
    'University of California, Berkeley': 'California',
    'University of Colorado, Boulder': 'Colorado',
    'University of Minnesota, Twin Cities': 'Minnesota',
    # The first Nebraska key looks like a mis-decoded en dash and is kept only in
    # case the CSV contains the same garbling; the next two are the clean spellings.
    'University of Nebraska-“Lincoln': 'Nebraska',
    'University of Nebraska–Lincoln': 'Nebraska',
    'University of Nebraska-Lincoln': 'Nebraska',
    'University of North Carolina at Chapel Hill': 'North Carolina',
    'Virginia Polytechnic Institute and State University': 'Virginia Tech',
    'Virginia Tech *': 'Virginia Tech',
    'University of Miami': 'Miami (FL)',
}

# apply to the Institution column of the conf_df dataframe
# (Series.replace with a dict swaps whole cell values that equal a key)
conf_df['Institution'] = conf_df['Institution'].replace(adjustments)
# overwrites the conf_names list with the adjusted names
conf_names = conf_df['Institution'].tolist()
               


In [None]:
####################################################
### MOVED TO CONF SCRAPE NOTEBOOK
# ## Clean school names from conference table to match wiki names

# # remove 'University of' and "University" from Institution column
# conf_df['Institution'] = conf_df['Institution'].str.replace('University of ', '')
# conf_df['Institution'] = conf_df['Institution'].str.replace('University', '')
# conf_df['Institution'] = conf_df['Institution'].str.strip()
# # remove any citations from Institution column
# conf_df['Institution'] = conf_df['Institution'].str.replace('\[.*\]', '')

# # Remove unnecessary city names from Institution column
# # drop dash and word after '- '
# conf_df['Institution'] = conf_df['Institution'].str.replace('- .*', '')
###############################################

# Peek at the conference table. Not rendered: only a cell's last expression is
# displayed and this cell ends with an assignment.
conf_df.head()

# value counts of school names in conference table (also not rendered mid-cell)
conf_df['Institution'].value_counts()

# # sort alphabetically
# conf_df = conf_df.sort_values(by=['Institution'])

# # get a list of the cleaned names
# conf_names = conf_df['Institution'].tolist()
# # drop duplicates
# conf_names = list(dict.fromkeys(conf_names))

# # len(conf_names)

# clean wiki_df 'Team' column of citations such as "[1]"
# Patterns are raw strings (no invalid-escape warnings) and regex=True is explicit:
# from pandas 2.0 on, str.replace treats the pattern as a literal string by default,
# which would leave every name untouched.
wiki_df['Team'] = wiki_df['Team'].str.replace(r'\[.*\]', '', regex=True)
# drop everything from the first comma or hyphen onward
# NOTE(review): the hyphen rule also truncates hyphenated team names - confirm intended.
wiki_df['Team'] = wiki_df['Team'].str.replace(r',.*', '', regex=True)
wiki_df['Team'] = wiki_df['Team'].str.replace(r'-.*', '', regex=True)

# overwrite the wiki_names list with the cleaned names
wiki_names = wiki_df['Team'].tolist()



        

In [None]:
# Number of entries in conf_names. The list comes straight from
# conf_df['Institution'].tolist() and is not de-duplicated, so an institution
# that appears on several rows is counted each time.
len(conf_names)

# sort alphabetically
# conf_names.sort()
# Temp csv




In [None]:
# Fuzzy-match every wiki team name against the conference institution names.
# For each query, process.extractOne yields the best (candidate, score) pair,
# stored here keyed by the wiki name.
wiki_conf_dict = {
    team: process.extractOne(team, conf_names)
    for team in wiki_names
}

# One row per wiki name (the index) holding the matched conference name and its score
wiki_conf_df = (
    pd.DataFrame.from_dict(wiki_conf_dict, orient='index', columns=['conf_name', 'score'])
    .rename_axis('wiki_name')
)
wiki_conf_df.head()

# # histogram of scores
# # wiki_conf_df['score'].hist()

# # get all matches with a score of 95 or less 
# # wiki_conf_df = wiki_conf_df[wiki_conf_df['score'] <= 95]
# # save temp csv file
# # wiki_conf_df.to_csv('../TEMP/wiki_conf_match_3.csv')

# # # create a dictionary of all matches with a score of 95 or better
# wiki_conf_dict = wiki_conf_df.to_dict('index')
# wiki_conf_dict

# # # use the dictionary to create a new column in the conf_df dataframe and apply the name from the dictionary to the wiki_name in the conf_df dataframe where there is a match 90 or above
# conf_df['conf_name'] = conf_df['Institution'].map(lambda x: wiki_conf_dict[x]['conf_name'] if x in wiki_conf_dict else np.nan)

# # # apply the Name from the dictionary to the wiki_name in the conf_df dataframe where there is a match
# # conf_df['conf_name'] = conf_df['Institution'].map(lambda x: wiki_conf_dict[x]['conf_name'] if x in wiki_conf_dict else np.nan)

# conf_df.head()

# # show matching dataframe
# wiki_conf_df.head()









In [None]:
# Random 20-row sample of the conference table (unseeded; not rendered mid-cell).
conf_df.sample(20)
# sort the matching dataframe by score
wiki_conf_df = wiki_conf_df.sort_values(by=['score'], ascending=False)
# Highest scores come first, so tail(30) shows the 30 lowest-scoring matches.
wiki_conf_df.tail(30)

# histogram of scores
# wiki_conf_df['score'].hist()

In [None]:
## Match from the conf names to the wiki names to check for any missed matches

# Scores at or above this value are counted as confident matches.
MATCH_THRESHOLD = 95

# create a dictionary of conf names and their best wiki match
conf_wiki_dict = {}

# Loop over each conference name and find the best match among the wiki names
for conf_name in conf_names:
    match = process.extractOne(conf_name, wiki_names)
    conf_wiki_dict[conf_name] = match

# Create a dataframe from the dictionary (index = conf name)
conf_wiki_df = pd.DataFrame.from_dict(conf_wiki_dict, orient='index')
# Rename the columns
conf_wiki_df.columns = ['wiki_name', 'score']

# Sort by score, best matches first
conf_wiki_df = conf_wiki_df.sort_values(by=['score'], ascending=False)
conf_wiki_df.tail(20)

# Histogram of scores
conf_wiki_df['score'].hist()

# number of matches at/above the threshold, then number below it
n_confident = int((conf_wiki_df['score'] >= MATCH_THRESHOLD).sum())
n_weak = int((conf_wiki_df['score'] < MATCH_THRESHOLD).sum())
print(n_confident)
print(n_weak)

# Percentage of all matches that reach the threshold. Dividing by the total
# (rather than by n_weak) yields an actual percentage and cannot hit a
# ZeroDivisionError once every name matches confidently.
n_total = n_confident + n_weak
if n_total:
    print('{:.1%}'.format(n_confident / n_total))
else:
    print('no matches to summarise')

In [None]:
# Take the last 80 rows of conf_wiki_df and display the final 40 of them.
# conf_wiki_df is sorted by descending score where it is built, so these are
# the lowest-scoring conference -> wiki matches.
test_df = conf_wiki_df.tail(80)

test_df.tail(40)

In [None]:
# Summary of the game results table: columns, dtypes and non-null counts.
game_df.info()

In [None]:
##  GAME DATA AND WIKI DATA need fixes -

# Hand-maintained renames applied to the game-data team names
# (name as it appears in the game data -> replacement name).
manual_adjustments = {
    'Texas-El Paso': 'UTEP',
    'Connecticut': 'UConn',
    'Central Florida': 'UCF',
    'Sam Houston State': 'Sam Houston',
    'Texas-San Antonio': 'UTSA',
    'Florida International': 'FIU',
    'Alabama-Birmingham': 'UAB',
    'Southern Methodist': 'SMU',
    'Southern California': 'USC',
    'Massachusetts': 'UMass',
    'Brigham Young': 'BYU',
    'Texas Christian': 'TCU',
    'Nevada-Las Vegas': 'UNLV',
    'Mississippi': 'Ole Miss',
    # "North Carolina": 'UNC',
    'North Carolina State': 'NC State',
    'Southern Mississippi': 'Southern Miss',
    'Middle Tennessee State': 'Middle Tennessee',
}

# Inverse lookup (replacement name -> original name).
# NOTE(review): this name shadows the built-in `reversed` for the rest of the
# notebook and is not used in the cells visible here; kept under its original
# name in case other code relies on it.
reversed = dict((short, full) for full, short in manual_adjustments.items())

# Apply the renames to both team columns of the game data
# (whole-value replacement, so e.g. 'Mississippi State' is left alone).
for column in ('Winner', 'Loser'):
    game_df[column] = game_df[column].replace(manual_adjustments)

# Lists of school names from each source
wiki_names = wiki_df['Team'].tolist()
all_game_names = game_df['Winner'].tolist() + game_df['Loser'].tolist()
# Unique game-data names, keeping first-seen order
game_names = list(dict.fromkeys(all_game_names))

len(game_names)
#


In [None]:
# wiki_names

In [None]:
from fuzzywuzzy import process

# Function to get the best matching name using fuzzy matching
def get_best_match(name, choices):
    """Return the best fuzzy match for ``name`` among ``choices``.

    Parameters
    ----------
    name : str
        The name to look up.
    choices : list or dict
        Candidate names passed straight to ``process.extractOne``.

    Returns
    -------
    tuple
        ``(best_match, score)``. When ``extractOne`` finds nothing (for
        example, ``choices`` is empty) the result is ``(None, 0)`` instead of
        an unpacking error.
    """
    result = process.extractOne(name, choices)
    if result is None:
        return None, 0
    # extractOne returns (match, score) for a list of choices and
    # (match, score, key) for dict-like choices; only the first two are needed.
    return result[0], result[1]

# Fuzzy-match every game-data name to a wiki name.
# name_mapping:  game name -> best wiki name
# score_mapping: game name -> score of that match
name_mapping = {}
score_mapping = {}

for game_name in game_names:
    name_mapping[game_name], score_mapping[game_name] = get_best_match(game_name, wiki_names)

# One row per game name (the index) with its matched wiki name and score.
# Both dicts were filled in the same loop, so their values line up row for row.
name_match_df = pd.DataFrame.from_dict(name_mapping, orient='index', columns=['Matched_Name'])
name_match_df['Score'] = list(score_mapping.values())

# Best matches first
name_match_df = name_match_df.sort_values(by='Score', ascending=False)

# histogram of the match scores
name_match_df['Score'].hist(bins=20)

In [None]:
# Last 10 rows of name_match_df; with the descending sort applied where the
# frame is built, these are the lowest-scoring matches.
name_match_df.tail(10)

In [None]:
import pandas as pd

# Turn the game-name -> wiki-name mapping into a two-column table, ordered
# alphabetically by game name, and write it out for manual review.
df_mapping = pd.DataFrame(
    {
        'Game Name': list(name_mapping.keys()),
        'Wiki Name': list(name_mapping.values()),
    }
).sort_values(by=['Game Name'])
df_mapping.to_csv('../TEMP/name_mapping_trial.csv', index=False)
