In [522]:
#Import / Load Lib
import requests
import pandas as pd
import numpy as np
import fuzzywuzzy.process as fwp
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [523]:
#Task 1 - Part 1 
#Part 1 of this task is to take these NHL player IDs, and extract only the following basic biographical 
    #information from their player profile on NHL.com: last name, first name, position, shooting/catching hand, 
    #height, weight, and date of birth.

url = 'https://records.nhl.com/site/api/draft'
#Fail fast on network problems: the timeout stops a stalled connection from hanging
#the cell, and raise_for_status() surfaces an HTTP error before any JSON parsing.
r = requests.get(url, timeout=30)
r.raise_for_status()
#The draft records sit under the 'data' key of the response body.
draft_records = r.json()['data']

df = pd.DataFrame(draft_records)

#Columns carried forward from the NHL.com draft records.
player_columns = ['csPlayerId', 'lastName', 'firstName', 'playerName',
                  'position', 'shootsCatches', 'height', 'weight',
                  'birthDate', 'overallPickNumber', 'pickInRound', 'roundNumber']

#Keep the 2020 picks whose lastName is not 'Void', then cast the numeric fields in one
#astype call. astype returns a new frame, so df_1 is an independent object rather than
#a slice of df (assigning columns into a slice raises SettingWithCopyWarning).
df_1 = (
    df.query("draftYear == 2020 and lastName != 'Void'")[player_columns]
      .astype({'csPlayerId': np.int64, 'height': np.int64, 'weight': np.int64})
)

#df_1.dtypes
#print(df_1) - display(df_1)

In [524]:
#Task 1 - Part 2
#Part 2 of your task is to take the results of Part 1 and add to them using the 2020 NHL Entry Draft page 
    #on Hockey-Reference.com. From here, you'll need to acquire the following information about each player: 
    #Round, Overall Pick Number, Drafting Team, and the team they were drafted from. 
    
#https://www.hockey-reference.com/draft/NHL_2020_entry.html#stats    

url = "https://www.hockey-reference.com/draft/NHL_2020_entry.html#stats"
#Download and parse the page once. header=1 takes the column names from the
#second row of the table header.
tables = pd.read_html(url, header=1)

#Clean the first table on the page:
#  - drop rows that have no value in 'Overall'
#  - drop rows whose 'Overall' cell is the literal text 'Overall' (header rows
#    repeated inside the table body)
#  - cast the numeric columns; astype returns a new frame, so nothing is assigned
#    into a filtered slice (which would raise SettingWithCopyWarning)
df_2 = (
    tables[0]
    .dropna(subset=['Overall'])
    .query("Overall != 'Overall'")
    .astype({'Overall': np.int64, 'Age': np.int64})
)


#df_2.dtypes
#print(df_2) - display(df_2)

In [525]:
#Note: you cannot use the draft selection information from the NHL.com player profile to match players to 
#the Hockey-Reference draft page, you must match players up by other means.

#Use Fuzzy Wuzzy

def fuzzy_merge(df_1, df_2, playerName, Player, threshold=90, limit=2):
    """
    Attach fuzzy name matches from ``df_2`` to a copy of ``df_1``.

    Each value of ``df_1[playerName]`` is compared with every value of
    ``df_2[Player]`` through ``process.extract``. The candidates whose score is
    at least ``threshold`` are kept and stored, joined by ``', '``, in a
    ``matches`` column.

    :param df_1: the left table to join; it is left unmodified
    :param df_2: the right table to join
    :param playerName: key column of the left table
    :param Player: key column of the right table
    :param threshold: minimum score (as reported by ``process.extract``) a
        candidate needs in order to be kept
    :param limit: maximum number of candidates requested per left-hand value
    :return: a new dataframe with the columns of ``df_1`` plus ``matches``;
        ``matches`` is an empty string when no candidate reaches ``threshold``
    """
    choices = df_2[Player].tolist()

    def _matches_for(name):
        # process.extract yields (choice, score) pairs for a list of choices.
        candidates = process.extract(name, choices, limit=limit)
        return ', '.join(c[0] for c in candidates if c[1] >= threshold)

    # assign() builds a new frame, so the caller's df_1 is never mutated
    # (and no SettingWithCopyWarning is raised when df_1 is a slice).
    return df_1.assign(matches=df_1[playerName].apply(_matches_for))

In [526]:
#Fuzzy Wuzzy Cont...
#Attach to every NHL.com player the best-scoring Hockey-Reference name (score >= 90).
#limit=1 keeps 'matches' a single name, which the equality joins below require; a
#', '-joined list of several names could never equal one 'Player' value.
#A copy is passed so the result never aliases df_1.
df_fuz = fuzzy_merge(df_1.copy(), df_2, 'playerName', 'Player', threshold=90, limit=1)

#Strict pairing: the matched name AND the listed position must both agree.
df_fuzzy = pd.merge(df_2, df_fuz, left_on=['Player', 'Pos'], right_on=['matches', 'position'])

#Final table: pair the two sources on the fuzzy-matched player name, not on the pick
#number, because the task forbids using NHL.com draft-selection data as the join key.
#how='left' keeps every NHL.com player; one without a match shows NaN in the
#Hockey-Reference columns instead of silently disappearing.
df_final = pd.merge(df_fuz, df_2, how='left', left_on='matches', right_on='Player')

In [528]:
#Rename Columns
#Explicit source -> output mapping: each header is tied to its column by name, so the
#two lists cannot drift out of step. 'position' is included because the Part 1 brief
#lists it among the required biographical fields.
column_names = {
    'csPlayerId': 'Player_ID',
    'lastName': 'Last_Name',
    'firstName': 'First_Name',
    'position': 'Position',
    'shootsCatches': 'Shooting_Catching_Hand',
    'height': 'Height',
    'weight': 'Weight',
    'birthDate': 'DOB',
    'roundNumber': 'Round',
    'overallPickNumber': 'Overall_Pick_Number',
    'Team': 'Drafting_Team',
    'Amateur Team': 'Drafted_From_Team',
}

#Select the output columns in mapping order, then rename; rename returns a new frame.
df_fin = df_final[list(column_names)].rename(columns=column_names)

In [529]:
#Write the final table to disk; index=False keeps the row index out of the CSV.
df_fin.to_csv(path_or_buf="Katherine_Pearson_Task_1.csv", index=False)

In [530]:
#pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
