# SCRATCH BOOK

### Join 2023 Player Stats to 2024 Rosters

In [17]:
import pandas as pd
import numpy as np
import os
import sys

## Path to the data
roster_path = os.path.join("..", "data", "roster_2024_current_v3.csv")
stat_path = os.path.join("..", "data", "player_stats_2023_v1.csv")

# Load the data
roster_df = pd.read_csv(roster_path)
stat_df = pd.read_csv(stat_path)

# Check the data
roster_df.head()
# stat_df.head()

Unnamed: 0,Current Team,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,Height_Inches,Draft_Year,NHL_Team,D_Round,Last Team,League,City,State_Province,Country
0,Lake Superior,Barone,Adam,6,Defensemen,Fr,1-Jun,174,5/6/2004,"Sault Ste. Marie, Ont.",73,,,,Trail,BCHL,Sault Ste. Marie,Ontario,Canada
1,Lake Superior,Blanchett,Jack,16,Defensemen,So,11-May,185,5/12/2003,"Monroe, Mich.",71,,,,Powell,BCHL,Monroe,Michigan,USA
2,Lake Superior,Brown,Mike,3,Defensemen,Jr,2-Jun,209,4/3/2001,"Belmont, Mass.",74,,,,Merrimack,,Belmont,Massachusetts,USA
3,Lake Superior,Bushy,Evan,5,Defensemen,So,1-Jun,195,3/26/2002,"Mankato, Minn.",73,,,,Trail,BCHL,Mankato,Minnesota,USA
4,Lake Superior,Conrad,Jacob,4,Defensemen,Fr,11-May,180,5/18/2002,"Green Bay, Wis.",71,,,,Fairbanks,NAHL,Green Bay,Wisconsin,USA


#### Data Transform

In [18]:
# Split stats Clean_Player into first and last name
stat_df['First_Name'] = stat_df['Clean_Player'].str.split(" ").str[0]
stat_df['Last_Name'] = stat_df['Clean_Player'].str.split(" ").str[1:]


stat_df['Last_Name'] = stat_df['Last_Name'].str[0].str.replace('[','').str.replace(']','') # Remove the brackets from the last name
# Remove periods dashes ect from both names
stat_df['First_Name'] = stat_df['First_Name'].str.replace('.','').str.replace('-',' ')
stat_df['Last_Name'] = stat_df['Last_Name'].str.replace('.','').str.replace('-',' ')
roster_df['First_Name'] = roster_df['First_Name'].str.replace('.','').str.replace('-',' ')
roster_df['Last_Name'] = roster_df['Last_Name'].str.replace('.','').str.replace('-',' ')
# strip white space
stat_df['First_Name'] = stat_df['First_Name'].str.strip()
stat_df['Last_Name'] = stat_df['Last_Name'].str.strip()
roster_df['First_Name'] = roster_df['First_Name'].str.strip()
roster_df['Last_Name'] = roster_df['Last_Name'].str.strip()

# Rename Team to Team_2023 for clarity
stat_df.rename(columns={'Team':'Team_2023'}, inplace=True)
# Rename Current_Team to Team_2024 for clarity
roster_df.rename(columns={'Current Team':'Team_2024'}, inplace=True)

stat_df.head()
# OUTPUT THE DATA TO TEMP CSVs
roster_df.to_csv(os.path.join("..", "TEMP", "TEST_roster_2024_current_v4.csv"), index=False)
stat_df.to_csv(os.path.join("..", "TEMP", "TEST_player_stats_2023_v2.csv"), index=False)

In [19]:
## Try a quick merge
merged_df = pd.merge(roster_df, stat_df, left_on=['First_Name', 'Last_Name'], right_on=['First_Name', 'Last_Name'], how='outer', suffixes=('_2024', '_2023'))
merged_df.head()

# Print report of the merge
print(f"Number of players in the roster: {len(roster_df)}")
print(f"Number of players in the stats: {len(stat_df)}")
print(f"Number of players in the merged data: {len(merged_df)}")


# Find Number Number of players whos Team_2023 does not match Team_2024
mismatched_teams = merged_df[merged_df['Team_2023'] != merged_df['Team_2024']]
print(f"Number of players with mismatched teams: {len(mismatched_teams)}")




print(merged_df.info())
merged_df.head()

Number of players in the roster: 1820
Number of players in the stats: 1729
Number of players in the merged data: 2309
Number of players with mismatched teams: 1374
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2309 entries, 0 to 2308
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Team_2024       1825 non-null   object 
 1   Last_Name       2309 non-null   object 
 2   First_Name      2309 non-null   object 
 3   No              1825 non-null   float64
 4   Position        1825 non-null   object 
 5   Yr              1825 non-null   object 
 6   Ht              1822 non-null   object 
 7   Wt              1825 non-null   float64
 8   DOB             1735 non-null   object 
 9   Hometown        1825 non-null   object 
 10  Height_Inches   1825 non-null   float64
 11  Draft_Year      225 non-null    float64
 12  NHL_Team        225 non-null    object 
 13  D_Round         225 non-null    float64
 14  Last

Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,Country,Clean_Player,Team_2023,G,A,Pts,plus_minus,Sh,PIM,Games_Played
0,Long Island,Casperson,AJ,3.0,Defensemen,Jr,2-Jun,190.0,7/19/2001,"Flower Mound, Texas",...,USA,AJ Casperson,Long Island,0.0,1.0,1.0,1.0,7.0,2.0,12.0
1,Bentley,Hodges,AJ,20.0,Forwards,Gr,Jun-00,175.0,8/24/2001,"Littleton, Colo.",...,USA,A.J. Hodges,Bentley,6.0,9.0,15.0,-1.0,57.0,2.0,29.0
2,Bemidji State,Macaulay,AJ,12.0,Defensemen,Sr,9-May,185.0,4/12/2002,"Bonnyville, Alb.",...,Canada,A.J. Macaulay,Alaska,5.0,10.0,15.0,9.0,44.0,14.0,34.0
3,Quinnipiac,Bohlinger,Aaron,5.0,Defensemen,Gr,9-May,165.0,8/25/2000,"Walden, N.Y.",...,USA,Aaron Bohlinger,Massachusetts,3.0,5.0,8.0,1.0,22.0,4.0,34.0
4,Long Island,Grounds,Aaron,23.0,Forwards,Sr,2-Jun,190.0,12/24/1999,"Jamestown, N.D.",...,USA,Aaron Grounds,Long Island,1.0,2.0,3.0,-5.0,14.0,16.0,11.0


In [20]:
## Drop players who aren't playing this year (No Team_2024)
merged_df = merged_df.dropna(subset=['Team_2024'])
print(merged_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1825 entries, 0 to 2308
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Team_2024       1825 non-null   object 
 1   Last_Name       1825 non-null   object 
 2   First_Name      1825 non-null   object 
 3   No              1825 non-null   float64
 4   Position        1825 non-null   object 
 5   Yr              1825 non-null   object 
 6   Ht              1822 non-null   object 
 7   Wt              1825 non-null   float64
 8   DOB             1735 non-null   object 
 9   Hometown        1825 non-null   object 
 10  Height_Inches   1825 non-null   float64
 11  Draft_Year      225 non-null    float64
 12  NHL_Team        225 non-null    object 
 13  D_Round         225 non-null    float64
 14  Last Team       1815 non-null   object 
 15  League          1767 non-null   object 
 16  City            1825 non-null   object 
 17  State_Province  1825 non-null   object

In [23]:


# Convert all number columns to int
int_columns = ['No', 'Height_Inches', 'Wt', 'Draft_Year', 'D_Round', 
               'G', 'A', 'Pts', 'plus_minus', 'Sh', 'PIM', 'Games_Played']

for col in int_columns:
    merged_df[col] = merged_df[col].astype('Int64')

merged_df.head()

Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,Country,Clean_Player,Team_2023,G,A,Pts,plus_minus,Sh,PIM,Games_Played
0,Long Island,Casperson,AJ,3,Defensemen,Jr,2-Jun,190,7/19/2001,"Flower Mound, Texas",...,USA,AJ Casperson,Long Island,0,1,1,1,7,2,12
1,Bentley,Hodges,AJ,20,Forwards,Gr,Jun-00,175,8/24/2001,"Littleton, Colo.",...,USA,A.J. Hodges,Bentley,6,9,15,-1,57,2,29
2,Bemidji State,Macaulay,AJ,12,Defensemen,Sr,9-May,185,4/12/2002,"Bonnyville, Alb.",...,Canada,A.J. Macaulay,Alaska,5,10,15,9,44,14,34
3,Quinnipiac,Bohlinger,Aaron,5,Defensemen,Gr,9-May,165,8/25/2000,"Walden, N.Y.",...,USA,Aaron Bohlinger,Massachusetts,3,5,8,1,22,4,34
4,Long Island,Grounds,Aaron,23,Forwards,Sr,2-Jun,190,12/24/1999,"Jamestown, N.D.",...,USA,Aaron Grounds,Long Island,1,2,3,-5,14,16,11


In [24]:
## OUTPUT CSV TO TEMP FOR INSPECTION
merged_df.to_csv(os.path.join("..", "data", "roster_2024_with_2023_stats.csv"), index=False)

### Transform congressional demographic data

In [None]:
# import pandas as pd
# import numpy as np
# import os
# import geopandas as gpd


# ## PATHS ##
# # ## 118 Congress Shapefile
# # shape_path = os.path.join('..', 'data', 'vault', '118th_congress', 'USA_118th_Congressional_Districts.shp')
# # ## Load Shapefile
# # gdf = gpd.read_file(shape_path)

# # Income data table - 5 Year Average 2022
# income_path = os.path.join('..', 'data', 'vault', '118th_congress', 'income_data', 'ACSST5Y2022.S1903-Data.csv')
# income_df = pd.read_csv(income_path, skiprows=1) # Load Income Data

# # Summary table with Populations and Representative Names
# summary_path = os.path.join('..', 'data', 'vault', 'USA_118th_Congressional_Districts_info_table.csv')
# summary_df = pd.read_csv(summary_path)

# # Check 
# # gdf.head()
# # income_df.head()
# # summary_df.head()


In [None]:
# # Check 
# gdf.head()
# income_df.head()
# # summary_df.head()

### Manipulate Image icons so they are all 300 x 300 px squares
- making sure they are all squares will make resizing issues easier later on
    - The aspect ratio is getting screwed up during resizing for icons that are not square

In [None]:
from PIL import Image, ImageOps
import os

# Directory where the logos are stored
logo_dir = os.path.join('..', 'images', 'logos')

# Make logos square by adding transparent space equally on both sides
for logo_file in os.listdir(logo_dir):
    logo_path = os.path.join(logo_dir, logo_file)
    
    # Check if the path is a file and not a directory
    if os.path.isfile(logo_path):
        with Image.open(logo_path) as img:
            # Ensure the image has an alpha channel (for transparency)
            img = img.convert("RGBA")
            
            width, height = img.size
            
            # If the image is already square, no changes are needed
            if width == height:
                continue
            
            # Calculate padding to add on the shorter side to make the image square
            if width > height:
                padding = (width - height) // 2
                new_img = ImageOps.expand(img, border=(0, padding, 0, padding), fill=(0, 0, 0, 0))
            else:
                padding = (height - width) // 2
                new_img = ImageOps.expand(img, border=(padding, 0, padding, 0), fill=(0, 0, 0, 0))
            
            # Save the padded square image, overwriting the original
            new_img.save(logo_path)

print("All logos made square by adding transparent space equally to each side.")


## Verify the coordinates for the rinks in arena_info

In [None]:
# # Dependencies
# import os
# import requests
# import pandas as pd

# # Path to arena file
# arena_file = os.path.join('..','data', 'arena_school_info.csv')
# arena_df = pd.read_csv(arena_file)

# # Open Roster File To Clean State/Provences Names
# roster_file = os.path.join('..','data', 'roster_2024_current_v2.csv')
# roster_df = pd.read_csv(roster_file)

# roster_df.head()


In [None]:
# ## Get list of Unique State/Province Names
# unique_states = roster_df['State_Province'].unique()
# unique_states

# ## Dictionary to standardize state/province names

# standardized_locations = {
#     'Ont.': 'Ontario', 'Mich.': 'Michigan', 'Mass.': 'Massachusetts', 'Minn.': 'Minnesota', 
#     'Wis.': 'Wisconsin', 'Sweden': 'Sweden', 'Germany': 'Germany', 'B.C.': 'British Columbia',
#     'N.Y.': 'New York', 'Wash.': 'Washington', 'Que.': 'Quebec', 'Alb.': 'Alberta', 
#     'N.J.': 'New Jersey', 'Sask.': 'Saskatchewan', 'Conn.': 'Connecticut', 'Mo.': 'Missouri',
#     'Texas': 'Texas', 'Calif.': 'California', 'DC': 'District of Columbia', 'Fla.': 'Florida',
#     'Ohio': 'Ohio', 'Ill.': 'Illinois', 'Pa.': 'Pennsylvania', 'Ga.': 'Georgia',
#     'Mont.': 'Montana', 'Tenn.': 'Tennessee', 'Colo.': 'Colorado', 'Va.': 'Virginia', 
#     'Vt.': 'Vermont', 'R.I.': 'Rhode Island', 'Md.': 'Maryland', 'Ariz.': 'Arizona', 
#     'Wisc.': 'Wisconsin', 'Iowa': 'Iowa', 'Man.': 'Manitoba', 'Slovakia': 'Slovakia', 
#     'N.D.': 'North Dakota', 'N.C.': 'North Carolina', 'P.E.I.': 'Prince Edward Island',
#     'N.H.': 'New Hampshire', 'Alaska': 'Alaska', 'Belarus': 'Belarus', 'MB': 'Manitoba',
#     'Russia': 'Russia', 'Finland': 'Finland', 'Newf.': 'Newfoundland and Labrador', 
#     'Hungary': 'Hungary', 'SUI': 'Switzerland', 'S.C.': 'South Carolina', 'Latvia': 'Latvia',
#     'Czech Republic': 'Czech Republic', 'N.B.': 'New Brunswick', 'Great Britain': 'United Kingdom', 
#     'NB': 'New Brunswick', 'Norway': 'Norway', 'N.S.': 'Nova Scotia', 'Ind.': 'Indiana', 
#     'NWT': 'Northwest Territories', 'AUT': 'Austria', 'Idaho': 'Idaho', 'S.D.': 'South Dakota', 
#     'Switzerland': 'Switzerland', 'Ore.': 'Oregon', 'Wyo.': 'Wyoming', 'Utah': 'Utah', 
#     'ITA': 'Italy', 'Slovenia': 'Slovenia', 'YT': 'Yukon', 'Del.': 'Delaware', 'Maine': 'Maine',
#     'Poland': 'Poland', 'Yukon': 'Yukon', 'Ukraine': 'Ukraine', 'Japan': 'Japan', 'Neb.': 'Nebraska'
# }

# ## Apply the standardization to the State/Province column
# roster_df['State_Province'] = roster_df['State_Province'].replace(standardized_locations)

# # Check the unique values after standardization
# roster_df['State_Province'].unique()
# print(roster_df['State_Province'].unique())

In [6]:
## Output the cleaned roster to a new CSV file
cleaned_roster_file = os.path.join('..','data', 'roster_cleaned_state_prov_2024.csv')
roster_df.to_csv(cleaned_roster_file, index=False)

In [6]:
# import requests
# import pandas as pd

# import requests
# import pandas as pd

# # Define the function to check the location using Google Places API
# def check_location(lat, lng, api_key):
#     url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
#     params = {
#         'location': f'{lat},{lng}',
#         'radius': 500,  # Distance in meters from the provided coordinates
#         'type': 'stadium',  # Filter search to stadiums/arenas
#         'key': api_key
#     }
    
#     # Debugging: Print the URL and parameters
#     print(f"Requesting URL: {url}")
#     print(f"Parameters: {params}")
    
#     response = requests.get(url, params=params)
    
#     # Debugging: Print the response code and content
#     print(f"Response status code: {response.status_code}")
#     print(f"Response content: {response.text}")
    
#     if response.status_code == 200:
#         results = response.json().get('results')
#         if results:
#             return results[0].get('name'), results[0].get('vicinity')
#         else:
#             return None, "No results found"
#     else:
#         return None, f"API request failed with status {response.status_code}"

# # Define the function to verify coordinates in the DataFrame
# def verify_coordinates(df, api_key):
#     results = []
#     for index, row in df.iterrows():
#         lat = row['Latitude']
#         lng = row['Longitude']
#         arena_name = row['Arena']
        
#         # Debugging: Print the current coordinates and arena being checked
#         print(f"Checking coordinates for arena: {arena_name}")
#         print(f"Latitude: {lat}, Longitude: {lng}")
        
#         # Get the name and vicinity of the nearest stadium/arena
#         name, vicinity = check_location(lat, lng, api_key)
        
#         # Append the original data and verification results
#         results.append({
#             'Arena': arena_name,
#             'Latitude': lat,
#             'Longitude': lng,
#             'Google Places Name': name,
#             'Vicinity': vicinity
#         })
#     return pd.DataFrame(results)

# # Load your API key


# # Assuming arena_df is your DataFrame
# verified_df = verify_coordinates(arena_df, api_key)

# # Output the results
# print(verified_df)


In [7]:
# verified_df.head(10)

In [None]:
# ## Version 2 of Arena Location verifications
# ## Returns 5 closest Google Places to coordinates given

# import requests
# import pandas as pd

# # Define the function to check the 5 closest places using Google Places API
# def check_nearby_places(lat, lng, api_key):
#     url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
#     params = {
#         'location': f'{lat},{lng}',
#         'radius': 500,  # Distance in meters from the provided coordinates
#         'key': api_key
#     }
    
#     # Debugging: Print the URL and parameters being sent to the API
#     print(f"Requesting places near lat: {lat}, lng: {lng}")
#     print(f"Request URL: {url}")
#     print(f"Parameters: {params}")
    
#     response = requests.get(url, params=params)
    
#     # Debugging: Print the response status and content
#     print(f"Response status code: {response.status_code}")
#     print(f"Response content: {response.text}\n")  # This shows the full response from the API
    
#     if response.status_code == 200:
#         results = response.json().get('results')
#         if results:
#             # Return the top 5 closest places
#             return [(result.get('name'), result.get('vicinity')) for result in results[:5]]
#         else:
#             return [("None", "No results found")]
#     else:
#         return [("None", f"API request failed with status {response.status_code}")]

# # Define the function to verify coordinates and return the 5 closest places
# def verify_coordinates(df, api_key):
#     results = []
#     for index, row in df.iterrows():
#         lat = row['Latitude']
#         lng = row['Longitude']
#         arena_name = row['Arena']
#         school_name = row['School']
        
#         # Debugging: Print the current arena and coordinates being checked
#         print(f"\nChecking nearby places for arena: {arena_name} (School: {school_name})")
#         print(f"Latitude: {lat}, Longitude: {lng}")
        
#         # Get the 5 closest places
#         nearby_places = check_nearby_places(lat, lng, api_key)
        
#         # Add each place to the results, along with the original data
#         for place in nearby_places:
#             results.append({
#                 'Arena': arena_name,
#                 'School': school_name,
#                 'Latitude': lat,
#                 'Longitude': lng,
#                 'Google Places Name': place[0],
#                 'Vicinity': place[1]
#             })
            
#     return pd.DataFrame(results)

# # Load your API key
# api_key = ''

# # Assuming arena_df is your DataFrame
# verified_df = verify_coordinates(arena_df, api_key)

# # Output the results
# print(verified_df)


In [5]:
verified_df.head(10)

## OUTPUT TO TEMP FOLDER FOR MANUAL REVIEW
# output_file = os.path.join('..','TEMP', 'arena_school_info_place_checkV3.csv')
# verified_df.to_csv(output_file, index=False)

