# SCRATCH BOOK

### Transform congressional demographic data

In [2]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd


## PATHS ##
# ## 118 Congress Shapefile
# shape_path = os.path.join('..', 'data', 'vault', '118th_congress', 'USA_118th_Congressional_Districts.shp')
# ## Load Shapefile
# gdf = gpd.read_file(shape_path)

# ACS 5-year (2022) income table S1903, stored under the 118th Congress folder
income_path = os.path.join(
    '..', 'data', 'vault', '118th_congress', 'income_data',
    'ACSST5Y2022.S1903-Data.csv',
)
# skiprows=1 drops the first line of the file, so the following line supplies the column names
income_df = pd.read_csv(income_path, skiprows=1)

# District summary table (populations and representative names)
summary_path = os.path.join(
    '..', 'data', 'vault',
    'USA_118th_Congressional_Districts_info_table.csv',
)
summary_df = pd.read_csv(summary_path)

# Check 
# gdf.head()
# income_df.head()
# summary_df.head()


Unnamed: 0,DISTRICTID,STFIPS,CDFIPS,STATE_ABBR,NAME,LAST_NAME,PARTY,SQMI,STATE_NAME,geometry
0,101,1,1,AL,Jerry Carl,Carl,Republican,5819.5,Alabama,"MULTIPOLYGON (((-88.07826 30.25212, -88.07830 ..."
1,102,1,2,AL,Barry Moore,Moore,Republican,10524.26,Alabama,"POLYGON ((-85.05603 32.06306, -85.05593 32.063..."
2,103,1,3,AL,Mike Rogers,Rogers,Republican,8456.48,Alabama,"POLYGON ((-85.51362 34.52383, -85.51304 34.521..."
3,104,1,4,AL,Robert B. Aderholt,Aderholt,Republican,9056.17,Alabama,"POLYGON ((-87.98492 35.00594, -87.98398 35.005..."
4,105,1,5,AL,Dale W. Strong,Strong,Republican,3501.97,Alabama,"MULTIPOLYGON (((-86.14981 34.53364, -86.14982 ..."


In [6]:
# Check
# `gdf` is only created by the shapefile load in the paths cell above, and that load is
# commented out, so calling gdf.head() here raises NameError on a fresh kernel. Its result
# was also never shown, because a cell only displays its last expression.
# gdf.head()
income_df.head()
# summary_df.head()

Unnamed: 0,Geography,Geographic Area Name,Estimate!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households,Margin of Error!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households,Estimate!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households!!One race--!!White,Margin of Error!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households!!One race--!!White,Estimate!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households!!One race--!!Black or African American,Margin of Error!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households!!One race--!!Black or African American,Estimate!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households!!One race--!!American Indian and Alaska Native,Margin of Error!!Number!!HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER!!Households!!One race--!!American Indian and Alaska Native,...,Margin of Error!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Female householder!!Living alone,Estimate!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Female householder!!Not living alone,Margin of Error!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Female householder!!Not living alone,Estimate!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Male householder,Margin of Error!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Male householder,Estimate!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Male householder!!Living alone,Margin of Error!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Male householder!!Living alone,Estimate!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Male householder!!Not living alone,Margin of 
Error!!Median income (dollars)!!NONFAMILY HOUSEHOLDS!!Nonfamily households!!Male householder!!Not living alone,Unnamed: 242
0,5001800US0101,"Congressional District 1 (118th Congress), Ala...",276035,1808,187444,1586,73517,1326,1752,292,...,1034,52211,5386,40815,1807,36278,2142,70927,10222,
1,5001800US0102,"Congressional District 2 (118th Congress), Ala...",274258,1657,177817,1365,82052,1364,914,166,...,1282,49929,5068,39959,1972,36471,2135,64596,6602,
2,5001800US0103,"Congressional District 3 (118th Congress), Ala...",275196,1731,193846,1450,67430,1263,435,122,...,775,38312,8323,34641,1937,31884,1791,53840,4796,
3,5001800US0104,"Congressional District 4 (118th Congress), Ala...",270389,1652,233228,1692,20483,809,1774,325,...,754,51038,4357,32940,2161,29012,1500,59071,3494,
4,5001800US0105,"Congressional District 5 (118th Congress), Ala...",287649,1510,215552,1942,51355,1190,1666,276,...,1543,56946,3261,51440,2531,46220,2283,78948,5836,


### Pad image icons so they are all square (target size: 300 x 300 px)
- Making every icon square avoids resizing problems later on
    - Icons that are not square have their aspect ratio distorted when they are resized
- Note: the cell below only pads the icons to a square; it does not resize them to 300 x 300 px

In [7]:
from PIL import Image, ImageOps
import os

# Directory where the logos are stored
logo_dir = os.path.join('..', 'images', 'logos')


def _square_border(width, height):
    """Return the ImageOps.expand border that pads a width x height image to a square.

    The result is a (left, top, right, bottom) tuple. Padding is added only along
    the shorter dimension and is split as evenly as possible between its two
    sides. When the difference is odd, the extra pixel goes to the bottom/right
    side so that the padded image is exactly square.
    """
    shortfall = abs(width - height)
    leading = shortfall // 2
    trailing = shortfall - leading  # absorbs the odd pixel, if any
    if width > height:
        return (0, leading, 0, trailing)
    return (leading, 0, trailing, 0)


# Make logos square by adding transparent space on both sides of the shorter dimension
for logo_file in os.listdir(logo_dir):
    logo_path = os.path.join(logo_dir, logo_file)

    # Only regular files are candidate logos; skip sub-directories
    if not os.path.isfile(logo_path):
        continue

    try:
        with Image.open(logo_path) as img:
            # Ensure the image has an alpha channel (for transparency). convert()
            # returns a loaded copy, so the source file is closed before it is
            # overwritten below.
            rgba_img = img.convert("RGBA")
    except OSError as exc:
        # Pillow reports files it cannot identify or decode with OSError subclasses;
        # skip them instead of aborting the whole batch.
        print(f"Skipping {logo_file}: could not be read as an image ({exc})")
        continue

    width, height = rgba_img.size

    # If the image is already square, no changes are needed
    if width == height:
        continue

    square_img = ImageOps.expand(
        rgba_img,
        border=_square_border(width, height),
        fill=(0, 0, 0, 0),  # fully transparent padding
    )

    # Save the padded square image, overwriting the original
    square_img.save(logo_path)

print("All logos made square by adding transparent space equally to each side.")


All logos made square by adding transparent space equally to each side.


## Verify the coordinates for the rinks in arena_info

In [4]:
# # Dependencies
# import os
# import requests
# import pandas as pd

# # Path to arena file
# arena_file = os.path.join('..','data', 'arena_school_info.csv')
# arena_df = pd.read_csv(arena_file)

# # Open roster file to clean State/Province names
# roster_file = os.path.join('..','data', 'roster_2024_current_v2.csv')
# roster_df = pd.read_csv(roster_file)

# roster_df.head()


Unnamed: 0,Current Team,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,Height_Inches,Draft_Year,NHL_Team,D_Round,Last Team,League,City,State_Province,Country
0,Lake Superior,Barone,Adam,6,Defensemen,Fr,6-1,174,5/6/2004,"Sault Ste. Marie, Ont.",73,,,,Trail,BCHL,Sault Ste. Marie,Ont.,Canada
1,Lake Superior,Blanchett,Jack,16,Defensemen,So,5-11,185,5/12/2003,"Monroe, Mich.",71,,,,Powell,BCHL,Monroe,Mich.,USA
2,Lake Superior,Brown,Mike,3,Defensemen,Jr,6-2,209,4/3/2001,"Belmont, Mass.",74,,,,Merrimack,,Belmont,Mass.,USA
3,Lake Superior,Bushy,Evan,5,Defensemen,So,6-1,195,3/26/2002,"Mankato, Minn.",73,,,,Trail,BCHL,Mankato,Minn.,USA
4,Lake Superior,Conrad,Jacob,4,Defensemen,Fr,5-11,180,5/18/2002,"Green Bay, Wis.",71,,,,Fairbanks,NAHL,Green Bay,Wis.,USA


In [5]:
# ## Get list of Unique State/Province Names
# unique_states = roster_df['State_Province'].unique()
# unique_states

# ## Dictionary to standardize state/province names

# standardized_locations = {
#     'Ont.': 'Ontario', 'Mich.': 'Michigan', 'Mass.': 'Massachusetts', 'Minn.': 'Minnesota', 
#     'Wis.': 'Wisconsin', 'Sweden': 'Sweden', 'Germany': 'Germany', 'B.C.': 'British Columbia',
#     'N.Y.': 'New York', 'Wash.': 'Washington', 'Que.': 'Quebec', 'Alb.': 'Alberta', 
#     'N.J.': 'New Jersey', 'Sask.': 'Saskatchewan', 'Conn.': 'Connecticut', 'Mo.': 'Missouri',
#     'Texas': 'Texas', 'Calif.': 'California', 'DC': 'District of Columbia', 'Fla.': 'Florida',
#     'Ohio': 'Ohio', 'Ill.': 'Illinois', 'Pa.': 'Pennsylvania', 'Ga.': 'Georgia',
#     'Mont.': 'Montana', 'Tenn.': 'Tennessee', 'Colo.': 'Colorado', 'Va.': 'Virginia', 
#     'Vt.': 'Vermont', 'R.I.': 'Rhode Island', 'Md.': 'Maryland', 'Ariz.': 'Arizona', 
#     'Wisc.': 'Wisconsin', 'Iowa': 'Iowa', 'Man.': 'Manitoba', 'Slovakia': 'Slovakia', 
#     'N.D.': 'North Dakota', 'N.C.': 'North Carolina', 'P.E.I.': 'Prince Edward Island',
#     'N.H.': 'New Hampshire', 'Alaska': 'Alaska', 'Belarus': 'Belarus', 'MB': 'Manitoba',
#     'Russia': 'Russia', 'Finland': 'Finland', 'Newf.': 'Newfoundland and Labrador', 
#     'Hungary': 'Hungary', 'SUI': 'Switzerland', 'S.C.': 'South Carolina', 'Latvia': 'Latvia',
#     'Czech Republic': 'Czech Republic', 'N.B.': 'New Brunswick', 'Great Britain': 'United Kingdom', 
#     'NB': 'New Brunswick', 'Norway': 'Norway', 'N.S.': 'Nova Scotia', 'Ind.': 'Indiana', 
#     'NWT': 'Northwest Territories', 'AUT': 'Austria', 'Idaho': 'Idaho', 'S.D.': 'South Dakota', 
#     'Switzerland': 'Switzerland', 'Ore.': 'Oregon', 'Wyo.': 'Wyoming', 'Utah': 'Utah', 
#     'ITA': 'Italy', 'Slovenia': 'Slovenia', 'YT': 'Yukon', 'Del.': 'Delaware', 'Maine': 'Maine',
#     'Poland': 'Poland', 'Yukon': 'Yukon', 'Ukraine': 'Ukraine', 'Japan': 'Japan', 'Neb.': 'Nebraska'
# }

# ## Apply the standardization to the State/Province column
# roster_df['State_Province'] = roster_df['State_Province'].replace(standardized_locations)

# # Check the unique values after standardization
# roster_df['State_Province'].unique()
# print(roster_df['State_Province'].unique())

['Ontario' 'Michigan' 'Massachusetts' 'Minnesota' 'Wisconsin' 'Sweden'
 'Germany' 'British Columbia' 'New York' 'Washington' 'Quebec' 'Alberta'
 'New Jersey' 'Saskatchewan' 'Connecticut' 'Missouri' 'Texas' 'California'
 'District of Columbia' 'Florida' 'Ohio' 'Illinois' 'Pennsylvania'
 'Georgia' 'Montana' 'Tennessee' 'Colorado' 'Virginia' 'Vermont'
 'Rhode Island' 'Maryland' 'Arizona' 'Iowa' 'Manitoba' 'Slovakia'
 'North Dakota' 'North Carolina' 'Prince Edward Island' 'New Hampshire'
 'Alaska' 'Belarus' 'Russia' 'Finland' 'Newfoundland and Labrador'
 'Hungary' 'Switzerland' 'South Carolina' 'Latvia' 'Czech Republic'
 'New Brunswick' 'United Kingdom' 'Norway' 'Nova Scotia' 'Indiana'
 'Northwest Territories' 'Austria' 'Idaho' 'South Dakota' 'Oregon'
 'Wyoming' 'Utah' 'Italy' 'Slovenia' 'Yukon' 'Delaware' 'Maine' 'Poland'
 'Ukraine' 'Japan' 'Nebraska']


In [6]:
## Output the cleaned roster to a new CSV file
# roster_df is loaded and standardized by the roster cells above, which are currently
# commented out. Fail with an actionable message instead of a bare NameError.
if 'roster_df' not in globals():
    raise NameError(
        "roster_df is not defined: un-comment and run the roster load and "
        "State_Province standardization cells before exporting"
    )
cleaned_roster_file = os.path.join('..','data', 'roster_cleaned_state_prov_2024.csv')
roster_df.to_csv(cleaned_roster_file, index=False)

In [6]:
# import requests
# import pandas as pd

# import requests
# import pandas as pd

# # Define the function to check the location using Google Places API
# def check_location(lat, lng, api_key):
#     url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
#     params = {
#         'location': f'{lat},{lng}',
#         'radius': 500,  # Distance in meters from the provided coordinates
#         'type': 'stadium',  # Filter search to stadiums/arenas
#         'key': api_key
#     }
    
#     # Debugging: Print the URL and parameters
#     print(f"Requesting URL: {url}")
#     print(f"Parameters: {params}")
    
#     response = requests.get(url, params=params)
    
#     # Debugging: Print the response code and content
#     print(f"Response status code: {response.status_code}")
#     print(f"Response content: {response.text}")
    
#     if response.status_code == 200:
#         results = response.json().get('results')
#         if results:
#             return results[0].get('name'), results[0].get('vicinity')
#         else:
#             return None, "No results found"
#     else:
#         return None, f"API request failed with status {response.status_code}"

# # Define the function to verify coordinates in the DataFrame
# def verify_coordinates(df, api_key):
#     results = []
#     for index, row in df.iterrows():
#         lat = row['Latitude']
#         lng = row['Longitude']
#         arena_name = row['Arena']
        
#         # Debugging: Print the current coordinates and arena being checked
#         print(f"Checking coordinates for arena: {arena_name}")
#         print(f"Latitude: {lat}, Longitude: {lng}")
        
#         # Get the name and vicinity of the nearest stadium/arena
#         name, vicinity = check_location(lat, lng, api_key)
        
#         # Append the original data and verification results
#         results.append({
#             'Arena': arena_name,
#             'Latitude': lat,
#             'Longitude': lng,
#             'Google Places Name': name,
#             'Vicinity': vicinity
#         })
#     return pd.DataFrame(results)

# # Load your API key


# # Assuming arena_df is your DataFrame
# verified_df = verify_coordinates(arena_df, api_key)

# # Output the results
# print(verified_df)


In [7]:
# verified_df.head(10)

In [None]:
# ## Version 2 of Arena Location verifications
# ## Returns 5 closest Google Places to coordinates given

# import requests
# import pandas as pd

# # Define the function to check the 5 closest places using Google Places API
# def check_nearby_places(lat, lng, api_key):
#     url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
#     params = {
#         'location': f'{lat},{lng}',
#         'radius': 500,  # Distance in meters from the provided coordinates
#         'key': api_key
#     }
    
#     # Debugging: Print the URL and parameters being sent to the API
#     print(f"Requesting places near lat: {lat}, lng: {lng}")
#     print(f"Request URL: {url}")
#     print(f"Parameters: {params}")
    
#     response = requests.get(url, params=params)
    
#     # Debugging: Print the response status and content
#     print(f"Response status code: {response.status_code}")
#     print(f"Response content: {response.text}\n")  # This shows the full response from the API
    
#     if response.status_code == 200:
#         results = response.json().get('results')
#         if results:
#             # Return the top 5 closest places
#             return [(result.get('name'), result.get('vicinity')) for result in results[:5]]
#         else:
#             return [("None", "No results found")]
#     else:
#         return [("None", f"API request failed with status {response.status_code}")]

# # Define the function to verify coordinates and return the 5 closest places
# def verify_coordinates(df, api_key):
#     results = []
#     for index, row in df.iterrows():
#         lat = row['Latitude']
#         lng = row['Longitude']
#         arena_name = row['Arena']
#         school_name = row['School']
        
#         # Debugging: Print the current arena and coordinates being checked
#         print(f"\nChecking nearby places for arena: {arena_name} (School: {school_name})")
#         print(f"Latitude: {lat}, Longitude: {lng}")
        
#         # Get the 5 closest places
#         nearby_places = check_nearby_places(lat, lng, api_key)
        
#         # Add each place to the results, along with the original data
#         for place in nearby_places:
#             results.append({
#                 'Arena': arena_name,
#                 'School': school_name,
#                 'Latitude': lat,
#                 'Longitude': lng,
#                 'Google Places Name': place[0],
#                 'Vicinity': place[1]
#             })
            
#     return pd.DataFrame(results)

# # Load your API key
# api_key = ''

# # Assuming arena_df is your DataFrame
# verified_df = verify_coordinates(arena_df, api_key)

# # Output the results
# print(verified_df)


In [5]:
# verified_df is built by the arena verification cells above, which are currently
# commented out. Fail with an actionable message instead of a bare NameError.
if 'verified_df' not in globals():
    raise NameError(
        "verified_df is not defined: un-comment and run the arena verification "
        "cells before previewing the results"
    )
verified_df.head(10)

## OUTPUT TO TEMP FOLDER FOR MANUAL REVIEW
# output_file = os.path.join('..','TEMP', 'arena_school_info_place_checkV3.csv')
# verified_df.to_csv(output_file, index=False)

