In [1]:
## Dependencies

import pandas as pd
from keplergl import KeplerGl
import os



## Table with overall stats for all years
stats = '../data/year_by_year_summary/overall/all_years_summary.csv'

## Team / School Info from the MHSAA Website
info = '../data/MHSAA_School_Data_with_Fields.csv'


## Load data into dataframes
stats_df = pd.read_csv(stats)
info_df = pd.read_csv(info)

## See the data

print(f'Overall Stats Table Shape: {stats_df.shape}')
print(f'Column Names: {stats_df.columns}')
      

print(f'MHSAA School Lookup Table Shape: {info_df.shape}')
print(f'Column Names: {info_df.columns}')



Overall Stats Table Shape: (715, 41)
Column Names: Index(['teamName', 'teamId', 'team_record_count', 'playoff_games_played',
       'playoff_W', 'playoff_L', 'playoff_runs_scored_total',
       'playoff_runs_allowed_total', 'regular_season_games_played',
       'regular_season_W', 'regular_season_L',
       'regular_season_runs_scored_total', 'regular_season_runs_allowed_total',
       'home_games_played', 'home_W', 'home_L', 'home_runs_scored_total',
       'home_runs_allowed_total', 'away_games_played', 'away_W', 'away_L',
       'away_runs_scored_total', 'away_runs_allowed_total', 'playoff_win_pct',
       'regular_season_win_pct', 'home_win_pct', 'away_win_pct',
       'playoff_run_diff_total', 'regular_season_run_diff_total',
       'home_run_diff_total', 'away_run_diff_total', 'playoff_run_diff_mean',
       'regular_season_run_diff_mean', 'home_run_diff_mean',
       'away_run_diff_mean', 'playoff_run_diff_per_game',
       'regular_season_run_diff_per_game', 'home_run_diff_per_

In [2]:
### See how many matches I can get between the two tables

# Columns to match on (teamId, SchoolId)
# Pull data from the info_df table to stats_df table

# Merge the two tables
merged_df = pd.merge(stats_df, info_df, how='left', left_on='teamName', right_on='PopularName')

# See how many matches I got
print(f'Number of matches: {merged_df.shape[0]}')

## See some infor about the merged table
print(f'Merged Table Shape: {merged_df.shape}')
print(f'Column Names: {merged_df.columns}')

# See the data
merged_df.head()


Number of matches: 715
Merged Table Shape: (715, 81)
Column Names: Index(['teamName', 'teamId', 'team_record_count', 'playoff_games_played',
       'playoff_W', 'playoff_L', 'playoff_runs_scored_total',
       'playoff_runs_allowed_total', 'regular_season_games_played',
       'regular_season_W', 'regular_season_L',
       'regular_season_runs_scored_total', 'regular_season_runs_allowed_total',
       'home_games_played', 'home_W', 'home_L', 'home_runs_scored_total',
       'home_runs_allowed_total', 'away_games_played', 'away_W', 'away_L',
       'away_runs_scored_total', 'away_runs_allowed_total', 'playoff_win_pct',
       'regular_season_win_pct', 'home_win_pct', 'away_win_pct',
       'playoff_run_diff_total', 'regular_season_run_diff_total',
       'home_run_diff_total', 'away_run_diff_total', 'playoff_run_diff_mean',
       'regular_season_run_diff_mean', 'home_run_diff_mean',
       'away_run_diff_mean', 'playoff_run_diff_per_game',
       'regular_season_run_diff_per_game', 'ho

Unnamed: 0,teamName,teamId,team_record_count,playoff_games_played,playoff_W,playoff_L,playoff_runs_scored_total,playoff_runs_allowed_total,regular_season_games_played,regular_season_W,...,OldSchoolId,SchoolNameWithId,Lat,Lng,Closest_Field_1,Closest_Field_2,Closest_Field_3,Distance_1,Distance_2,Distance_3
0,Ada Forest Hills Eastern,1976708,6,20,15.0,5.0,117.0,40.0,82.0,64.0,...,4506.0,Ada Forest Hills Eastern (4506),43.004934,-85.525979,Forrest Hills Eastern High School,Grand Rapids Forest Hills Northern HS,Grand Rapids Catholic Central HS,0.14829,2.769375,3.951781
1,Addison,95871,10,14,4.0,10.0,50.0,84.0,164.0,61.0,...,3037.0,Addison (3037),41.988315,-84.343935,Addison HS,Brooklyn Columbia Central HS - high_school,Onsted HS,0.249674,7.583523,7.860025
2,Adrian,95800,10,21,11.0,10.0,91.0,99.0,145.0,78.0,...,2294.0,Adrian (2294),41.909697,-84.045983,Adrian HS,Adrian Lenawee Christian HS,Ann Arbor Greenhills HS? - Practice?,0.225348,1.942878,3.371869
3,Adrian Lenawee Christian,96297,10,21,11.0,10.0,151.0,99.0,60.0,16.0,...,8698.0,Adrian Lenawee Christian (8698),41.908124,-84.081317,Adrian Lenawee Christian HS,Adrian HS,Ann Arbor Greenhills HS? - Practice?,0.132633,2.040508,4.48032
4,Adrian Madison,95746,10,13,3.0,10.0,45.0,88.0,100.0,46.0,...,1692.0,Adrian Madison (1692),41.867936,-84.018374,Ann Arbor Greenhills HS? - Practice?,Adrian HS,Adrian Lenawee Christian HS,0.243418,3.090306,4.389221


In [3]:
### Important columns to keep for the map plot

# School Name
# School City
# School Lat
# School Lng
# playoff_W
# regular_season_win_pct
# playoff_win_pct
# playoff_run_diff_mean
# regular_season_run_diff_mean
# Colors
# PrimaryColorCode
# LeaugeName
# LeaugeId
# PopularName
# NickName

# Create a new dataframe with only the columns I want
map_df = merged_df[['teamName', 'City', 'Lat', 'Lng', 'playoff_W', 
                    'regular_season_win_pct', 'playoff_win_pct', 
                    'playoff_run_diff_mean', 'regular_season_run_diff_mean', 
                    'Colors', 'PrimaryColorCode', 
                    'LeaugeName', 'LeaugeId', 'PopularName', 'NickName']]

## Change all column names to lower case
map_df.columns = map_df.columns.str.lower()

# rename lng to lon to work easier with the library
map_df.rename(columns={'lng': 'lon'}, inplace=True)

# See the data
map_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df.rename(columns={'lng': 'lon'}, inplace=True)


Unnamed: 0,teamname,city,lat,lon,playoff_w,regular_season_win_pct,playoff_win_pct,playoff_run_diff_mean,regular_season_run_diff_mean,colors,primarycolorcode,leaugename,leaugeid,popularname,nickname
0,Ada Forest Hills Eastern,Ada,43.004934,-85.525979,15.0,0.780488,0.75,3.85,4.682927,Crimson & Black,#003366,OK Conference,6963.0,Ada Forest Hills Eastern,Hawks
1,Addison,Addison,41.988315,-84.343935,4.0,0.371951,0.285714,-2.428571,-1.72561,Black & Red,#003366,Cascades Conference,6900.0,Addison,Panthers
2,Adrian,Adrian,41.909697,-84.045983,11.0,0.537931,0.52381,-0.380952,0.668966,Royal Blue & White,#003366,Southeastern Conference,6979.0,Adrian,Maples
3,Adrian Lenawee Christian,Adrian,41.908124,-84.081317,11.0,0.266667,0.52381,2.47619,-4.983333,Royal Blue/White/Scarlet,#003366,Southern Central Athletic Association,6980.0,Adrian Lenawee Christian,Cougars
4,Adrian Madison,Adrian,41.867936,-84.018374,3.0,0.46,0.230769,-3.307692,0.79,Blue & Gold,#003366,,0.0,Adrian Madison,Trojans


In [4]:
## Rename the map_df to df to make it easier to work with
df = map_df



In [5]:
# Create a new map
map_1 = KeplerGl(height=500)

# Add data to map
map_1.add_data(data=df, name='baseball_teams')


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [6]:
# Normalize 'playoff_W' to range [0, 10]
df['norm_playoff_W'] = df['playoff_w'] / df['playoff_w'].max() * 10

config = {
    'version': 'v1',
    'config': {
        'mapState': {
            'latitude': df['lat'].mean(),
            'longitude': df['lon'].mean(),
            'zoom': 6
        },
        'layers': [{
            'type': 'column',
            'dataId': 'baseball_teams',
            'columns': {
                'lat': 'lat',
                'lng': 'lon'
            },
            'is_3d': True,
            'visualChannels': {
                'heightField': 'norm_playoff_w',
                'heightScale': 'log'
            }
        }]
    }
}

map_1 = KeplerGl(height=700, widthe=700, config=config)
map_1.add_data(data=df, name='baseball_teams')
# map_1



User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['norm_playoff_W'] = df['playoff_w'] / df['playoff_w'].max() * 10


In [8]:
map_1

KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': 'i1d6iqb', 'type': …