In [1]:
import requests
import json
import pandas as pd
import os
import datetime as dt 
from pprint import pprint
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func
import re
import warnings

warnings.filterwarnings('ignore')
regex = re.compile('[^a-zA-Z]')
#First parameter is the replacement, second parameter is your input string
regex.sub('', 'ab3d*E')


#was getting a psycopg2 error when dealing with numpy.int64 datatype
#these two functions fix the error. downloaded from stack overflow
import numpy
from psycopg2.extensions import register_adapter, AsIs
def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)
def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)
register_adapter(numpy.float64, addapt_numpy_float64)
register_adapter(numpy.int64, addapt_numpy_int64)
base = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&units=standard&startdate='
#Loading secrets
%load_ext dotenv
%dotenv secrets.env
word = os.getenv('SQL_PASS')

In [2]:
stations_df = pd.read_csv('data_from_web/weather_stations.csv')
stations_df.drop(columns=['NCDCID', 'WBAN', 'COUNTY', 'ELEV', 'UTC', 'STNTYPE'], inplace=True) #dont need these
stations_df.dropna(how='any', inplace=True) #dropping alaskan station that has no GHCND 
stations_df.columns = [column.lower() for column in stations_df.columns]
stations_df.reset_index(inplace=True)
stations_df.rename(columns={'index':'station_id'}, inplace=True)
stations_df.head()

Unnamed: 0,station_id,name,country,st,lat,lon,ghcnd
0,0,EGBERT1W,CANADA,ON,44.2326,-79.781,CAW00064757
1,2,BETHEL87WNW,UNITEDSTATES,AK,61.3465,-164.0769,USW00026656
2,3,CORDOVA14ESE,UNITEDSTATES,AK,60.4731,-145.3542,USW00096405
3,4,DEADHORSE3S,UNITEDSTATES,AK,70.1618,-148.4644,USW00026565
4,5,DENALI27N,UNITEDSTATES,AK,63.452,-150.8747,USW00096408


In [3]:
stadium_df = pd.read_csv('data_from_web/another_stadiums.csv', delimiter=';')
stadium_df = stadium_df[['TEAM', 'NAME', 'Geo Point', 'ROOF_TYPE']]
#Adding missing data
stadium_df = stadium_df.append(pd.DataFrame({'TEAM': 'LOSANGELESCHARGERS/LOSANGELESRAMS', 'NAME':'Los Angeles Memorial Coliseum', 'Geo Point': '34.051,-118.1716', 'ROOF_TYPE':'Open'}, index=[31]))
#Comparing differences in latitudes and longitudes between stadium and every weather station to find closest
for index1,row1 in stadium_df.iterrows():
    #Resetting lat long comparison values for every row
    latDif = 10000
    longDif = 10000
    latitude, longitude = row1['Geo Point'].split(',')
    latitude = float(latitude)
    longitude= float(longitude)
    for index2, row2 in stations_df.iterrows():
        #Don't care if its negative or positive, just want to know the largest difference
        if abs(latitude-(row2['lat'])) < latDif and abs(longitude-row2['lon'])<longDif:
            latDif = abs(latitude-row2['lat'])
            longDif = abs(longitude-row2['lon'])
            stadium_df.loc[index1 , 'station'] = row2['ghcnd']
#Normalizing
team = []
for index, row in stadium_df.iterrows():
    team.append(row['TEAM'].upper().replace(' ',''))
stadium_df['TEAM'] = team
#Chicago Bears was misspelled in original data
stadium_df.loc[1, 'TEAM'] = 'CHICAGOBEARS'
stadium_df.rename(columns={'Geo Point': 'geo_point'}, inplace=True)
stadium_df.rename(columns={'TEAM':'team'}, inplace=True)
stadium_df.set_index('team', inplace=True)
stadium_df.head()

Unnamed: 0_level_0,NAME,geo_point,ROOF_TYPE,station
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DALLASCOWBOYS,Cowboys Stadium,"32.746930527,-97.0923739136",Retractable,USW00053961
CHICAGOBEARS,Soldier Field,"41.8625000675,-87.6167699762",Open,USW00054811
NEWENGLANDPATRIOTS,Gillette Stadium,"42.0918799131,-71.2649100654",Open,USW00054796
CAROLINAPANTHERS,Bank of America Stadium,"35.2258400005,-80.8533099799",Open,USW00092821
TAMPABAYBUCCANEERS,Raymond James Stadium,"27.978840052,-82.5034900566",Open,USW00092827


In [4]:
games_df = pd.read_csv('data_from_web/nfl_2018_games.csv')
games_df['Winner'] = games_df['Winner/tie']
games_df['Loser'] = games_df['Loser/tie']
#Determining who is home team for stadium lookup
hometeam = []
awayteam = []
for index,row in games_df.iterrows():
    if row[5]=='@':
        hometeam.append(row['Loser/tie'])
        awayteam.append(row['Winner/tie'])
    else:
        hometeam.append(row['Winner/tie'])
        awayteam.append(row['Loser/tie'])
games_df['Home Team'] = hometeam
games_df['Away Team'] = awayteam

#Dropping unneccessary columns and formatting
games_df.drop(columns=['Day', 'Unnamed: 5', 'Unnamed: 7', 'Winner/tie', 'Loser/tie', 'TOW', 'TOL'], inplace = True)
games_df.reset_index(inplace=True)
games_df.rename(columns={'index':'game_id'}, inplace=True)
games_df.set_index('game_id', inplace=True)

#Creating datetime objects to use for API request
date_obj = []
for index,row in games_df.iterrows():
    date_obj.append(dt.datetime.strptime(row['Date']+' 2018', '%B %d %Y'))
games_df['Datetime Obj'] = date_obj
games_df['year'] = [2018 for x in games_df['Date']]
#Normalizing team names and looking up stadium and weather station
stadium = []
station = []
for index,row in games_df.iterrows():
    team=''
    games_df.loc[index, 'Home Team'] = row['Home Team'].upper().replace(' ', '')
    games_df.loc[index, 'Winner'] = row['Winner'].upper().replace(' ', '')
    games_df.loc[index, 'Loser'] = row['Loser'].upper().replace(' ', '')
    games_df.loc[index, 'Away Team'] = row['Away Team'].upper().replace(' ', '')
    team = row['Home Team'].upper().replace(' ', '')
    #Due to SQL schema, teams that play at same stadium are lumped together in stadium database and dataframe
    if team == 'NEWYORKGIANTS' or team == 'NEWYORKJETS':
        team = 'NEWYORKGIANTS/NEWYORKJETS'
        stadium.append(stadium_df['NAME'].loc[team])
        station.append(stadium_df['station'].loc[team])
    elif team == 'LOSANGELESCHARGERS' or team == 'LOSANGELESRAMS':
        team = 'LOSANGELESCHARGERS/LOSANGELESRAMS'
        stadium.append(stadium_df['NAME'].loc[team])
        station.append(stadium_df['station'].loc[team])
    else:
        stadium.append(stadium_df['NAME'].loc[team])
        station.append(stadium_df['station'].loc[team])
games_df['Stadium'] = stadium
games_df['ghcnd'] = station
games_df['total_pts'] = games_df['PtsW'] + games_df['PtsL']
games_df['total_yds'] = games_df['YdsW'] + games_df['YdsL']
games_df.drop(columns=['Date', 'Time'], inplace=True)
#Normalizing column names
games_df.rename(columns={'Home Team': 'home_team',
                         'Away Team':'away_team',
                         'Datetime Obj': 'date'}, inplace=True)
games_df.head()


Unnamed: 0_level_0,Week,PtsW,PtsL,YdsW,YdsL,Winner,Loser,home_team,away_team,date,year,Stadium,ghcnd,total_pts,total_yds
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,1,18,12,232,299,PHILADELPHIAEAGLES,ATLANTAFALCONS,PHILADELPHIAEAGLES,ATLANTAFALCONS,2018-09-06,2018,Lincoln Financial Field,USW00003761,30,531
1,1,48,40,529,475,TAMPABAYBUCCANEERS,NEWORLEANSSAINTS,NEWORLEANSSAINTS,TAMPABAYBUCCANEERS,2018-09-09,2018,Louisiana Superdome,USW00053960,88,1004
2,1,24,16,343,327,MINNESOTAVIKINGS,SANFRANCISCO49ERS,MINNESOTAVIKINGS,SANFRANCISCO49ERS,2018-09-09,2018,Hubert H. Humphrey Metrodome,USW00054854,40,670
3,1,27,20,342,336,MIAMIDOLPHINS,TENNESSEETITANS,MIAMIDOLPHINS,TENNESSEETITANS,2018-09-09,2018,Sun Life Stadium,USW00092821,47,678
4,1,20,15,305,324,JACKSONVILLEJAGUARS,NEWYORKGIANTS,NEWYORKGIANTS,JACKSONVILLEJAGUARS,2018-09-09,2018,Meadowlands Stadium,USW00064756,35,629


In [5]:
# Requesting all station ids for a specific date, much quicker than original logic
x=1
for date in games_df['date'].unique():
    date_loc = games_df.loc[games_df['date'] == date]
    print(f'{date}\t{x}')
    x+=1
    stations = [ghcnd for ghcnd in date_loc['ghcnd']]
    station_string = ''
    for station in stations:
        station_string = station_string + f'&stationid=GHCND:{station}'
    head = {"token": os.getenv('NOAA_TOK')} #Based on API documentation, must include token in the header of request
    response = requests.get(f'{base}{date}&enddate={date}{station_string}', headers=head).json()
    for result in response['results']:
        return_station = result['station'].split(':')[1]
        if result['datatype']=='PRCP':
            games_df.loc[games_df['ghcnd'] == return_station, 'Rain'] = result['value']
        elif result['datatype']=='TMAX':
            games_df.loc[games_df['ghcnd'] == return_station, 'Temp'] = result['value']
games_df.head()

2018-09-06T00:00:00.000000000	1
2018-09-09T00:00:00.000000000	2
2018-09-10T00:00:00.000000000	3
2018-09-13T00:00:00.000000000	4
2018-09-16T00:00:00.000000000	5
2018-09-17T00:00:00.000000000	6
2018-09-20T00:00:00.000000000	7
2018-09-23T00:00:00.000000000	8
2018-09-24T00:00:00.000000000	9
2018-09-27T00:00:00.000000000	10
2018-09-30T00:00:00.000000000	11
2018-10-01T00:00:00.000000000	12
2018-10-04T00:00:00.000000000	13
2018-10-07T00:00:00.000000000	14
2018-10-08T00:00:00.000000000	15
2018-10-11T00:00:00.000000000	16
2018-10-14T00:00:00.000000000	17
2018-10-15T00:00:00.000000000	18
2018-10-18T00:00:00.000000000	19
2018-10-21T00:00:00.000000000	20
2018-10-22T00:00:00.000000000	21
2018-10-25T00:00:00.000000000	22
2018-10-28T00:00:00.000000000	23
2018-10-29T00:00:00.000000000	24
2018-11-01T00:00:00.000000000	25
2018-11-04T00:00:00.000000000	26
2018-11-05T00:00:00.000000000	27
2018-11-08T00:00:00.000000000	28
2018-11-11T00:00:00.000000000	29
2018-11-12T00:00:00.000000000	30
2018-11-15T00:00:00

Unnamed: 0_level_0,Week,PtsW,PtsL,YdsW,YdsL,Winner,Loser,home_team,away_team,date,year,Stadium,ghcnd,total_pts,total_yds,Rain,Temp
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,18,12,232,299,PHILADELPHIAEAGLES,ATLANTAFALCONS,PHILADELPHIAEAGLES,ATLANTAFALCONS,2018-09-06,2018,Lincoln Financial Field,USW00003761,30,531,0.0,44.0
1,1,48,40,529,475,TAMPABAYBUCCANEERS,NEWORLEANSSAINTS,NEWORLEANSSAINTS,TAMPABAYBUCCANEERS,2018-09-09,2018,Louisiana Superdome,USW00053960,88,1004,0.07,71.0
2,1,24,16,343,327,MINNESOTAVIKINGS,SANFRANCISCO49ERS,MINNESOTAVIKINGS,SANFRANCISCO49ERS,2018-09-09,2018,Hubert H. Humphrey Metrodome,USW00054854,40,670,0.06,29.0
3,1,27,20,342,336,MIAMIDOLPHINS,TENNESSEETITANS,MIAMIDOLPHINS,TENNESSEETITANS,2018-09-09,2018,Sun Life Stadium,USW00092821,47,678,0.0,64.0
4,1,20,15,305,324,JACKSONVILLEJAGUARS,NEWYORKGIANTS,NEWYORKGIANTS,JACKSONVILLEJAGUARS,2018-09-09,2018,Meadowlands Stadium,USW00064756,35,629,0.0,40.0


#Sending an API request for every game in game dataframe, takes some time

 -- THIS WAS MY ORIGINAL FORMAT FOR API REQUEST -- STABLE -- 


for index, row in games_df.iterrows():
    base = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&units=standard&startdate='
    #Formatting date to appropriate form
    startDate = row['date'].strftime('%Y-%m-%d')
    endDate = startDate #must include enddate in API url. Same enddate as start returns one day from API
    station=row['ghcnd']
    url = f'{base}{startDate}&enddate={endDate}&stationid=GHCND:{station}'
    head = {"token": os.getenv('NOAA_TOK')} #Based on API documentation, must include token in the header of request
    json = requests.get(url, headers=head).json()
    for result in json['results']:
        if result['datatype']=='PRCP':
            games_df.loc[index, 'Rain'] = result['value']
        elif result['datatype']=='TMAX':
            games_df.loc[index, 'Temp'] = result['value']
games_df.dropna(how='any', inplace=True) #There seems to be only one value returning NaN from the API, So i am dropping this record.
games_df.head()

In [6]:
#Initializing and Normalizing teams dataframe
teams_df = pd.read_csv('data_from_web/nfl_teams.csv')
teams_df = teams_df[['team_name', 'team_conference', 'team_division']]
for index, row in teams_df.iterrows():
    teams_df.loc[index, 'team_name'] = row['team_name'].upper().replace(' ','')
teams_df.columns = [column.lower() for column in teams_df.columns]
teams_df.reset_index(inplace=True)
teams_df.rename(columns={'index':'team_id'}, inplace=True)
teams_df.head()

Unnamed: 0,team_id,team_name,team_conference,team_division
0,0,ARIZONACARDINALS,NFC,NFC West
1,1,PHOENIXCARDINALS,NFC,
2,2,ST.LOUISCARDINALS,NFC,
3,3,ATLANTAFALCONS,NFC,NFC South
4,4,BALTIMORERAVENS,AFC,AFC North


In [7]:
#Normalizing all column names to be lower case before loading into SQL



games_df.columns = [column.lower() for column in games_df.columns]

stadium_df.columns = [column.lower() for column in stadium_df.columns]
stadium_df.reset_index(inplace=True)
stadium_df.reset_index(inplace=True)
stadium_df.rename(columns={'index':'stadium_id', 'team':'team_id', 'station':'station_id'}, inplace=True)


stadium_df.head()

Unnamed: 0,stadium_id,team_id,name,geo_point,roof_type,station_id
0,0,DALLASCOWBOYS,Cowboys Stadium,"32.746930527,-97.0923739136",Retractable,USW00053961
1,1,CHICAGOBEARS,Soldier Field,"41.8625000675,-87.6167699762",Open,USW00054811
2,2,NEWENGLANDPATRIOTS,Gillette Stadium,"42.0918799131,-71.2649100654",Open,USW00054796
3,3,CAROLINAPANTHERS,Bank of America Stadium,"35.2258400005,-80.8533099799",Open,USW00092821
4,4,TAMPABAYBUCCANEERS,Raymond James Stadium,"27.978840052,-82.5034900566",Open,USW00092827


In [8]:
games_df.reset_index(inplace=True)

games_df.rename(columns={'winner':'winner_id', 'loser':'loser_id', 'home_team':'home_team_id', 'away_team':'away_team_id', 'stadium':'stadium_id', 'ghcnd':'station_id'}, inplace=True)
x = 1
y = 1
#Creating ids across multiple tables
for indexg, rowg in games_df.iterrows():
    for indext, rowt in teams_df.iterrows():
        if rowg['winner_id'] == rowt['team_name']:
            games_df.loc[indexg, 'winner_id'] = teams_df.loc[indext, 'team_id']
        if rowg['loser_id'] == rowt['team_name']:
            games_df.loc[indexg, 'loser_id'] = teams_df.loc[indext, 'team_id']
        if rowg['home_team_id'] == rowt['team_name']:
            games_df.loc[indexg, 'home_team_id'] = teams_df.loc[indext, 'team_id']
        if rowg['away_team_id'] == rowt['team_name']:
            games_df.loc[indexg, 'away_team_id'] = teams_df.loc[indext, 'team_id']
    for indexs, rows in stadium_df.iterrows():
        if rowg['stadium_id'] == rows['name']:
            games_df.loc[indexg, 'stadium_id'] = stadium_df.loc[indexs, 'stadium_id']
    for indexst, rowst in stations_df.iterrows():
        if rowg['station_id'] == rowst['ghcnd']:
            games_df.loc[indexg, 'station_id'] = stations_df.loc[indexst, 'station_id']
    print(x)
    x+=1
for indexs, rows in stadium_df.iterrows():
    for indext, rowt in teams_df.iterrows():
        if rows['team_id'] == rowt['team_name']:
            stadium_df.loc[indexs, 'team_id'] = teams_df.loc[indext, 'team_id']
        elif rows['team_id'] == 'NEWYORKGIANTS/NEWYORKJETS':
            stadium_df.loc[indexs, 'team_id'] = 29
        elif rows['team_id']=='LOSANGELESCHARGERS/LOSANGELESRAMS':
            stadium_df.loc[indexs, 'team_id'] = 21
    for indexst, rowst in stations_df.iterrows():
        if rows['station_id'] == rowst['ghcnd']:
            stadium_df.loc[indexs, 'station_id'] = stations_df.loc[indexst, 'station_id']
    print(y)
    y+=1
games_df.head()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


Unnamed: 0,game_id,week,ptsw,ptsl,ydsw,ydsl,winner_id,loser_id,home_team_id,away_team_id,date,year,stadium_id,station_id,total_pts,total_yds,rain,temp
0,0,1,18,12,232,299,33,3,33,3,2018-09-06,2018,8,111,30,531,0.0,44.0
1,1,1,48,40,529,475,37,27,27,37,2018-09-09,2018,25,64,88,1004,0.07,71.0
2,2,1,24,16,343,327,24,36,24,36,2018-09-09,2018,13,69,40,670,0.06,29.0
3,3,1,27,20,342,336,23,40,23,40,2018-09-09,2018,14,47,47,678,0.0,64.0
4,4,1,20,15,305,324,17,28,28,17,2018-09-09,2018,11,101,35,629,0.0,40.0


In [9]:
engine = create_engine(f'postgresql://postgres:{word}@localhost:5432/nfl_2018_db')

try:
    stations_df.to_sql('Stations', engine, if_exists='fail',index=False)
    teams_df.to_sql('Teams', engine, if_exists='fail',index=False)
    stadium_df.to_sql('Stadiums', engine, if_exists='fail',index=False)
    games_df.to_sql('Games', engine, if_exists='fail',index=False)
    engine.execute('ALTER TABLE "Games" ADD CONSTRAINT "pk_Games" PRIMARY KEY("game_id");')
    engine.execute('ALTER TABLE "Teams" ADD CONSTRAINT "pk_Teams" PRIMARY KEY("team_id");')
    engine.execute('ALTER TABLE "Stadiums" ADD CONSTRAINT "pk_Stadiums" PRIMARY KEY ("stadium_id");')
    engine.execute('ALTER TABLE "Stations" ADD CONSTRAINT "pk_Stations" PRIMARY KEY ("station_id");')
except ValueError:
    print('Tables Exist')



Tables Exist


In [10]:

engine = create_engine(f'postgresql://postgres:{word}@localhost:5432/nfl_2018_db')
Base = automap_base()
Base.prepare(engine, reflect=True)
session = Session(engine)
Teams = Base.classes.Teams
Stadiums = Base.classes.Stadiums
Stations = Base.classes.Stations
Games = Base.classes.Games
Base.classes.keys()


['Games', 'Seasons_Team_Stats', 'Teams', 'Stadiums', 'Stations']

In [11]:
master_league = pd.DataFrame(columns=[['Tm', 'W', 'L', 'T', 'PF', 'PA', 'PD', 'team_id', 'year']]) 
first_time = True
for year in range(2010,2019,1):
    tables = pd.read_html(f'https://www.pro-football-reference.com/years/{year}/')
    afc = tables[0]
    nfc = tables[1]
    try:
        afc = afc[['Tm', 'W', 'L', 'T', 'PF', 'PA', 'PD']]
        nfc = nfc[['Tm', 'W', 'L', 'T', 'PF', 'PA', 'PD']]
    except KeyError:
        try:
            afc = afc[['Tm', 'W', 'L', 'PF', 'PA', 'PD']]
            nfc = nfc[['Tm', 'W', 'L', 'PF', 'PA', 'PD']]
        except KeyError:
            print('nope')
    
    for index, row in afc.iterrows():
        afc.loc[index, 'Tm'] = regex.sub('', row['Tm'])
        if 'AFC' in row['Tm']:
            afc.drop(index, inplace=True)
        if 'St. Lo' in row['Tm']:
            nfc.loc[index,'Tm'] = 'St.LouisRams'
        if 'SanFran' in row['Tm']:
            afc.loc[index, 'Tm'] = 'SanFrancisco49ers'
    for index, row in afc.iterrows():
        x = row['Tm'].upper().replace(' ','')
        query_string = f'{x}'
        query = session.query(Teams.team_id).filter(Teams.team_name.match(query_string))
        compreh = [result[0] for result in query]
        for result in query:
            afc.loc[index,'index'] = int(compreh[0])

        
    for index, row in nfc.iterrows():
        nfc.loc[index, 'Tm'] = regex.sub('', row['Tm'])
        if 'NFC' in row['Tm']:
            nfc.drop(index, inplace=True)
        if 'St. Lo' in row['Tm']:
            nfc.loc[index,'Tm'] = 'St.LouisRams'
        if 'San Fran' in row['Tm']:
            nfc.loc[index, 'Tm'] = 'SanFrancisco49ers'
    for index, row in nfc.iterrows():
        x = row['Tm']
        query_string = f'{x}'
        query = session.query(Teams.team_id).filter(Teams.team_name.match(query_string))
        compreh = [result[0] for result in query]
        for result in query:
            nfc.loc[index,'index'] = int(compreh[0])
    afc.reset_index(inplace=True)
    afc.rename(columns={'index': 'team_id'}, inplace=True)
    # nfc.reset_index(inplace=True)
    nfc.rename(columns={'index': 'team_id'},inplace=True)

    # Need to add functionality to append teams to SQL db
    nfc.dropna(how='any', inplace=True)
    afc.dropna(how='any', inplace=True)
    league = pd.merge(nfc, afc, how='outer')
    try:
        league.drop(columns='level_0', inplace=True)
    except KeyError:
        print('no level_0 this time')
    columns = [regex.sub('', column) for column in league.columns]
    league.columns = columns
    league.rename(columns={'teamid':'team_id'},inplace=True)
    league.fillna(value=0.2)
    league['team_id'] = league['team_id'].astype(int)
    league['year'] = [year for x in league['team_id'] ]
    if first_time:
        master_league = league
        first_time = False
        print(f'First Time!\nYear: {year}')
        count = 1
    else:
        master_league = master_league.append(league)
        print(f'Year: {year}')
    master_league.fillna(0.2, inplace=True)
    if master_league.loc[master_league['team_id'] == 0.2].empty:
#         print(master_league)
            kevin='kevin'


    else:
        print('Team ID is Missing')
        [print(x) for x in master_league['Tm'].loc[[master_league['team_id' == 0.2]]]]


First Time!
Year: 2010
Year: 2011
Year: 2012
Year: 2013
Year: 2014
Year: 2015
Year: 2016
Year: 2017
Year: 2018


In [12]:
try:
    master_league.to_sql('Seasons_Team_Stats',engine, if_exists='fail', index=False)
    engine.execute('ALTER TABLE "Seasons_Team_Stats" ADD CONSTRAINT "pk_Seasons_Team_Stats" PRIMARY KEY ("team_id","year")')
    engine.execute(' ALTER TABLE "Games" ADD CONSTRAINT "fk_Games_winner_id_year" FOREIGN KEY("winner_id", "year") REFERENCES "Seasons_Team_Stats" ("team_id", "year"); ALTER TABLE "Games" ADD CONSTRAINT "fk_Games_loser_id" FOREIGN KEY("loser_id", "year") REFERENCES "Seasons_Team_Stats" ("team_id", "year"); ALTER TABLE "Games" ADD CONSTRAINT "fk_Games_home_team_id" FOREIGN KEY("home_team_id", "year") REFERENCES "Seasons_Team_Stats" ("team_id", "year"); ALTER TABLE "Games" ADD CONSTRAINT "fk_Games_away_team_id" FOREIGN KEY("away_team_id", "year") REFERENCES "Seasons_Team_Stats" ("team_id", "year"); ALTER TABLE "Games" ADD CONSTRAINT "fk_Games_stadium_id" FOREIGN KEY("stadium_id") REFERENCES "Stadiums" ("stadium_id"); ALTER TABLE "Games" ADD CONSTRAINT "fk_Games_station_id" FOREIGN KEY("station_id") REFERENCES "Stations" ("station_id"); ALTER TABLE "Stadiums" ADD CONSTRAINT "fk_Stadiums_team_id" FOREIGN KEY("team_id") REFERENCES "Teams" ("team_id"); ALTER TABLE "Stadiums" ADD CONSTRAINT "fk_Stadiums_station_id" FOREIGN KEY("station_id") REFERENCES "Stations" ("station_id"); ALTER TABLE "Seasons_Team_Stats" ADD CONSTRAINT "fk_Seasons_Team_Stats_team_id" FOREIGN KEY("team_id") REFERENCES "Teams" ("team_id");')
except ValueError:
    print('Tables already there')

Tables already there


In [13]:
#Connecting session
session = Session(engine)

In [14]:
#Proof of concept
query = session.query(func.avg(Games.total_pts), func.avg(Games.total_yds)).filter(Games.temp>30)
for result in query:
    print(f'Temp >30\nAvg Pts:{round(result[0],2)} Avg Yds: {round(result[1], 2)}')

Temp >30
Avg Pts:46.85 Avg Yds: 707.43


In [15]:
query = session.query(func.avg(Games.total_pts), func.avg(Games.total_yds)).filter(Games.temp<30)
for result in query:
    print(f'Temp < 30\nAvg Pts:{round(result[0],2)} Avg Yds: {round(result[1], 2)}')

Temp < 30
Avg Pts:41.62 Avg Yds: 611.88


In [16]:
session.close_all()