In [1]:
import boto3
import botocore
import pandas as pd
#declare bucket as a constant to use with AWS
BUCKET = "mindex-data-analytics-code-challenge"

#list of files to grab
files = ['bengals.csv', 'boyd_receiving.csv', 'chase_receiving.csv', 'higgins_receiving.csv']

#Ideally you'd want to use a DAO to handle connections
#For the sake of simplicity, I'll handle this within some util functions

#define the s3 resource object
s3 = boto3.resource('s3')

#Download util so I can take a look at the csv's locally
def download_csv(files):
    for file in files:
        #Keep the files name the same as the key
        try:
            s3.Bucket(BUCKET).download_file(file,file)
        except botocore.exceptions.ClientError as e:
            print('error downloading ' + file)
            print(e)

#Util for placing csv directly into a dataframe from s3
def get_dataframe(file):
    #There are some optional libraries that would let you read in a DF like:
    #pd.read_csv('s3://'+BUCKET+'/'+filename)
    #Since I don't see the dependency listed, I grab the CSV using the boto3 resource
    try:
        csv = s3.Object(BUCKET,file)
    except botocore.exceptions.ClientError as e:
        #An example of a better way to handle logging an exception could be sending A SNS notification
            print('error grabbing ' + file)
            print(e)

    return pd.read_csv(csv.get()['Body'])

In [2]:
#Just some code to download the csv's so I can check them out on my machine
download_csv(files)

In [41]:
#load each csv into a dataframe
bengals = get_dataframe(files[0])
boyd = get_dataframe(files[1])
chase = get_dataframe(files[2])
higgins = get_dataframe(files[3])

In [42]:
#rename the columns to avoid issues when joining
boyd = boyd.rename(columns={col:col+'_Boyd' for col in boyd.columns if col != 'Week'})
chase = chase.rename(columns={col:col+'_Chase' for col in chase.columns if col != 'Week'})
higgins = higgins.rename(columns={col:col+'_Higgins' for col in higgins.columns if col != 'Week'})

#display DF's
display(boyd)
display(chase)
display(higgins)

Unnamed: 0,Week,Yards_Boyd,TD_Boyd
0,REG1,32,0
1,REG2,73,0
2,REG3,36,1
3,REG4,118,0
4,REG5,24,0
5,REG6,7,0
6,REG7,39,0
7,REG8,69,1
8,REG9,11,0
9,REG11,49,0


Unnamed: 0,Week,Yards_Chase,TD_Chase
0,REG1,101,1
1,REG2,54,1
2,REG3,65,2
3,REG4,77,0
4,REG5,159,1
5,REG6,97,0
6,REG7,201,1
7,REG8,32,1
8,REG9,49,0
9,REG11,32,1


Unnamed: 0,Week,Yards_Higgins,TD_Higgins
0,REG1,58,1
1,REG2,60,1
2,REG5,32,0
3,REG6,44,0
4,REG7,62,0
5,REG8,97,0
6,REG9,78,0
7,REG11,15,0
8,REG12,114,1
9,REG13,138,1


In [43]:
#group the recievers
receivers = [boyd,chase,higgins]
#set the index as week for all the DF
receivers = [df.set_index('Week') for df in receivers]
#join them together with the first df
merged_wr = receivers[0].join(receivers[1:])

#Sort columns so TD's and Yard's are grouped together
merged_wr.sort_index(axis=1, inplace=True)

display(merged_wr)

Unnamed: 0_level_0,TD_Boyd,TD_Chase,TD_Higgins,Yards_Boyd,Yards_Chase,Yards_Higgins
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
REG1,0.0,1,1.0,32.0,101,58.0
REG2,0.0,1,1.0,73.0,54,60.0
REG3,1.0,2,,36.0,65,
REG4,0.0,0,,118.0,77,
REG5,0.0,1,0.0,24.0,159,32.0
REG6,0.0,0,0.0,7.0,97,44.0
REG7,0.0,1,0.0,39.0,201,62.0
REG8,1.0,1,0.0,69.0,32,97.0
REG9,0.0,0,0.0,11.0,49,78.0
REG11,0.0,1,0.0,49.0,32,15.0


In [44]:
#Now lets deal with the bengals csv
#set the index for the df to Week
bengals = bengals.set_index('Week')

final_df = bengals.join(merged_wr, how='outer')

final_df.sort_index(axis=0, inplace=True)

display(final_df)

Unnamed: 0_level_0,Opponent,Location,Result,TD_Boyd,TD_Chase,TD_Higgins,Yards_Boyd,Yards_Chase,Yards_Higgins
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
POST1,LV,Home,1.0,1.0,0.0,0.0,26.0,116.0,10.0
POST2,TEN,Away,1.0,0.0,0.0,0.0,17.0,109.0,96.0
POST3,KC,Away,1.0,0.0,1.0,0.0,19.0,54.0,103.0
POST4,LAR,Neutral,0.0,0.0,0.0,2.0,48.0,89.0,100.0
PRE1,TB,Away,1.0,,,,,,
PRE2,WSH,Away,0.0,,,,,,
PRE3,MIA,Home,0.0,,,,,,
REG1,MIN,Home,1.0,0.0,1.0,1.0,32.0,101.0,58.0
REG10,,,,,,,,,
REG11,LV,Away,1.0,0.0,1.0,0.0,49.0,32.0,15.0


In [48]:
#Map new values to the result column
final_df['Result'] = final_df['Result'].map({1:'Win', 0:'Lose'})

display(final_df)

Unnamed: 0_level_0,Opponent,Location,Result,TD_Boyd,TD_Chase,TD_Higgins,Yards_Boyd,Yards_Chase,Yards_Higgins
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
POST1,LV,Home,Win,1.0,0.0,0.0,26.0,116.0,10.0
POST2,TEN,Away,Win,0.0,0.0,0.0,17.0,109.0,96.0
POST3,KC,Away,Win,0.0,1.0,0.0,19.0,54.0,103.0
POST4,LAR,Neutral,Lose,0.0,0.0,2.0,48.0,89.0,100.0
PRE1,TB,Away,Win,,,,,,
PRE2,WSH,Away,Lose,,,,,,
PRE3,MIA,Home,Lose,,,,,,
REG1,MIN,Home,Win,0.0,1.0,1.0,32.0,101.0,58.0
REG10,,,,,,,,,
REG11,LV,Away,Win,0.0,1.0,0.0,49.0,32.0,15.0
