## Web Scraping and Writing to Google Sheets
    1. Get the data and parse with requests, lxml and Beatiful Soup 
    2. Data wrange into dictionaries 
    3. Create Pandas DF and clean the data
    4. Write to Google Sheets 

In [1]:
from requests import get 
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup 

In [2]:
#imputing URL and creating header creds for parsing 
url = 'http://www.nfl.com/stats/weeklyleaders?week=15&season=2019&type=REG&showCategory=Rushing'
my_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}


In [3]:
#grabbing the HTML and getting text 
fantasy_page = get(url, headers = my_header)

doc = lh.fromstring(fantasy_page.content)

In [4]:
print(fantasy_page)

<Response [200]>


Response == 200 means the page scraped successfully 

In [5]:
#parsing table elements in the HTML inside the pattern "//tr" --> this is a table element 

table_elements = doc.xpath('//tr')

In [6]:
#[len(T) for T in table_elements[:-1]]

In [7]:
title = doc.xpath('//tr//th')

In [8]:
colnames = []

n = len(title)

for i in range(0, n):
    name = title[i].text_content()
    colnames.append(name)
    #print(i) 


In [9]:
len(colnames)

9

In [10]:
#creating an empty array to insert the table elements 
cols = []

i = 0 #setting the increment 

for j in range(0, len(colnames)):
    i+1
    name = colnames[j] #getting the column name from the HTML table 
    #print('%d:"%s"'% (i, name))
    cols.append((name, []))
    

In [11]:
#Since out first row is the header, data is stored on the second row onwards

for j in range(1,len(table_elements)):
    #T is our j'th row
    T=table_elements[j]
    
    #If row is not of size 24, the //tr data is not from our table 
    if len(T)!=len(colnames):
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 

        #Append the data to the empty list of the i'th column
        cols[i][1].append(data)
        #Increment i for the next column
        i+=1

In [58]:
#creating a dictionary for the columns in the parsed table 
Dict={title:column for (title,column) in cols}

df=pd.DataFrame(Dict)

In [59]:
df

Unnamed: 0,Name,Team,Opp,Score,\nAtt,\nYds,\nAvg,\nTD,\nFUM
0,\r\n\tKenyan Drake\r\n,\r\n\tARI\r\n,\r\nvs CLE\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 38-24\r\n\r\n\...,22,137,6.2,4,0
1,\r\n\tJoe Mixon\r\n,\r\n\tCIN\r\n,\r\nvs NE\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 13-34\r\n\r\n\...,25,136,5.4,0,0
2,\r\n\tChris Carson\r\n,\r\n\tSEA\r\n,\r\n@ CAR\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 30-24\r\n\r\n\...,24,133,5.5,2,0
3,\r\n\tTony Pollard\r\n,\r\n\tDAL\r\n,\r\nvs LA\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 44-21\r\n\r\n\...,12,131,10.9,1,0
4,\r\n\tNick Chubb\r\n,\r\n\tCLE\r\n,\r\n@ ARI\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 24-38\r\n\r\n\...,17,127,7.5,1,0
...,...,...,...,...,...,...,...,...,...
391,\r\n\tRussell Wilson\r\n,\r\n\tSEA\r\n,\r\n@ CAR\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 30-24\r\n\r\n\...,3,-1,-0.3,0,0
392,\r\n\tTom Brady\r\n,\r\n\tNE\r\n,\r\n@ CIN\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 34-13\r\n\r\n\...,2,-2,-1.0,0,0
393,\r\n\tElijhaa Penny\r\n,\r\n\tNYG\r\n,\r\nvs MIA\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 36-20\r\n\r\n\...,1,-2,-2.0,0,0
394,\r\n\tTeddy Bridgewater\r\n,\r\n\tNO\r\n,\r\nvs IND\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 34-7\r\n\r\n\t...,3,-3,-1.0,0,0


### Light data cleaning 
    1. Dropping the "Rk" row because it is empty 
    2. Dropping a blank column used for formatting in the HTML 

In [61]:
escapes = ''.join([chr(char) for char in range(1, 32)])

translator = str.maketrans('', '', escapes)

In [62]:
df.columns = df.columns.str.translate(translator)

In [63]:
fixed = ['Name', 'Team', 'Opp', 'Score']

for i in fixed:
    df.loc[:, i] = df.loc[:, i].str.translate(translator)

In [64]:
df

Unnamed: 0,Name,Team,Opp,Score,Att,Yds,Avg,TD,FUM
0,Kenyan Drake,ARI,vs CLE,W 38-24,22,137,6.2,4,0
1,Joe Mixon,CIN,vs NE,L 13-34,25,136,5.4,0,0
2,Chris Carson,SEA,@ CAR,W 30-24,24,133,5.5,2,0
3,Tony Pollard,DAL,vs LA,W 44-21,12,131,10.9,1,0
4,Nick Chubb,CLE,@ ARI,L 24-38,17,127,7.5,1,0
...,...,...,...,...,...,...,...,...,...
391,Russell Wilson,SEA,@ CAR,W 30-24,3,-1,-0.3,0,0
392,Tom Brady,NE,@ CIN,W 34-13,2,-2,-1.0,0,0
393,Elijhaa Penny,NYG,vs MIA,W 36-20,1,-2,-2.0,0,0
394,Teddy Bridgewater,NO,vs IND,W 34-7,3,-3,-1.0,0,0


#### Data Validation with Pydantic 

In [65]:
df.columns

Index(['Name', 'Team', 'Opp', 'Score', 'Att', 'Yds', 'Avg', 'TD', 'FUM'], dtype='object')

In [66]:
from pydantic import BaseModel

class scrapped_passing(BaseModel):
    Name: str
    Team: str
    Opp: str 
    Score: str 
    Comp: int 
    Att: int 
    Yds: float
    TD: int 
    Int: int
    Sck: int
    FUM: int
    Rate: float

class scrapped_recieving(BaseModel):
    Name: str
    Team: str
    Opp: str 
    Score: str
    Rec: int 
    Yds: float
    Avg: float 
    TD: int
    Fum: int 
        
class scrapped_rushing(BaseModel):
    Name: str
    Team: str
    Opp: str 
    Score: str
    Rec: int 
    Yds: float
    Avg: float 
    TD: int
    Fum: int 

In [67]:
scraped_passing(Name = df.loc[:, 'Name'],
               Team = df.loc[:, 'Team'],
               Opp = df.loc[:, 'Opp'],
               Score = df.loc[:, 'Score'],
               Comp = df.loc[:, 'Comp'],
               Att = df.loc[:, 'Att'],
               Tds = df.loc[:, 'Yds'],
               TD = df.loc[:, 'TD'],
               Int = df.loc[:, 'Int'],
               Sck = df.loc[:, 'Sck'],
               FUM = df.loc[:, 'FUM'],
               Rate = df.loc[:, 'Rate'])

KeyError: 'Comp'

In [None]:
df.dtypes

In [None]:
#Grapping Parameters for looping 
n_rows = df.shape[0]
n_cols = df.shape[1]

In [None]:
#import library
import gspread 
#Service client credential from oauth2client
from oauth2client.service_account import ServiceAccountCredentials
# Print nicely
import pprint
#Create scope
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#create some credential using that scope and content of startup_funding.json
creds = ServiceAccountCredentials.from_json_keyfile_name('quickstart/g_sheet_creds.json',scope)
#create gspread authorize using that credential
client = gspread.authorize(creds)

In [None]:
# creating a new sheet to drop the data 
#new_sheet = client.create('target_sheet')

#new_sheet.share('players@fantasy-data-build.iam.gserviceaccount.com', perm_type='user', role='writer')

In [None]:
#sheet = client.open('target_sheet')

#ws = sheet.get_worksheet(0)

In [None]:
#Now will can access our google sheets we call client.open on StartupName
test_url = 'ttps://docs.google.com/spreadsheets/d/1253330Gq5L7fnSb9e7rz0ka7fjXGfe4F9I2AzUkkz3w/edit?folder=0AHsp7iQa2x1DUk9PVA#gid=0'
sheet = client.open_by_url(test_url)

ws = sheet.get_worksheet(0)

#Access all of the record inside that
#result = sheet.get_all_record()

shaped_data = df.transpose()

ws.insert_row(df.columns.tolist(), 1)

for i in range(1, n_rows): 
    row = df.iloc[i-1].tolist()
    index = i+1
    ws.insert_row(row, index)
    print(i)
