## Web Scraping and Writing to Google Sheets
    1. Get the data and parse with requests, lxml and Beatiful Soup 
    2. Data wrange into dictionaries 
    3. Create Pandas DF and clean the data
    4. Write to Google Sheets 

In [1]:
from requests import get 
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup 

In [2]:
#imputing URL and creating header creds for parsing 
url = 'http://www.nfl.com/stats/weeklyleaders?week=15&season=2019&showCategory=Passing'
my_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}


In [3]:
#grabbing the HTML and getting text 
fantasy_page = get(url, headers = my_header)

doc = lh.fromstring(fantasy_page.content)

In [4]:
print(fantasy_page)

<Response [200]>


Response == 200 means the page scraped successfully 

In [6]:
#parsing table elements in the HTML inside the pattern "//tr" --> this is a table element 

table_elements = doc.xpath('//tr')

In [None]:
[len(T) for T in table_elements[:-1]]

In [35]:
table_elements[0].iterchildren

<bound method _Element.iterchildren of <Element tr at 0x13e36043688>>

In [36]:
colnames = doc.xpath('//tbody')

In [39]:
colnames[0].text_content()

'\n\n\r\n\tJameis Winston\r\n\n\r\n\tTB\r\n\n\r\n@ DET\r\n\n\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n\xa038-17\r\n\r\n\t\r\n \r\n\n28\n42\n458\n4\n1\n2\n0\n124.9\n\n\r\n\tPatrick Mahomes\r\n\n\r\n\tKC\r\n\n\r\nvs DEN\r\n\n\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n\xa023-3\r\n\r\n\t\r\n \r\n\n27\n34\n340\n2\n1\n3\n0\n115.7\n\n\r\n\tMitchell Trubisky\r\n\n\r\n\tCHI\r\n\n\r\n@ GB\r\n\n\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n\xa013-21\r\n\r\n\t\r\n \r\n\n29\n53\n334\n1\n2\n3\n0\n64.5\n\n\r\n\tDrew Brees\r\n\n\r\n\tNO\r\n\n\r\nvs IND\r\n\n\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n\xa034-7\r\n\r\n\t\r\n \r\n\n29\n30\n307\n4\n0\n0\n0\n148.9\n\n\r\n\tPhilip Rivers\r\n\n\r\n\tLAC\r\n\n\r\nvs MIN\r\n\n\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n\xa010-39\r\n\r\n\t\r\n \r\n\n28\n39\n307\n1\n3\n3\n1\n71.2\n\n\r\n\tRussell Wilson\r\n\n\r\n\tSEA\r\n\n\r\n@ CAR\r\n\n\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n\xa030-24\r\n\r\n\t\r\n \r\n\n20\n26\n286\n2\n0\n2\n0\n137.7\n\n\r\n\tJared Goff\r\n\n\r\n\tLA\r\n\n\r\n@ DAL\r\n\n\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n\xa021-44\r\n

In [9]:
#creating an empty array to insert the table elements 
cols = []
i = 0 #setting the increment 

for t in table_elements[1]:
    i+1
    name = t.text_content() #getting the column name from the HTML table 
    #print('%d:"%s"'% (i, name))
    cols.append((name, [])); 

In [10]:
table_elements[1].text_content()

'\n\r\n\tJameis Winston\r\n\n\r\n\tTB\r\n\n\r\n@ DET\r\n\n\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n\xa038-17\r\n\r\n\t\r\n \r\n\n28\n42\n458\n4\n1\n2\n0\n124.9'

In [12]:
cols[0]

('\r\n\tJameis Winston\r\n', [])

In [13]:
#Since out first row is the header, data is stored on the second row onwards

for j in range(1,len(table_elements)):
    #T is our j'th row
    T=table_elements[j]
    
    #If row is not of size 24, the //tr data is not from our table 
    if len(T)!=12:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 

        #Append the data to the empty list of the i'th column
        cols[i][1].append(data)
        #Increment i for the next column
        i+=1

In [16]:
cols = cols[0:13]

In [19]:
#creating a dictionary for the columns in the parsed table 
Dict={title:column for (title,column) in cols}

df=pd.DataFrame(Dict)

In [20]:
df

Unnamed: 0,\r\n\tJameis Winston\r\n,\r\n\tTB\r\n,\r\n@ DET\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 38-17\r\n\r\n\t\r\n \r\n,28,42,458,4,1,2,0,124.9
0,\r\n\tJameis Winston\r\n,\r\n\tTB\r\n,\r\n@ DET\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 38-17\r\n\r\n\...,28,42,458,4,1,2,0,124.9
1,\r\n\tPatrick Mahomes\r\n,\r\n\tKC\r\n,\r\nvs DEN\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 23-3\r\n\r\n\t...,27,34,340,2,1,3,0,115.7
2,\r\n\tMitchell Trubisky\r\n,\r\n\tCHI\r\n,\r\n@ GB\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 13-21\r\n\r\n\...,29,53,334,1,2,3,0,64.5
3,\r\n\tDrew Brees\r\n,\r\n\tNO\r\n,\r\nvs IND\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 34-7\r\n\r\n\t...,29,30,307,4,0,0,0,148.9
4,\r\n\tPhilip Rivers\r\n,\r\n\tLAC\r\n,\r\nvs MIN\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 10-39\r\n\r\n\...,28,39,307,1,3,3,1,71.2
5,\r\n\tRussell Wilson\r\n,\r\n\tSEA\r\n,\r\n@ CAR\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 30-24\r\n\r\n\...,20,26,286,2,0,2,0,137.7
6,\r\n\tJared Goff\r\n,\r\n\tLA\r\n,\r\n@ DAL\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 21-44\r\n\r\n\...,33,51,284,2,1,2,0,84.1
7,\r\n\tEli Manning\r\n,\r\n\tNYG\r\n,\r\nvs MIA\r\n,\r\n\r\n\tW\r\n\t\r\n\t\r\n\r\n 36-20\r\n\r\n\...,20,28,283,2,3,1,0,87.9
8,\r\n\tRyan Fitzpatrick\r\n,\r\n\tMIA\r\n,\r\n@ NYG\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 20-36\r\n\r\n\...,23,41,279,2,0,3,1,93.4
9,\r\n\tRyan Tannehill\r\n,\r\n\tTEN\r\n,\r\nvs HOU\r\n,\r\n\r\n\t\r\n\tL\r\n\t\r\n\r\n 21-24\r\n\r\n\...,22,36,279,2,1,2,0,92.2


### Light data cleaning 
    1. Dropping the "Rk" row because it is empty 
    2. Dropping a blank column used for formatting in the HTML 

In [27]:
escapes = ''.join([chr(char) for char in range(1, 32)])

translator = str.maketrans('', '', escapes)

t = df.iloc[0,0]

In [32]:
df.iloc[:, 0].str.translate(translator)

0          Jameis Winston
1         Patrick Mahomes
2       Mitchell Trubisky
3              Drew Brees
4           Philip Rivers
5          Russell Wilson
6              Jared Goff
7             Eli Manning
8        Ryan Fitzpatrick
9          Ryan Tannehill
10             Kyle Allen
11             Derek Carr
12           Carson Wentz
13         Dwayne Haskins
14           David Blough
15         Baker Mayfield
16         Deshaun Watson
17           Kyler Murray
18            Sam Darnold
19          Lamar Jackson
20           Dak Prescott
21              Matt Ryan
22              Drew Lock
23           Kirk Cousins
24          Aaron Rodgers
25          Devlin Hodges
26        Gardner Minshew
27        Jimmy Garoppolo
28        Jacoby Brissett
29            Andy Dalton
30             Josh Allen
31              Tom Brady
32          Johnny Hekker
33            Alex Tanney
34        Dustin Colquitt
35            Josh Gordon
36         Robert Griffin
37             Brett Kern
38    Christ

In [28]:
t.translate(translator)

'Jameis Winston'

In [14]:
df = df.drop(df.loc[df["Rk"] == 'Rk'].index)
df = df.drop('', 1)

In [15]:
#Grapping Parameters for looping 
n_rows = df.shape[0]
n_cols = df.shape[1]

In [26]:
#import library
import gspread 
#Service client credential from oauth2client
from oauth2client.service_account import ServiceAccountCredentials
# Print nicely
import pprint
#Create scope
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#create some credential using that scope and content of startup_funding.json
creds = ServiceAccountCredentials.from_json_keyfile_name('quickstart/g_sheet_creds.json',scope)
#create gspread authorize using that credential
client = gspread.authorize(creds)

In [24]:
# creating a new sheet to drop the data 
#new_sheet = client.create('target_sheet')

#new_sheet.share('players@fantasy-data-build.iam.gserviceaccount.com', perm_type='user', role='writer')

In [25]:
#sheet = client.open('target_sheet')

#ws = sheet.get_worksheet(0)

In [28]:
#Now will can access our google sheets we call client.open on StartupName
sheet = client.open_by_url('https://docs.google.com/spreadsheets/d/11D57-HeQYujW7vZtN0MXsGpyNgWj45Q87BPiAD9CNCk/edit#gid=0') #2019-q4_fantasy-web-scraping/passing

ws = sheet.get_worksheet(0)

#Access all of the record inside that
#result = sheet.get_all_record()

shaped_data = df.transpose()

ws.insert_row(df.columns.tolist(), 1)

for i in range(1, n_rows): 
    row = df.iloc[i-1].tolist()
    index = i+1
    ws.insert_row(row, index)
    print(i)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
