### Breaking Scraping and Writing into two steps 

In [1]:
from requests import get 
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup 

### Getting source URLs

In [1]:
# Weekly-leaders endpoint; week, season and stat category are passed as query parameters.
base_url = 'http://www.nfl.com/stats/weeklyleaders'

# Week numbers 1 through 17.
weeks = list(range(1, 18))

stat_cat = ['Passing', 'Rushing', 'Receiving']

# Every (week, category) combination, week-major, categories in stat_cat order.
week_category_pairs = [(week, category) for week in weeks for category in stat_cat]

# One page URL per combination.
urls = [
    base_url + '?week=' + str(week) + '&season=2019' + '&showCategory=' + category
    for week, category in week_category_pairs
]

# Label matching each URL position-for-position, e.g. '2019_week_1_Passing'.
names = [
    '2019_' + 'week_' + str(week) + '_' + category
    for week, category in week_category_pairs
]

In [3]:
def get_url(url, timeout=30):
    """Fetch a page over HTTP and return the response object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    The response returned by ``requests.get``; callers read ``.content``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never parsed as if it were the stats table.
    """
    # Browser-like User-Agent sent with every request.
    my_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}
    fantasy_page = get(url, headers=my_header, timeout=timeout)

    # Fail loudly on HTTP errors instead of handing back an error page.
    fantasy_page.raise_for_status()

    return fantasy_page

In [4]:
def get_headers(html):
    """Return the text of every table header cell in a fetched page.

    Parameters
    ----------
    html : response-like object
        Its ``.content`` attribute is parsed with ``lxml.html``.

    Returns
    -------
    list of str
        ``text_content()`` of each ``<th>`` nested under a ``<tr>``, in the
        order the XPath query returns them. The text is returned as-is.
    """
    tree = lh.fromstring(html.content)

    # One entry per <th> element found inside any table row.
    return [cell.text_content() for cell in tree.xpath('//tr//th')]

In [5]:
def get_table_data(html, colnames):
    """Collect the table body of a fetched page column by column.

    Parameters
    ----------
    html : response-like object
        Its ``.content`` attribute is parsed with ``lxml.html``.
    colnames : list of str
        Header names, one per expected column.

    Returns
    -------
    list of (str, list) tuples
        One ``(column name, values)`` pair per entry of ``colnames``; the
        values are the ``text_content()`` of each cell, top to bottom.
    """
    tree = lh.fromstring(html.content)
    rows = tree.xpath('//tr')

    # One (name, values) bucket per column; the value lists are filled below.
    cols = [(header, []) for header in colnames]
    expected_width = len(colnames)

    # The first <tr> is the header row, so data starts at the second one.
    for row in rows[1:]:
        # A row whose child count differs from the header count is taken to
        # be outside the table; collection stops at the first such row.
        if len(row) != expected_width:
            break

        # Pair each cell with its column bucket, left to right.
        for (_, values), cell in zip(cols, row.iterchildren()):
            values.append(cell.text_content())

    return cols

In [6]:
def create_df(cols):
    """Build a DataFrame from ``(column name, values)`` pairs.

    Parameters
    ----------
    cols : iterable of (str, list)
        Column title paired with that column's values.

    Returns
    -------
    pandas.DataFrame
        One column per pair, keyed by title.
    """
    # Map each title to its list of values, then let pandas lay out the frame.
    column_map = dict(cols)
    return pd.DataFrame(column_map)

In [7]:
def clean_df(df, page_url):
    """Strip control characters from a scraped table and tag it with its week.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw scraped table. It is modified in place and also returned.
        Must contain the columns 'Name', 'Team', 'Opp' and 'Score' once the
        column labels have been cleaned.
    page_url : str
        URL the table was scraped from; the week is read from its
        ``week=...&season`` query fragment.

    Returns
    -------
    pandas.DataFrame
        The same frame, with a 'Week' column (the week as a string) inserted
        at position 1.

    Raises
    ------
    ValueError
        If ``page_url`` contains no ``week=...&season`` fragment.
    """
    import re

    # Translation table that deletes the control characters chr(1)..chr(31)
    # (tabs, newlines, ...).
    control_chars = ''.join(chr(code) for code in range(1, 32))
    translator = str.maketrans('', '', control_chars)
    df.columns = df.columns.str.translate(translator)

    # Same clean-up for the text columns.
    for column in ('Name', 'Team', 'Opp', 'Score'):
        df.loc[:, column] = df.loc[:, column].astype(str).str.translate(translator)

    # Read the week from the URL of *this* page rather than from a global list,
    # so each table is labelled with its own week.
    match = re.search('week=(.*)&season', page_url)
    if match is None:
        raise ValueError('no week found in page_url: ' + repr(page_url))

    df.insert(1, 'Week', match.group(1))

    return df

In [16]:
def _column_letters(col_number):
    """Convert a 1-based column number to A1-notation letters (1 -> 'A', 27 -> 'AA')."""
    letters = ''
    while col_number > 0:
        col_number, remainder = divmod(col_number - 1, 26)
        letters = chr(ord('A') + remainder) + letters
    return letters


def writer(data, sheet_name, share_email, gs_client=None):
    """Create a Google spreadsheet, share it, and write a DataFrame into it.

    The column labels go in row 1 and the data rows follow, starting at A1 of
    the first worksheet. The frame's index is not written.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to upload. Missing values are written as empty strings.
    sheet_name : str
        Title of the spreadsheet to create.
    share_email : str
        Account that is given writer access to the new spreadsheet.
    gs_client : optional
        Authorised gspread client. When omitted, the notebook-level ``client``
        is used.

    Raises
    ------
    ValueError
        If ``data`` has no columns, or if the worksheet returns a cell range
        whose size does not match the number of values to write.
    """
    n_rows, n_cols = data.shape
    if n_cols == 0:
        # Checked before anything is created remotely.
        raise ValueError('cannot write a DataFrame with no columns')

    if gs_client is None:
        gs_client = client

    # Create the spreadsheet and give the target account edit rights.
    sheet = gs_client.create(sheet_name)
    sheet.share(share_email, perm_type='user', role='writer')

    # Header row plus one row per record; column letters work past 'Z'.
    end_row = n_rows + 1
    sheet_range = 'A1:' + _column_letters(n_cols) + str(end_row)

    # Flatten to header followed by the rows in row-major order. Every cell is
    # kept (missing values become '') so values stay aligned with the range.
    flat_values = data.columns.tolist()
    for row in data.itertuples(index=False, name=None):
        flat_values.extend('' if pd.isna(value) else value for value in row)

    ws = sheet.get_worksheet(0)

    # Grow (never shrink) the grid so the whole range exists before writing.
    ws.resize(rows=max(ws.row_count, end_row), cols=max(ws.col_count, n_cols))
    cell_list = ws.range(sheet_range)

    if len(cell_list) != len(flat_values):
        raise ValueError(
            'range ' + sheet_range + ' has ' + str(len(cell_list))
            + ' cells but ' + str(len(flat_values)) + ' values were prepared'
        )

    for cell, value in zip(cell_list, flat_values):
        cell.value = value

    # Single batched update for the whole range.
    ws.update_cells(cell_list)

### Looping through URLs to create dataframes

In [9]:
# One empty, independent frame per stat category, to be filled by the scrape loop.
passing, receiving, rushing = (pd.DataFrame() for _ in range(3))

In [10]:
# Scrape every page once, collecting the cleaned tables per stat category.
# Frames are gathered in lists and concatenated a single time afterwards:
# concatenating inside the loop copies the growing frame on every page, and
# re-running the cell would append a second copy of every page.
frames_by_category = {'Passing': [], 'Receiving': [], 'Rushing': []}

for page_url in urls:
    source_html = get_url(page_url)
    colnames = get_headers(source_html)
    table_data = get_table_data(source_html, colnames)
    raw_df = create_df(table_data)
    df = clean_df(raw_df, page_url)

    # The category is identified by its name appearing in the URL; anything
    # that is neither Passing nor Receiving is treated as Rushing.
    if 'Passing' in page_url:
        frames_by_category['Passing'].append(df)
    elif 'Receiving' in page_url:
        frames_by_category['Receiving'].append(df)
    else:
        frames_by_category['Rushing'].append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
passing = pd.concat(frames_by_category['Passing']) if frames_by_category['Passing'] else pd.DataFrame()
receiving = pd.concat(frames_by_category['Receiving']) if frames_by_category['Receiving'] else pd.DataFrame()
rushing = pd.concat(frames_by_category['Rushing']) if frames_by_category['Rushing'] else pd.DataFrame()
    

In [11]:
# Report (rows, columns) for each category frame, one line per category.
for label, frame in (('Passing', passing), ('Receiving', receiving), ('Rushing', rushing)):
    print(label, frame.shape)

Passing (621, 13)
Receiving (4101, 10)
Rushing (2061, 10)


### Writing the data 

In [14]:
# Pretty-printer, kept available for inspecting API responses.
import pprint

# Google Sheets client library.
import gspread
# Service-account credential loader from oauth2client.
from oauth2client.service_account import ServiceAccountCredentials

# API scopes requested for the service account.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Load the service-account key file and bind it to the scopes above.
creds = ServiceAccountCredentials.from_json_keyfile_name(
    '../quickstart/g_sheet_creds.json',
    scope,
)

# Authorised gspread client built from those credentials.
client = gspread.authorize(creds)

# Account that each created spreadsheet is shared with.
my_email = 'matthewjchristy66@gmail.com'

In [17]:
# Upload the passing table to a new spreadsheet named 'passing'.
writer(passing, 'passing', my_email)

In [18]:
# Upload the receiving table to a new spreadsheet named 'receiving'.
writer(receiving, 'receiving', my_email)

In [19]:
# Upload the rushing table to a new spreadsheet named 'rushing'.
writer(rushing, 'rushing', my_email)