## Web Scraping and Writing to Google Sheets
    1. Get the data and parse with requests, lxml and Beatiful Soup 
    2. Data wrange into dictionaries 
    3. Create Pandas DF and clean the data
    4. Write to Google Sheets 

#### Loading scraping packages and getting pages 

In [52]:
from requests import get 
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup 

#### Importing google sheets writing packages and declaring credentials 

In [53]:
#import library
import gspread 
#Service client credential from oauth2client
from oauth2client.service_account import ServiceAccountCredentials
# Print nicely
import pprint
#Create scope
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#create some credential using that scope and content of startup_funding.json
creds = ServiceAccountCredentials.from_json_keyfile_name('quickstart/g_sheet_creds.json',scope)
#create gspread authorize using that credential
client = gspread.authorize(creds)


#### Creating list of URLs for scraping and writing, and loading helper function 

In [54]:
base_url = 'http://www.nfl.com/stats/weeklyleaders'

weeks = [x for x in range(1,17)]

stat_cat = ['Passing', 'Rushing', 'Receiving']

urls = []

names =[]

for i in weeks:
    for j in stat_cat:
        full_url = base_url + '?week=' + str(i) + '&season=2019' + '&showCategory=' + j
        urls.append(full_url)
        name = '2019_' + 'week_' + str(i) + '_' + j
        names.append(name)

#### Adding validation with pydantic 

In [57]:
from pydantic import BaseModel, HttpUrl

class url_model(BaseModel):
    url: HttpUrl

class nfl_query_model(BaseModel):
    week: int
    stat_category: str
        


In [58]:
#runing the validation on urls 
for i in range(0, len(urls)):
    url_model(url = urls[i])

In [59]:
#running validation on query 
for i in range(0, 17):
    for j in stat_cat:
        nfl_query_model(week = i, stat_category = j)

#### Scraping NFL.com 

In [60]:
# %load scrape_helper.py 

def scraper(page_url, sheet_name, share_mail): 
    
    my_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}
    
    #grabbing the HTML and getting text 
    fantasy_page = get(page_url, headers = my_header)

    doc = lh.fromstring(fantasy_page.content)
    
    print(fantasy_page)
    
    #parsing table elements in the HTML inside the pattern "//tr" --> this is a table element 

    table_elements = doc.xpath('//tr')
    
    #getting column names 
    title = doc.xpath('//tr//th')
    
    colnames = []

    n = len(title)

    for i in range(0, n):
        name = title[i].text_content()
        colnames.append(name)
            
  #creating an empty array to insert the table elements 
    cols = []

    i = 0 #setting the increment 

    for j in range(0, len(colnames)):
        i+1
        name = colnames[j] #getting the column name from the HTML table 
        #print('%d:"%s"'% (i, name))
        cols.append((name, []))

   #Since out first row is the header, data is stored on the second row onwards

    for j in range(1,len(table_elements)):
        #T is our j'th row
        T=table_elements[j]

        #If row is not of size 24, the //tr data is not from our table 
        if len(T)!=12:
            break

        #i is the index of our column
        i=0

        #Iterate through each element of the row
        for t in T.iterchildren():
            data=t.text_content() 

            #Append the data to the empty list of the i'th column
            cols[i][1].append(data)
            #Increment i for the next column
            i+=1
        
    #creating a dictionary for the columns in the parsed table 
    Dict={title:column for (title,column) in cols}

    df=pd.DataFrame(Dict)
    
    #data cleaning 
    escapes = ''.join([chr(char) for char in range(1, 32)])
    translator = str.maketrans('', '', escapes)
    df.columns = df.columns.str.translate(translator)
    
    #fixing escape charaters
    fixed = ['Name', 'Team', 'Opp', 'Score']
    for i in fixed:
        df.loc[:, i] = df.loc[:, i].astype(str).str.translate(translator)
        
    #Grapping Parameters for looping 
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    
    #writing to google sheets 
    import time 

    #Now will can access our google sheets we call client.open on StartupName
    sheet = client.create(sheet_name) #2019-q4_fantasy-web-scraping/passing

    sheet.share(share_mail,  perm_type='user', role='writer') #sharing my email 
    
    #writing data to the worksheet
    ws = sheet.get_worksheet(0)

    shaped_data = df.transpose()

    ws.insert_row(df.columns.tolist(), 1)

    for i in range(1, n_rows+1): 
        row = df.iloc[i-1].tolist()
        index = i+1
        if i%10 == 0: #printing the step in the loop
            print(i)  
            time.sleep(15)
            
        ws.insert_row(row, index) #writing the data 
    
    print('row ', i, ' end of file')
    time.sleep(45)

### Looping throught the pages and writing to google sheets

In [None]:
for j in range(0,len(urls)):

    scraper(page_url = urls[j], sheet_name = names[j], share_mail = 'matthewjchristy66@gmail.com')
    
    print(names[j])
    

<Response [200]>
10
20
30
row  38  end of file
2019_week_1_Passing
<Response [200]>
row  Score  end of file
2019_week_1_Rushing
<Response [200]>
row  Score  end of file
2019_week_1_Receiving
<Response [200]>
10
20
30
row  38  end of file
2019_week_2_Passing
<Response [200]>
row  Score  end of file
2019_week_2_Rushing
<Response [200]>
row  Score  end of file
2019_week_2_Receiving
<Response [200]>
10
20
30
row  34  end of file
2019_week_3_Passing
<Response [200]>
row  Score  end of file
2019_week_3_Rushing
<Response [200]>
row  Score  end of file
2019_week_3_Receiving
<Response [200]>
10
20
30
row  38  end of file
2019_week_4_Passing
<Response [200]>
row  Score  end of file
2019_week_4_Rushing
<Response [200]>
row  Score  end of file
2019_week_4_Receiving
<Response [200]>
10
20
30
row  39  end of file
2019_week_5_Passing
<Response [200]>
row  Score  end of file
2019_week_5_Rushing
<Response [200]>
row  Score  end of file
2019_week_5_Receiving
<Response [200]>
10
20
30
row  31  end of fil

In [50]:


scraper(page_url = urls[0], sheet_name = 'test_sheet', share_mail = 'matthewjchristy66@gmail.com')
    

    

<Response [200]>
10
20
30
row  38  end of file
