## Web Scraping and Writing to Google Sheets
    1. Get the data and parse with requests, lxml and Beatiful Soup 
    2. Data wrange into dictionaries 
    3. Create Pandas DF and clean the data
    4. Write to Google Sheets 

#### Loading scraping packages and getting pages 

In [1]:
from requests import get 
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup 

#### Importing google sheets writing packages and declaring credentials 

In [4]:
#import library
import gspread 
#Service client credential from oauth2client
from oauth2client.service_account import ServiceAccountCredentials
# Print nicely
import pprint
#Create scope
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#create some credential using that scope and content of startup_funding.json
creds = ServiceAccountCredentials.from_json_keyfile_name('quickstart/g_sheet_creds.json',scope)
#create gspread authorize using that credential
client = gspread.authorize(creds)


creating list of URLs for scraping and writing, and loading helper function 

In [5]:
# [passing, recieving, rushing]
source_urls = ['http://pfref.com/tiny/WZEUG', 'http://pfref.com/tiny/LsMhB', 'http://pfref.com/tiny/Nik5u']

sheet_names = ['passing', 'recieving', 'rushing']

my_email = 'matthewjchristy66@gmail.com'

In [7]:
# %load scrape_fcn2.py 

def scraper(page_url, sheet_name, share_mail): 
    
    my_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}
    
    #grabbing the HTML and getting text 
    fantasy_page = get(page_url, headers = my_header)

    doc = lh.fromstring(fantasy_page.content)
    
    print(fantasy_page)
    
    #parsing table elements in the HTML inside the pattern "//tr" --> this is a table element 

    table_elements = doc.xpath('//tr')
    
    #creating an empty array to insert the table elements 
    cols = []
    i = 0 #setting the increment 

    for t in table_elements[1]:
        i+1
        name = t.text_content() #getting the column name from the HTML table 
        #print('%d:"%s"'% (i, name))
        cols.append((name, [])); 
        
        #Since out first row is the header, data is stored on the second row onwards

    for j in range(1,len(table_elements)):
        #T is our j'th row
        T=table_elements[j]

        #If row is not of size 24, the //tr data is not from our table 
        if len(T)!=len(table_elements[1]):
            break

        #i is the index of our column
        i=0

        #Iterate through each element of the row
        for t in T.iterchildren():
            data=t.text_content() 

            #Append the data to the empty list of the i'th column
            cols[i][1].append(data)
            #Increment i for the next column
            i+=1
        
    #creating a dictionary for the columns in the parsed table 
    Dict={title:column for (title,column) in cols}

    df=pd.DataFrame(Dict)
    
    #data cleaning 
    df = df.drop(df.loc[df["Rk"] == 'Rk'].index)
    df = df.drop('', 1)
    
    #Grapping Parameters for looping 
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    
    #writing to google sheets 
    import time 

    #Now will can access our google sheets we call client.open on StartupName
    sheet = client.create(sheet_name) #2019-q4_fantasy-web-scraping/passing

    sh.share(my_mail,  perm_type='user', role='writer') #sharing my email 
    
    #writing data to the worksheet
    ws = sheet.get_worksheet(0)

    shaped_data = df.transpose()

    ws.insert_row(df.columns.tolist(), 1)

    for i in range(1, n_rows+1): 
        row = df.iloc[i-1].tolist()
        index = i+1
        if i%10 == 0: #printing the step in the loop
            print(i)  
            time.sleep(15)
            
        ws.insert_row(row, index) #writing the data 
    
    print('row ', i, ' end of file')
    time.sleep(45)

### Looping throught the pages and writing to google sheets

In [6]:
for j in range(0,len(source_urls)):

    scraper(page_url = source_urls[j], sheet_name = sheet_name[j], share_mail = my_email)
    
    print(source_urls[j], 'url', j+1)
    

<Response [200]>
10
20
30
row 36  end of file
http://pfref.com/tiny/WZEUG url 1
<Response [200]>
10
20
30
40
50
60


APIError: {
  "error": {
    "code": 429,
    "message": "Quota exceeded for quota group 'WriteGroup' and limit 'Write requests per user per 100 seconds' of service 'sheets.googleapis.com' for consumer 'project_number:686366856477'.",
    "status": "RESOURCE_EXHAUSTED",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.Help",
        "links": [
          {
            "description": "Google developer console API key",
            "url": "https://console.developers.google.com/project/686366856477/apiui/credential"
          }
        ]
      }
    ]
  }
}
