### Breaking Scraping and Writing into two steps 

In [1]:
from requests import get 
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup 

### Getting source URLs

In [2]:
# Endpoint that serves the weekly leader boards.
base_url = 'http://www.nfl.com/stats/weeklyleaders'

# Week numbers 1 through 16 (range's upper bound is exclusive).
weeks = list(range(1, 17))

# Stat categories requested for every week.
stat_cat = ['Passing', 'Rushing', 'Receiving']

# Every (week, category) combination, week-major so the three categories
# of a given week sit next to each other.
week_category_pairs = [(week, category) for week in weeks for category in stat_cat]

# Full page URL for each combination.
urls = [
    base_url + '?week={}&season=2019&showCategory={}'.format(week, category)
    for week, category in week_category_pairs
]

# Matching label for each URL, e.g. '2019_week_1_Passing'.
names = [
    '2019_week_{}_{}'.format(week, category)
    for week, category in week_category_pairs
]

In [3]:
def scrape(page_url, timeout=30):
    """Download one weekly-leaders page and return its stats table as a DataFrame.

    Parameters
    ----------
    page_url : str
        Full URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        One column per ``<th>`` header cell found on the page and one row
        per table row that follows the header; every value is the text
        content of the cell. Control characters (code points 1-31) are
        removed from the column names and from the ``Name``, ``Team``,
        ``Opp`` and ``Score`` columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If the page contains no ``<th>`` header cells.
    KeyError
        If one of the ``Name``/``Team``/``Opp``/``Score`` columns is missing.
    """
    my_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}

    # Fetch the page; the response object is printed so the status is visible
    # in the cell output for every URL.
    fantasy_page = get(page_url, headers=my_header, timeout=timeout)
    print(fantasy_page)

    # Fail loudly on an error status instead of parsing an error page and
    # hitting a confusing KeyError further down.
    fantasy_page.raise_for_status()

    doc = lh.fromstring(fantasy_page.content)

    # Every table row in the document, header row first.
    table_elements = doc.xpath('//tr')

    # Column names come from the header cells.
    colnames = [header_cell.text_content() for header_cell in doc.xpath('//tr//th')]
    if not colnames:
        raise ValueError('no table header cells found at ' + page_url)

    # One (name, values) pair per column; the lists are filled row by row.
    cols = [(name, []) for name in colnames]

    # The first row is the header, so data starts at the second row.
    for row in table_elements[1:]:
        # A row whose cell count differs from the header is not part of the
        # stats table; stop reading at that point.
        if len(row) != len(colnames):
            break

        for (_, values), cell in zip(cols, row.iterchildren()):
            values.append(cell.text_content())

    df = pd.DataFrame({title: column for (title, column) in cols})

    # Data cleaning: strip control characters (chr 1-31, e.g. tabs and
    # newlines) from the column names ...
    escapes = ''.join(chr(code) for code in range(1, 32))
    translator = str.maketrans('', '', escapes)
    df.columns = df.columns.str.translate(translator)

    # ... and from the text columns.
    for col in ['Name', 'Team', 'Opp', 'Score']:
        df.loc[:, col] = df.loc[:, col].astype(str).str.translate(translator)

    return df
        

In [4]:
def writer(df, sheet_name, share_email, gs_client=None, rows_per_pause=10, pause_seconds=20):
    """Create a Google spreadsheet and copy ``df`` into its first worksheet.

    The column names are written to row 1 and each DataFrame row is then
    inserted below it, one API call per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to upload.
    sheet_name : str
        Title of the spreadsheet to create.
    share_email : str
        Address that is granted ``writer`` access to the new spreadsheet.
    gs_client : optional
        Authorised gspread client. When omitted, the module-level ``client``
        is used, so that name must exist before the call.
    rows_per_pause : int, optional
        Print progress and sleep after every this many rows (default 10).
        Must be non-zero.
    pause_seconds : float, optional
        Length of each pause in seconds (default 20).
        NOTE(review): the pause presumably keeps the upload under the Sheets
        API rate limit - confirm before lowering it.

    Returns
    -------
    None
    """
    import time

    if gs_client is None:
        # Fall back to the notebook-level client created in the auth cell.
        gs_client = client

    # Create the spreadsheet and give the caller's account edit access.
    sheet = gs_client.create(sheet_name)
    sheet.share(share_email, perm_type='user', role='writer')

    # Write into the first worksheet: header first, then the data rows.
    ws = sheet.get_worksheet(0)
    ws.insert_row(df.columns.tolist(), 1)

    n_rows = df.shape[0]
    for i in range(1, n_rows + 1):
        row = df.iloc[i - 1].tolist()

        if i % rows_per_pause == 0:
            print(i)  # progress marker
            time.sleep(pause_seconds)

        # Sheet rows are 1-based and row 1 holds the header.
        ws.insert_row(row, i + 1)

    # Report the row count directly so an empty frame does not fail on an
    # unbound loop variable.
    print('row ', n_rows, ' end of file')

### Looping through URLs to create dataframes

In [5]:
# Start each stat category with its own empty DataFrame.
passing, receiving, rushing = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [6]:
# Scrape every page and sort the resulting frames by stat category.
# The frames are collected in lists and concatenated once at the end:
# concatenating inside the loop re-copies the accumulated data on every
# iteration, and it made this cell append the data again on each re-run.
passing_frames, receiving_frames, rushing_frames = [], [], []

for url in urls:
    df = scrape(url)

    if 'Passing' in url:
        passing_frames.append(df)
    elif 'Receiving' in url:
        receiving_frames.append(df)
    else:
        # Anything that is neither Passing nor Receiving is treated as Rushing.
        rushing_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
passing = pd.concat(passing_frames) if passing_frames else pd.DataFrame()
receiving = pd.concat(receiving_frames) if receiving_frames else pd.DataFrame()
rushing = pd.concat(rushing_frames) if rushing_frames else pd.DataFrame()
    

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [7]:
# Report (rows, columns) for each combined table.
for label, frame in (('Passing', passing), ('Receiving', receiving), ('Rushing', rushing)):
    print(label, frame.shape)

Passing (581, 12)
Receiving (3851, 9)
Rushing (1920, 9)


In [10]:
# Preview the first five receiving rows.
receiving.head(n=5)

Unnamed: 0,Name,Team,Opp,Score,Rec,Yds,Avg,TD,FUM
0,Sammy Watkins,KC,@ JAX,W 40-26,9,198,22.0,3,0
1,Michael Gallup,DAL,vs NYG,W 35-17,7,158,22.6,0,0
2,John Ross,CIN,@ SEA,L 20-21,7,158,22.6,2,0
3,DeSean Jackson,PHI,vs WAS,W 32-27,8,154,19.2,2,0
4,Marquise Brown,BAL,@ MIA,W 59-10,4,147,36.8,2,0


In [12]:
# Preview the first five rushing rows.
rushing.head(n=5)

Unnamed: 0,Name,Team,Opp,Score,Att,Yds,Avg,TD,FUM
0,Marlon Mack,IND,@ LAC,L 24-30,25,174,7.0,1,0
1,Christian McCaffrey,CAR,vs LA,L 27-30,19,128,6.7,2,0
2,Saquon Barkley,NYG,@ DAL,L 17-35,11,120,10.9,0,0
3,Dalvin Cook,MIN,vs ATL,W 28-12,21,111,5.3,2,0
4,Mark Ingram,BAL,@ MIA,W 59-10,14,107,7.6,2,0


### Writing the data 

In [8]:
# Google Sheets client library.
import gspread
# Service-account credential loader from oauth2client.
from oauth2client.service_account import ServiceAccountCredentials
# Pretty-printer; not referenced in the cells visible here.
import pprint

# API scopes requested for the service account: the Sheets feed and Drive.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Build credentials for those scopes from the key file
# quickstart/g_sheet_creds.json.
creds = ServiceAccountCredentials.from_json_keyfile_name(
    'quickstart/g_sheet_creds.json',
    scope,
)

# Authorised gspread client; writer() reads this module-level name.
client = gspread.authorize(creds)

In [9]:
# Account that is given edit access to each spreadsheet created below.
my_email = 'matthewjchristy66@gmail.com'

# Upload the passing table to a new spreadsheet titled 'passing'.
writer(df=passing, sheet_name='passing', share_email=my_email)



10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
row  581  end of file


In [None]:
# Upload the remaining tables, receiving first and then rushing, each to
# its own spreadsheet shared with the same account.
for frame, title in ((receiving, 'receiving'), (rushing, 'rushing')):
    writer(frame, title, share_email=my_email)