In [1]:
# Import libaries
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Step 1: Create a soup object from the home page

In [2]:
# Set the url of the webpage to scrape
url = "https://pages.git.generalassemb.ly/rldaggie/for-scraping/"
# Generate a response
response = requests.get(url)
# Print response, if response = 200, all is good
print(response.status_code)
# Pull the HTML string out of requests and convert it to a Python string.
html = response.text

200


In [3]:
# Create BeautifulSoup Object
soup = BeautifulSoup(html, 'lxml')

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [4]:
# Define result list
results_list = []
# Gather table data from main page
main_pg_td = soup.find_all("td")
# Gather title of restaurant and link
for element in main_pg_td:
    # start a dictionary to store this item's data
    result = {}
    # get the title and full link/url
    a_href = element.find('a')
    if a_href:
        result['restaurant'] = a_href.text   # element text
        result['link'] = a_href['href'] # href link
        results_list.append(result)
# results_list is the data with resturant name and link to it
len(results_list)

44

In [5]:
results_list

[{'restaurant': 'A&W Restaurants', 'link': 'restaurants/1.html'},
 {'restaurant': "Applebee's", 'link': 'restaurants/2.html'},
 {'restaurant': "Arby's", 'link': 'restaurants/3.html'},
 {'restaurant': 'Atlanta Bread Company', 'link': 'restaurants/4.html'},
 {'restaurant': "Bojangle's Famous Chicken 'n Biscuits",
  'link': 'restaurants/5.html'},
 {'restaurant': 'Buffalo Wild Wings', 'link': 'restaurants/6.html'},
 {'restaurant': 'Burger King', 'link': 'restaurants/7.html'},
 {'restaurant': "Captain D's", 'link': 'restaurants/8.html'},
 {'restaurant': "Carl's Jr.", 'link': 'restaurants/9.html'},
 {'restaurant': "Charley's Grilled Subs", 'link': 'restaurants/10.html'},
 {'restaurant': 'Chick-fil-A', 'link': 'restaurants/11.html'},
 {'restaurant': "Chili's", 'link': 'restaurants/12.html'},
 {'restaurant': 'Chipotle Mexican Grill', 'link': 'restaurants/13.html'},
 {'restaurant': "Church's", 'link': 'restaurants/14.html'},
 {'restaurant': 'Corner Bakery Cafe', 'link': 'restaurants/15.html'},


### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [9]:
def find_heading(pg_soup):
    '''Find the table headings of a page
    Arg:
        pg_soup: soup object of that page
    Return:
        res_pg_th_lst: result of page table head in a list
    '''
    res_pg_th_lst = []
    # Gather table heading data from main page
    res_pg_th = pg_soup.find_all("th")
    for th in res_pg_th:
        res_pg_th_lst.append(th.text.lower())
    return res_pg_th_lst

In [10]:
def find_table_content(pg_soup, res_pg_th_lst, pg_name, food_lst):
    '''Find the contents in the table and update food_lst
    Args:
        pg_soup: soup object of that page
        res_pg_th_lst: result of page table head in a list
        pg_name: name of the page
        food_lst: existing data list
    Return:
        food_lst: updated food_lst
    '''
    # Find the table
    res_pg_table = pg_soup.find("table", class_="table")
    # Find the rows in table body
    for row in res_pg_table.tbody.find_all('tr'):    
        # Find all data entry for each row
        column = row.find_all('td')
        if (column != []):
            # dict comprehension to tie th and td
            col_dict = {th: td.text.strip() for th, td in zip(res_pg_th_lst, column)}
            # insert name of restaurant
            col_dict['restaurant']=pg_name
            food_lst.append(col_dict)
    return food_lst

In [11]:
food_lst = []
for res in results_list:
    pg_link = res['link']
    pg_name = res['restaurant']
    # Create soup obj for each page
    pg_response = requests.get(url+pg_link)
    pg_html = pg_response.text
    pg_soup = BeautifulSoup(pg_html, 'lxml')
    # for each page, find headings
    res_pg_th_lst = find_heading(pg_soup)
    # for each page, update food_lst
    food_lst = find_table_content(pg_soup,res_pg_th_lst,pg_name,food_lst)

In [12]:
len(food_lst)

5131

### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows

In [14]:
df = pd.DataFrame.from_records(food_lst,index=list(range(0,5131)))

In [15]:
df.head()

Unnamed: 0,name,category,calories,fat,carbs,restaurant
0,Original Bacon Double Cheeseburger,Burgers,760,45,45,A&W Restaurants
1,Coney (Chili) Dog,Entrees,340,20,26,A&W Restaurants
2,Chili Fries,French Fries,370,15,49,A&W Restaurants
3,Strawberry Milkshake (small),Shakes,670,29,90,A&W Restaurants
4,A&WÂ® Root Beer Freeze (large),Shakes,820,18,150,A&W Restaurants


### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [19]:
save_path = './scrapped_result.csv'

In [20]:
df.to_csv(save_path,index=False)