# Cheese Recommender

## Scraping the data

In [1]:
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import unicodedata

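The goal is to recommend cheeses based on their attributes (TODO: describe the recommendation approach). The data comes from cheese.com, which is very lenient about programmatic access.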

In [2]:
url = 'https://cheese.com/alphabetical/?per_page=100&i=a&page=1#top'

In [3]:
# Make request
req = requests.get(url)

In [4]:
req

<Response [200]>

In [5]:
req.url

'https://cheese.com/alphabetical/?per_page=100&i=a&page=1#top'

In [6]:
req.text



In [7]:
# Decode the body as JSON only when the server labels it as JSON.
# `headers.get` returns None when the Content-Type header is absent, which would make
# the `in` test raise TypeError, so fall back to an empty string.
content_type = req.headers.get('Content-Type', '')
if 'json' in content_type:
    js = req.json()
else:
    print('Response content is not in JSON format.')
    js = 'spam'  # sentinel string assigned when no JSON was decoded

Response content is not in JSON format.


In [8]:
req.headers.get('Content-Type')

'text/html; charset=utf-8'

In [9]:
soup = BeautifulSoup(req.content, 'lxml')

In [10]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<!-- Google Analytics -->
<script>
    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-36291228-1', 'auto');
    ga('send', 'pageview');
    </script>
<!-- End Google Analytics -->
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/media/img/favicon.ico" rel="icon" type="image/ico"/>
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Cheese.com - World's Greatest Cheese Resource</title>
<script>
        /**
        * Function that tracks a click on an outbound link in A

In [11]:
div = soup.find('div', {'class':'catalog internal'})
check = soup.find_all('a')

In [12]:
div

<div class="catalog internal">
<div class="grid row">
<div class="col-sm-6 col-md-4 cheese-item text-center">
<h3><a href="/abbaye-de-belloc/">Abbaye de Belloc</a></h3>
<div class="cheese-image">
<div class="cheese-image-border">
<a href="/abbaye-de-belloc/"><img alt="Abbaye de Belloc" src="/media/img/cheese-thumbs/Abbaye-de-Belloc.jpg" title="Abbaye de Belloc"/></a>
</div>
</div>
</div>
<div class="col-sm-6 col-md-4 cheese-item text-center">
<h3><a href="/abbaye-de-belval/">Abbaye de Belval</a></h3>
<div class="cheese-image">
<div class="cheese-image-border">
<a href="/abbaye-de-belval/"><img alt="Abbaye de Belval" src="/media/img/cheese-thumbs/Belval-biere.jpg" title="Abbaye de Belval"/></a>
</div>
</div>
</div>
<div class="col-sm-6 col-md-4 cheese-item text-center">
<h3><a href="/abbaye-de-citeaux/">Abbaye de Citeaux</a></h3>
<div class="cheese-image">
<div class="cheese-image-border">
<a href="/abbaye-de-citeaux/"><img alt="Abbaye de Citeaux" src="/media/img/cheese-thumbs/Abbaye-de

In [13]:
# function to strip strings of accents, from BartoszKP and oefe on StackOverflow
# https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string

import unicodedata
def strip_accents(s):
    """Return `s` with its accent marks removed.

    The string is decomposed with Unicode NFD normalization, which separates base
    characters from their combining marks; every character whose Unicode category
    is 'Mn' is then dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


In [14]:

# Build a normalized name for every cheese heading on the page: lowercase, trimmed,
# spaces turned into underscores, apostrophes and parentheses removed, accents stripped.
big_chz = []
for row in div.find_all('h3'):
    raw_name = row.find('a').text.lower().strip()
    for old, new in ((" ", "_"), ("'", ""), ("’", ""), ("(", ""), (")", "")):
        raw_name = raw_name.replace(old, new)
    big_chz.append(strip_accents(raw_name))

In [15]:
big_chz[6]

'abertam'

In [16]:
## based off of Riley Dallas' code, provided in the youtube Project 3 info session video from May 2018
## https://www.youtube.com/watch?v=5Y3ZE26Ciuk
## ALSO based on Wesley Bosse and Douglas Strodtman's 'CART - Mini-demo using reddit data' DC-Flex lesson 

# make a function that scrapes a url for data.  make sure to use a beautifulsoup for url
def get_orig_chz_names(letter, pages):
    """Scrape one page of the cheese.com alphabetical index and return its cheese names.

    Parameters
    ----------
    letter : str
        Value substituted for the ``i`` query parameter of the index URL.
    pages : str or int
        Value substituted for the ``page`` query parameter of the index URL.

    Returns
    -------
    list of str
        Link text of each ``<h3><a>`` entry inside the ``catalog internal`` div, as
        it appears on the page. Empty when the response status is not 200 or the
        page contains no such div.
    """
    url = f'https://cheese.com/alphabetical/?per_page=100&i={letter}&page={pages}#top'
    names = []

    res = requests.get(url)

    if res.status_code == 200:
        soup = BeautifulSoup(res.content, 'lxml')
        div = soup.find('div', {'class':'catalog internal'})

        if div is None:
            # soup.find returns None when nothing matches; report it instead of
            # failing with AttributeError on div.find_all.
            print('ERROR')
            print(f'no cheese catalog found at {url}')
        else:
            for row in div.find_all('h3'):
                link = row.find('a')
                # skip headings that carry no link rather than crashing on None.text
                if link is not None:
                    names.append(link.text)
    else:
        # report the failing status code and return the empty list
        print('ERROR')
        print(res.status_code)

    # pause for one second after every request so repeated calls do not hammer the site
    time.sleep(1)

    return names

In [17]:
# cheese.com lists its cheeses alphabetically, one index per letter
alph = list('abcdefghijklmnopqrstuvwxyz')

# the original author found at most 3 pages per letter; asking for a page past the
# last one returns the last existing page again, which produces duplicate names
page = ['1', '2', '3']

# visit every (letter, page) combination and pool the scraped names
all_orig_names = []
for let in alph:
    for p in page:
        all_orig_names.extend(get_orig_chz_names(let, p))

# A set removes the duplicates but loses the ordering, so sort afterwards.
# Ordering is case-insensitive (casefold); names that differ only by case are
# ordered by their plain string value.
# https://stackoverflow.com/questions/10269701/case-insensitive-list-sorting-without-lowercasing-the-result
all_orig_names = sorted(set(all_orig_names), key=lambda s: (s.casefold(), s))

In [18]:
# note that we want this list to fix to our dataframe later on
all_orig_names

['Abbaye de Belloc',
 'Abbaye de Belval',
 'Abbaye de Citeaux',
 'Abbaye de Timadeuc',
 'Abbaye du Mont des Cats',
 'Abbot’s Gold',
 'Abertam',
 'Abondance',
 'Acapella',
 'Accasciato ',
 'Ackawi',
 'Acorn',
 'Adelost',
 'ADL Brick Cheese',
 'ADL Mild Cheddar',
 'Affidelice au Chablis',
 'Affineur Walo Rotwein Sennechäs',
 "Afuega'l Pitu",
 'Aged British Cheddar',
 'Aged Cashew & Blue Green Algae Cheese',
 'Aged Cashew & Brazil Nut Cheese',
 'Aged Cashew & Dulse Cheese',
 'Aged Cashew & Hemp Seed Cheese',
 'Aged Cashew Nut & Kale Cheese',
 'Aged Cashew Nut Cheese',
 'Aged Chelsea',
 'Aged Gouda',
 'Aggiano',
 'Ailsa Craig',
 'Airag',
 'Airedale',
 'Aisy Cendre',
 'Allgauer Emmentaler',
 'Allium Piper',
 'Alma Vorarlberger Alpkäse (3-5 months)',
 'Alma Vorarlberger Alpkäse (6-9 months)',
 'Alma Vorarlberger Bergkäse (10 months)',
 'Alma Vorarlberger Bergkäse (12 months)',
 'Alma Vorarlberger Bergkäse (6 months)',
 'Almnäs Tegel',
 'Alpe di Frabosa',
 'Alpha Tolman',
 'Alpicrème',
 'Alpi

In [19]:
len(all_orig_names)

1828

In [20]:
# add each cheese name to the list, and format it (lowercase, remove leading and ending whitespaces, 
# replace whitespaces and ampersands with hyphens, remove apostrophes, percent, parentheses and accents).  
# this list will be able to be used to look up the webpage for each cheese on cheese.com

# Turn each display name into a URL-style slug. The substitutions run in the order
# listed, after lowercasing and trimming, and accents are stripped last.
web_friendly_names = []
for orig_name in all_orig_names:
    slug = orig_name.lower().strip()
    for old, new in (
        (" ", "-"), ("'", ""), ("’", ""), ("-&-", "-"), ("(", ""), (")", ""),
        ("%", ""), ("…", ""), (".", ""), (",", ""), ("-/-", "-"), ("---", "-"),
    ):
        slug = slug.replace(old, new)
    web_friendly_names.append(strip_accents(slug))

In [21]:
web_friendly_names

['abbaye-de-belloc',
 'abbaye-de-belval',
 'abbaye-de-citeaux',
 'abbaye-de-timadeuc',
 'abbaye-du-mont-des-cats',
 'abbots-gold',
 'abertam',
 'abondance',
 'acapella',
 'accasciato',
 'ackawi',
 'acorn',
 'adelost',
 'adl-brick-cheese',
 'adl-mild-cheddar',
 'affidelice-au-chablis',
 'affineur-walo-rotwein-sennechas',
 'afuegal-pitu',
 'aged-british-cheddar',
 'aged-cashew-blue-green-algae-cheese',
 'aged-cashew-brazil-nut-cheese',
 'aged-cashew-dulse-cheese',
 'aged-cashew-hemp-seed-cheese',
 'aged-cashew-nut-kale-cheese',
 'aged-cashew-nut-cheese',
 'aged-chelsea',
 'aged-gouda',
 'aggiano',
 'ailsa-craig',
 'airag',
 'airedale',
 'aisy-cendre',
 'allgauer-emmentaler',
 'allium-piper',
 'alma-vorarlberger-alpkase-3-5-months',
 'alma-vorarlberger-alpkase-6-9-months',
 'alma-vorarlberger-bergkase-10-months',
 'alma-vorarlberger-bergkase-12-months',
 'alma-vorarlberger-bergkase-6-months',
 'almnas-tegel',
 'alpe-di-frabosa',
 'alpha-tolman',
 'alpicreme',
 'alpine-gold',
 'alpine-styl

In [22]:
len(web_friendly_names)

1828

In [23]:
# find index numbers of all of the cheese names that throw errors when entered to reference the website
# then go back and find out the web page's errors and replace the values at these indices with the 
# string values that cheese.com uses to call each webpage
names_frame = pd.DataFrame(web_friendly_names)

# generated slugs that do not match the page address cheese.com actually uses;
# print where each one sits in the list
for mismatched_slug in (
    'austrian-alps-cheese',
    'brinza-feta-style',
    'buchette-a-la-sarriette',
    'fromage-blanc-with-truffle',
    'kabritt',
    'laclare-farms-evalon-with-cummin',
    'laclare-farms-evalon-with-fenugreek',
    'little-bloom-on-the-prairie',
    'lord-of-the-hundreds',
    'purples-a-must',
    'saltbush-chevre',
    'wasatch-mountain-cheese',
    'wensleydale-with-cranberries',
    'white-stilton-with-mango-ginger',
    'wyfe-of-bath',
):
    print(names_frame[names_frame == mismatched_slug].dropna())

                       0
96  austrian-alps-cheese
                     0
259  brinza-feta-style
                           0
295  buchette-a-la-sarriette
                              0
682  fromage-blanc-with-truffle
           0
845  kabritt
                                    0
908  laclare-farms-evalon-with-cummin
                                       0
909  laclare-farms-evalon-with-fenugreek
                               0
973  little-bloom-on-the-prairie
                        0
989  lord-of-the-hundreds
                   0
1383  purples-a-must
                    0
1494  saltbush-chevre
                            0
1769  wasatch-mountain-cheese
                                 0
1777  wensleydale-with-cranberries
                                    0
1783  white-stilton-with-mango-ginger
                 0
1801  wyfe-of-bath


In [24]:
# Replace the generated slugs that do not match cheese.com's real page addresses.
# The corrections are keyed by the generated slug rather than by list position:
# positions are only valid for one particular scrape and shift as soon as the site
# adds or removes a cheese, which would silently overwrite the wrong entries.
slug_corrections = {
    'austrian-alps-cheese': 'austrian-alps',
    'brinza-feta-style': 'brinza---feta-style',
    'buchette-a-la-sarriette': 'buchette-la-sarriette',
    'fromage-blanc-with-truffle': 'fromage-blanc-truffle',
    'kabritt': 'Kabritt',
    'laclare-farms-evalon-with-cummin': 'laclare-farms-evalon-cummin',
    'laclare-farms-evalon-with-fenugreek': 'laclare-farms-evalon-fenugreek',
    'little-bloom-on-the-prairie': 'little-bloom-prairie',
    'lord-of-the-hundreds': 'lord-hundreds',
    'purples-a-must': 'purples-must',
    'saltbush-chevre': 'saltbush_chevre',
    'wasatch-mountain-cheese': 'wasatch-mountain',
    'wensleydale-with-cranberries': 'wensleydale-cranberries',
    'white-stilton-with-mango-ginger': 'white-stilton-mango-ginger',
    'wyfe-of-bath': 'wyfe-bath',
}

# slice assignment keeps the update in place; slugs without a correction are unchanged,
# so re-running this cell is harmless
web_friendly_names[:] = [slug_corrections.get(slug, slug) for slug in web_friendly_names]

In [25]:
web_friendly_names

['abbaye-de-belloc',
 'abbaye-de-belval',
 'abbaye-de-citeaux',
 'abbaye-de-timadeuc',
 'abbaye-du-mont-des-cats',
 'abbots-gold',
 'abertam',
 'abondance',
 'acapella',
 'accasciato',
 'ackawi',
 'acorn',
 'adelost',
 'adl-brick-cheese',
 'adl-mild-cheddar',
 'affidelice-au-chablis',
 'affineur-walo-rotwein-sennechas',
 'afuegal-pitu',
 'aged-british-cheddar',
 'aged-cashew-blue-green-algae-cheese',
 'aged-cashew-brazil-nut-cheese',
 'aged-cashew-dulse-cheese',
 'aged-cashew-hemp-seed-cheese',
 'aged-cashew-nut-kale-cheese',
 'aged-cashew-nut-cheese',
 'aged-chelsea',
 'aged-gouda',
 'aggiano',
 'ailsa-craig',
 'airag',
 'airedale',
 'aisy-cendre',
 'allgauer-emmentaler',
 'allium-piper',
 'alma-vorarlberger-alpkase-3-5-months',
 'alma-vorarlberger-alpkase-6-9-months',
 'alma-vorarlberger-bergkase-10-months',
 'alma-vorarlberger-bergkase-12-months',
 'alma-vorarlberger-bergkase-6-months',
 'almnas-tegel',
 'alpe-di-frabosa',
 'alpha-tolman',
 'alpicreme',
 'alpine-gold',
 'alpine-styl

In [26]:
# Fetch a single cheese page and parse it
url2 = 'https://cheese.com/abbaye-de-belloc/'
req = requests.get(url2)
soup = BeautifulSoup(req.content, 'lxml')

# the list element with class "summary-points" on that page
ul = soup.find('ul', class_='summary-points')

In [27]:
ul.find_all('p')#[0]

[<p>Made from unpasteurized <a href="/by_milk/?m=cow">cow</a>'s and <a href="/by_milk/?m=sheep">sheep</a>'s milk</p>,
 <p>Country of origin: <a href="/by_country/?c=FR">France</a></p>,
 <p>Region: Pays Basque</p>,
 <p>Type: <a href="/by_type/?t=semi-hard">semi-hard</a>, artisan</p>,
 <p>Texture: <a href="/by_texture/?t=creamy">creamy</a>, <a href="/by_texture/?t=dense">dense</a> and <a href="/by_texture/?t=firm">firm</a></p>,
 <p>Rind: natural</p>,
 <p>Colour: yellow</p>,
 <p>Flavour: burnt caramel</p>,
 <p>Aroma: lanoline</p>,
 <p>Vegetarian: yes </p>,
 <p>Producers: Abbaye de Notre-Dame de Belloc </p>,
 <p>Synonyms: Abbaye Notre-Dame de Belloc</p>]

In [28]:
# Label the first twelve summary paragraphs of this page by their position.
field_names = ['Milk_type', 'Country_of_origin', 'Region', 'Type', 'Texture', 'Rind',
               'Color', 'Flavor', 'Aroma', 'Vegetarian', 'Producers', 'Synonyms']

paragraphs = ul.find_all('p')
cheese = {field: paragraphs[position].text for position, field in enumerate(field_names)}

posts = [cheese]

In [29]:
cheese

{'Milk_type': "Made from unpasteurized cow's and sheep's milk",
 'Country_of_origin': 'Country of origin: France',
 'Region': 'Region: Pays Basque',
 'Type': 'Type: semi-hard, artisan',
 'Texture': 'Texture: creamy, dense and firm',
 'Rind': 'Rind: natural',
 'Color': 'Colour: yellow',
 'Flavor': 'Flavour: burnt caramel',
 'Aroma': 'Aroma: lanoline',
 'Vegetarian': 'Vegetarian: yes ',
 'Producers': 'Producers: Abbaye de Notre-Dame de Belloc ',
 'Synonyms': 'Synonyms: Abbaye Notre-Dame de Belloc'}

In [30]:
cheese['Milk_type'] = 'Milk_type: '+ cheese['Milk_type']

In [31]:
cheese

{'Milk_type': "Milk_type: Made from unpasteurized cow's and sheep's milk",
 'Country_of_origin': 'Country of origin: France',
 'Region': 'Region: Pays Basque',
 'Type': 'Type: semi-hard, artisan',
 'Texture': 'Texture: creamy, dense and firm',
 'Rind': 'Rind: natural',
 'Color': 'Colour: yellow',
 'Flavor': 'Flavour: burnt caramel',
 'Aroma': 'Aroma: lanoline',
 'Vegetarian': 'Vegetarian: yes ',
 'Producers': 'Producers: Abbaye de Notre-Dame de Belloc ',
 'Synonyms': 'Synonyms: Abbaye Notre-Dame de Belloc'}

In [32]:
# Collect one single-entry dict per summary paragraph, keyed by the paragraph's position.
posts = []
count = 0
for count, row in enumerate(ul.find_all('p'), start=1):
    cheese = {count - 1: row.text}
    posts.append(cheese)

In [33]:
# Collect the text of every summary paragraph into a flat list.
paragraphs = ul.find_all('p')
posts = [paragraph.text for paragraph in paragraphs]
count = len(posts)
if paragraphs:
    cheese = {}

In [34]:
web_friendly_names[0:3]

['abbaye-de-belloc', 'abbaye-de-belval', 'abbaye-de-citeaux']

In [35]:
posts

["Made from unpasteurized cow's and sheep's milk",
 'Country of origin: France',
 'Region: Pays Basque',
 'Type: semi-hard, artisan',
 'Texture: creamy, dense and firm',
 'Rind: natural',
 'Colour: yellow',
 'Flavour: burnt caramel',
 'Aroma: lanoline',
 'Vegetarian: yes ',
 'Producers: Abbaye de Notre-Dame de Belloc ',
 'Synonyms: Abbaye Notre-Dame de Belloc']

In [36]:
## based off of Riley Dallas' code, provided in the youtube Project 3 info session video from May 2018
## https://www.youtube.com/watch?v=5Y3ZE26Ciuk
## ALSO based on Wesley Bosse and Douglas Strodtman's 'CART - Mini-demo using reddit data' DC-Flex lesson 

# make a function that scrapes a url for data.  make sure to use a beautifulsoup for url
def get_chz_posts(cheese_name):
    """Scrape the summary facts listed on one cheese.com cheese page.

    Parameters
    ----------
    cheese_name : str
        Path segment appended to ``https://cheese.com/`` to form the page URL.

    Returns
    -------
    list of str
        Text of every ``<p>`` inside the page's ``summary-points`` list, in page
        order (for example ``'Country of origin: France'``). Empty when the
        response status is not 200 or the page has no ``summary-points`` list.
    """
    url = f'https://cheese.com/{cheese_name}'
    posts = []

    res = requests.get(url)

    if res.status_code != 200:
        # report which page failed and with what status, then return the empty list
        print('ERROR')
        print(res.status_code)
        print(url)
        return posts

    soup = BeautifulSoup(res.content, 'lxml')
    ul = soup.find('ul', {'class':'summary-points'})

    if ul is None:
        # soup.find returns None when nothing matches; report it instead of
        # failing with AttributeError on ul.find_all.
        print('ERROR')
        print(f'no summary-points list found at {url}')
        return posts

    posts.extend(row.text for row in ul.find_all('p'))
    return posts

In [37]:
for n in web_friendly_names:
    print(n)

abbaye-de-belloc
abbaye-de-belval
abbaye-de-citeaux
abbaye-de-timadeuc
abbaye-du-mont-des-cats
abbots-gold
abertam
abondance
acapella
accasciato
ackawi
acorn
adelost
adl-brick-cheese
adl-mild-cheddar
affidelice-au-chablis
affineur-walo-rotwein-sennechas
afuegal-pitu
aged-british-cheddar
aged-cashew-blue-green-algae-cheese
aged-cashew-brazil-nut-cheese
aged-cashew-dulse-cheese
aged-cashew-hemp-seed-cheese
aged-cashew-nut-kale-cheese
aged-cashew-nut-cheese
aged-chelsea
aged-gouda
aggiano
ailsa-craig
airag
airedale
aisy-cendre
allgauer-emmentaler
allium-piper
alma-vorarlberger-alpkase-3-5-months
alma-vorarlberger-alpkase-6-9-months
alma-vorarlberger-bergkase-10-months
alma-vorarlberger-bergkase-12-months
alma-vorarlberger-bergkase-6-months
almnas-tegel
alpe-di-frabosa
alpha-tolman
alpicreme
alpine-gold
alpine-style
alps-rebel
alta-badia
alverca
amablu-blue-cheese
amalthee
amarelo-de-beira-baixa
ambert
ameribella
american-cheese
ami-du-chambertin
amish-frolic
amou
amsterdammer-british-colu

In [38]:
get_chz_posts('brinza---feta-style')

["Made from sheep's milk",
 'Country of origin: New Zealand',
 'Region: Queenstown',
 'Family: Feta',
 'Type: soft, brined',
 'Texture: creamy, crumbly and open',
 'Rind: natural',
 'Colour: white',
 'Flavour: citrusy, salty, sweet, tangy',
 'Vegetarian:  no ',
 'Producers: The Gibbston Valley Cheese Company',
 'Synonyms: Briza Feta']

In [47]:
# Scrape every cheese page (one request per slug); each cheese's list of summary
# strings becomes one row of the dataframe.
posts = [get_chz_posts(n) for n in web_friendly_names]
chz_df = pd.DataFrame(posts)

In [39]:
chz_df

Unnamed: 0,Milk_type,Country_of_origin,Region,Type,Texture,Rind,Color,Flavor,Aroma,Vegetarian,Producers,Synonyms
0,Made from unpasteurized cow's and sheep's milk,Country of origin: France,Region: Pays Basque,"Type: semi-hard, artisan","Texture: creamy, dense and firm",Rind: natural,Colour: yellow,Flavour: burnt caramel,Aroma: lanoline,Vegetarian: yes,Producers: Abbaye de Notre-Dame de Belloc,Synonyms: Abbaye Notre-Dame de Belloc


In [49]:
# save our cheese  dataframe to the file big_cheese.csv
#chz_df.to_csv('./data/big_cheese.csv', index=False)

In [40]:
# read ./data/big_cheese.csv (the file named in the commented-out to_csv call) into chz_df2
chz_df2 = pd.read_csv('./data/big_cheese.csv')

In [41]:
chz_df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Made from unpasteurized cow's and sheep's milk,Country of origin: France,Region: Pays Basque,"Type: semi-hard, artisan","Texture: creamy, dense and firm",Rind: natural,Colour: yellow,Flavour: burnt caramel,Aroma: lanoline,Vegetarian: yes,Producers: Abbaye de Notre-Dame de Belloc,Synonyms: Abbaye Notre-Dame de Belloc,,,,,
1,Made from cow's milk,Country of origin: France,Type: semi-hard,Fat content: 40-46%,Texture: elastic,Rind: washed,Colour: ivory,Aroma: aromatic,Vegetarian: no,,,,,,,,
2,Made from unpasteurized cow's milk,Country of origin: France,Region: Burgundy,"Type: semi-soft, artisan, brined","Texture: creamy, dense and smooth",Rind: washed,Colour: white,"Flavour: acidic, milky, smooth","Aroma: barnyardy, earthy",Vegetarian: no,Producers: Frères Frédéric and Joel,,,,,,
3,Made from pasteurized cow's milk,Country of origin: France,Region: province of Brittany,Type: semi-hard,Texture: soft,Rind: washed,Colour: pale yellow,Vegetarian: no,Producers: Abbaye Cistercienne NOTRE-DAME DE ...,,,,,,,,
4,Made from pasteurized cow's milk,Country of origin: France,Region: Nord-Pas-de-Calais,"Type: semi-soft, artisan, brined",Fat content: 50%,Texture: smooth and supple,Rind: washed,Colour: pale yellow,"Flavour: milky, salty",Aroma: floral,Vegetarian: no,Producers: Abbaye du Mont des Cats,,,,,
5,Made from pasteurized cow's milk,"Country of origin: England, Great Britain and ...",Region: North Yorkshire,Family: Cheddar,Type: semi-hard,"Texture: creamy, crumbly, dense and semi firm",Rind: natural,Colour: pale yellow,"Flavour: mild, sweet, tangy",Aroma: aromatic,Vegetarian: yes,Producers: Wensleydale Creamery,Synonyms: English Cheddar with Caramelized Oni...,,,,
6,Made from sheep's milk,Country of origin: Czech Republic,Region: Karlovy Vary,"Type: hard, artisan",Fat content: 45%,Texture: firm,Rind: natural,Colour: pale yellow,"Flavour: acidic, strong, tangy",Vegetarian: no,,,,,,,
7,Made from unpasteurized cow's milk,Country of origin: France,"Region: Haute-Savoie, Abondance",Family: Tomme,"Type: semi-hard, artisan",Fat content: 48%,"Texture: creamy, open and supple",Rind: natural,Colour: pale yellow,"Flavour: acidic, buttery, fruity, sweet",Aroma: nutty,Vegetarian: no,Alternative spellings: Tomme d'Abondance,,,,
8,Made from goat's milk,Country of origin: United States,Region: California,"Type: soft, soft-ripened",Flavour: buttery,"Aroma: fresh, herbal",Vegetarian: no,Producers: Andante Dairy,,,,,,,,,
9,Made from Buffalo's and cow's milk,Country of origin: Italy,Region: Campania,Type: semi-hard,Texture: firm,Rind: natural,Colour: pale yellow,Flavour: sweet,"Aroma: aromatic, fresh",Vegetarian: no,Producers: Casa Madaio,,,,,,


In [42]:
chz_df2[chz_df2.iloc[:,16].notnull()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
729,"Made from pasteurized or unpasteurized cow's, ...",Country of origin: Netherlands,"Region: South Holland, Gouda",Family: Gouda,"Type: semi-hard, artisan, brined, processed",Fat content (in dry matter): 76%,Fat content: 31 g/100g,Calcium content: 958 mg/100g,"Texture: compact, crumbly, dense and springy",Rind: waxed,Colour: yellow,"Flavour: creamy, full-flavored, nutty, sweet",Aroma: pungent,Vegetarian: no,"Producers: FrieslandCampina, Uniekaas Nederlan...","Synonyms: Boerenkass Gouda, Graskaas Gouda, Jo...",Alternative spellings: Goudam


In [43]:
chz_df2.iloc[1826]['0']

'Country of origin: Portugal'

In [44]:
len(chz_df2)

1828

In [45]:
list(range(16))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [46]:
# NOTE(review): chz_df3 is first assigned in a later cell, so on a fresh
# top-to-bottom run this line raises NameError (as the recorded output shows).
chz_df3.shape

NameError: name 'chz_df3' is not defined

In [47]:
# NOTE(review): chz_df3 is first assigned in a later cell, so on a fresh
# top-to-bottom run this line raises NameError (as the recorded output shows).
chz_df3[16][1826]

NameError: name 'chz_df3' is not defined

In [48]:
chz_df2.iloc[1826][16]

nan

In [49]:
posts

["Made from unpasteurized cow's and sheep's milk",
 'Country of origin: France',
 'Region: Pays Basque',
 'Type: semi-hard, artisan',
 'Texture: creamy, dense and firm',
 'Rind: natural',
 'Colour: yellow',
 'Flavour: burnt caramel',
 'Aroma: lanoline',
 'Vegetarian: yes ',
 'Producers: Abbaye de Notre-Dame de Belloc ',
 'Synonyms: Abbaye Notre-Dame de Belloc']

In [50]:
# mark every missing cell with the string 'empty' so all cells can be treated as text
chz_df2 = chz_df2.fillna('empty')

In [51]:
chz_df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Made from unpasteurized cow's and sheep's milk,Country of origin: France,Region: Pays Basque,"Type: semi-hard, artisan","Texture: creamy, dense and firm",Rind: natural,Colour: yellow,Flavour: burnt caramel,Aroma: lanoline,Vegetarian: yes,Producers: Abbaye de Notre-Dame de Belloc,Synonyms: Abbaye Notre-Dame de Belloc,empty,empty,empty,empty,empty
1,Made from cow's milk,Country of origin: France,Type: semi-hard,Fat content: 40-46%,Texture: elastic,Rind: washed,Colour: ivory,Aroma: aromatic,Vegetarian: no,empty,empty,empty,empty,empty,empty,empty,empty
2,Made from unpasteurized cow's milk,Country of origin: France,Region: Burgundy,"Type: semi-soft, artisan, brined","Texture: creamy, dense and smooth",Rind: washed,Colour: white,"Flavour: acidic, milky, smooth","Aroma: barnyardy, earthy",Vegetarian: no,Producers: Frères Frédéric and Joel,empty,empty,empty,empty,empty,empty
3,Made from pasteurized cow's milk,Country of origin: France,Region: province of Brittany,Type: semi-hard,Texture: soft,Rind: washed,Colour: pale yellow,Vegetarian: no,Producers: Abbaye Cistercienne NOTRE-DAME DE ...,empty,empty,empty,empty,empty,empty,empty,empty
4,Made from pasteurized cow's milk,Country of origin: France,Region: Nord-Pas-de-Calais,"Type: semi-soft, artisan, brined",Fat content: 50%,Texture: smooth and supple,Rind: washed,Colour: pale yellow,"Flavour: milky, salty",Aroma: floral,Vegetarian: no,Producers: Abbaye du Mont des Cats,empty,empty,empty,empty,empty
5,Made from pasteurized cow's milk,"Country of origin: England, Great Britain and ...",Region: North Yorkshire,Family: Cheddar,Type: semi-hard,"Texture: creamy, crumbly, dense and semi firm",Rind: natural,Colour: pale yellow,"Flavour: mild, sweet, tangy",Aroma: aromatic,Vegetarian: yes,Producers: Wensleydale Creamery,Synonyms: English Cheddar with Caramelized Oni...,empty,empty,empty,empty
6,Made from sheep's milk,Country of origin: Czech Republic,Region: Karlovy Vary,"Type: hard, artisan",Fat content: 45%,Texture: firm,Rind: natural,Colour: pale yellow,"Flavour: acidic, strong, tangy",Vegetarian: no,empty,empty,empty,empty,empty,empty,empty
7,Made from unpasteurized cow's milk,Country of origin: France,"Region: Haute-Savoie, Abondance",Family: Tomme,"Type: semi-hard, artisan",Fat content: 48%,"Texture: creamy, open and supple",Rind: natural,Colour: pale yellow,"Flavour: acidic, buttery, fruity, sweet",Aroma: nutty,Vegetarian: no,Alternative spellings: Tomme d'Abondance,empty,empty,empty,empty
8,Made from goat's milk,Country of origin: United States,Region: California,"Type: soft, soft-ripened",Flavour: buttery,"Aroma: fresh, herbal",Vegetarian: no,Producers: Andante Dairy,empty,empty,empty,empty,empty,empty,empty,empty,empty
9,Made from Buffalo's and cow's milk,Country of origin: Italy,Region: Campania,Type: semi-hard,Texture: firm,Rind: natural,Colour: pale yellow,Flavour: sweet,"Aroma: aromatic, fresh",Vegetarian: no,Producers: Casa Madaio,empty,empty,empty,empty,empty,empty


In [52]:
# Every scraped cell is a "<label>: <value>" string, but the same label lands in a
# different column from one cheese to the next. Re-file each value under one fixed
# output column per label and drop the label text.
#
# Each entry: (text the cell must START with, label text removed from the front,
#              output column). Matching on the start of the cell, not anywhere in
# it, keeps a value such as "Producers: Some Family Farm" out of the Family column.
field_layout = [
    ("Made from", "Made from ", 0),
    ("Country of origin", "Country of origin: ", 1),
    ("Region", "Region: ", 2),
    ("Family", "Family: ", 3),
    ("Type", "Type: ", 4),
    ("Fat content (in dry matter)", "Fat content (in dry matter): ", 5),
    ("Fat content:", "Fat content: ", 6),
    ("Calcium content", "Calcium content: ", 7),
    ("Texture", "Texture: ", 8),
    ("Rind", "Rind: ", 9),
    ("Colour", "Colour: ", 10),
    ("Flavour", "Flavour: ", 11),
    ("Aroma", "Aroma: ", 12),
    ("Vegetarian", "Vegetarian: ", 13),
    ("Producers", "Producers: ", 14),
    ("Synonyms", "Synonyms: ", 15),
    ("Alternative spellings", "Alternative spellings: ", 16),
]

organized_rows = []
# Walk every column of every row. The earlier range(16) stopped one column short,
# so a value stored in the last column was never copied over.
for raw_row in chz_df2.itertuples(index=False, name=None):
    organized = [np.nan] * 17
    for cell in raw_row:
        # skip anything that is not text (e.g. NaN if missing cells were not filled)
        if not isinstance(cell, str):
            continue
        for marker, label_text, column in field_layout:
            if cell.startswith(marker):
                # remove the label when it is present in its usual "<label>: " form,
                # otherwise keep the cell text untouched
                if cell.startswith(label_text):
                    organized[column] = cell[len(label_text):]
                else:
                    organized[column] = cell
                break
    organized_rows.append(organized)

# Build the frame in one step instead of chained assignment (chz_df3[col][row] = ...),
# which pandas does not guarantee to write through to the frame.
chz_df3 = pd.DataFrame(organized_rows,
                       index=list(range(len(chz_df2))),
                       columns=list(range(17)),
                       dtype=object)
        

In [53]:
# Inspect the reorganized frame; its columns still carry the positional labels 0-16.
chz_df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,unpasteurized cow's and sheep's milk,France,Pays Basque,,"semi-hard, artisan",,,,"creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,
1,cow's milk,France,,,semi-hard,,40-46%,,elastic,washed,ivory,,aromatic,no,,,
2,unpasteurized cow's milk,France,Burgundy,,"semi-soft, artisan, brined",,,,"creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,
3,pasteurized cow's milk,France,province of Brittany,,semi-hard,,,,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,
4,pasteurized cow's milk,France,Nord-Pas-de-Calais,,"semi-soft, artisan, brined",,50%,,smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,
5,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,Cheddar,semi-hard,,,,"creamy, crumbly, dense and semi firm",natural,pale yellow,"mild, sweet, tangy",aromatic,yes,Wensleydale Creamery,"English Cheddar with Caramelized Onions, Caram...",
6,sheep's milk,Czech Republic,Karlovy Vary,,"hard, artisan",,45%,,firm,natural,pale yellow,"acidic, strong, tangy",,no,,,
7,unpasteurized cow's milk,France,"Haute-Savoie, Abondance",Tomme,"semi-hard, artisan",,48%,,"creamy, open and supple",natural,pale yellow,"acidic, buttery, fruity, sweet",nutty,no,,,Tomme d'Abondance
8,goat's milk,United States,California,,"soft, soft-ripened",,,,,,,buttery,"fresh, herbal",no,Andante Dairy,,
9,Buffalo's and cow's milk,Italy,Campania,,semi-hard,,,,firm,natural,pale yellow,sweet,"aromatic, fresh",no,Casa Madaio,,


In [54]:
# Swap the positional labels 0-16 for the attribute each column holds.
# The order must match the order in which the columns were filled above.
chz_df3.columns = [
    'Made from',
    'Country of origin',
    'Region',
    'Family',
    'Type',
    'Fat content (in dry matter)',
    'Fat content',
    'Calcium content',
    'Texture',
    'Rind',
    'Colour',
    'Flavour',
    'Aroma',
    'Vegetarian',
    'Producers',
    'Synonyms',
    'Alternative spellings',
]




In [55]:
# Confirm that the descriptive column names are now in place.
chz_df3

Unnamed: 0,Made from,Country of origin,Region,Family,Type,Fat content (in dry matter),Fat content,Calcium content,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Alternative spellings
0,unpasteurized cow's and sheep's milk,France,Pays Basque,,"semi-hard, artisan",,,,"creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,
1,cow's milk,France,,,semi-hard,,40-46%,,elastic,washed,ivory,,aromatic,no,,,
2,unpasteurized cow's milk,France,Burgundy,,"semi-soft, artisan, brined",,,,"creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,
3,pasteurized cow's milk,France,province of Brittany,,semi-hard,,,,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,
4,pasteurized cow's milk,France,Nord-Pas-de-Calais,,"semi-soft, artisan, brined",,50%,,smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,
5,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,Cheddar,semi-hard,,,,"creamy, crumbly, dense and semi firm",natural,pale yellow,"mild, sweet, tangy",aromatic,yes,Wensleydale Creamery,"English Cheddar with Caramelized Onions, Caram...",
6,sheep's milk,Czech Republic,Karlovy Vary,,"hard, artisan",,45%,,firm,natural,pale yellow,"acidic, strong, tangy",,no,,,
7,unpasteurized cow's milk,France,"Haute-Savoie, Abondance",Tomme,"semi-hard, artisan",,48%,,"creamy, open and supple",natural,pale yellow,"acidic, buttery, fruity, sweet",nutty,no,,,Tomme d'Abondance
8,goat's milk,United States,California,,"soft, soft-ripened",,,,,,,buttery,"fresh, herbal",no,Andante Dairy,,
9,Buffalo's and cow's milk,Italy,Campania,,semi-hard,,,,firm,natural,pale yellow,sweet,"aromatic, fresh",no,Casa Madaio,,


In [56]:
# Spot-check one cleaned value: the 'Made from' entry of the first row.
chz_df3['Made from'].iloc[0]

"unpasteurized cow's and sheep's milk"

In [57]:
# Number of entries in all_orig_names; it has to equal the row count of
# chz_df3 (next cell) for the names to be attached as a column.
len(all_orig_names)

1828

In [58]:
# Row count of the attribute table, for comparison with the number of names.
chz_df3.shape[0]

1828

In [59]:
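# Checkpoint: save the attribute table (cheese names not yet attached) to CSV.
# Left commented out; uncomment the next line to write the file.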
#chz_df3.to_csv('./data/nameless_clean_big_cheese.csv', index=False)

In [60]:
# Attach the cheese names as a new 'Name' column (appended as the last column).
# NOTE(review): this relies on all_orig_names being in the same order as the
# rows of chz_df3; only the lengths are compared above, so confirm the ordering.
chz_df3['Name'] = all_orig_names

In [61]:
# Check that the new 'Name' column was appended at the end of the frame.
chz_df3

Unnamed: 0,Made from,Country of origin,Region,Family,Type,Fat content (in dry matter),Fat content,Calcium content,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Alternative spellings,Name
0,unpasteurized cow's and sheep's milk,France,Pays Basque,,"semi-hard, artisan",,,,"creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,,Abbaye de Belloc
1,cow's milk,France,,,semi-hard,,40-46%,,elastic,washed,ivory,,aromatic,no,,,,Abbaye de Belval
2,unpasteurized cow's milk,France,Burgundy,,"semi-soft, artisan, brined",,,,"creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,,Abbaye de Citeaux
3,pasteurized cow's milk,France,province of Brittany,,semi-hard,,,,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,,Abbaye de Timadeuc
4,pasteurized cow's milk,France,Nord-Pas-de-Calais,,"semi-soft, artisan, brined",,50%,,smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,,Abbaye du Mont des Cats
5,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,Cheddar,semi-hard,,,,"creamy, crumbly, dense and semi firm",natural,pale yellow,"mild, sweet, tangy",aromatic,yes,Wensleydale Creamery,"English Cheddar with Caramelized Onions, Caram...",,Abbot’s Gold
6,sheep's milk,Czech Republic,Karlovy Vary,,"hard, artisan",,45%,,firm,natural,pale yellow,"acidic, strong, tangy",,no,,,,Abertam
7,unpasteurized cow's milk,France,"Haute-Savoie, Abondance",Tomme,"semi-hard, artisan",,48%,,"creamy, open and supple",natural,pale yellow,"acidic, buttery, fruity, sweet",nutty,no,,,Tomme d'Abondance,Abondance
8,goat's milk,United States,California,,"soft, soft-ripened",,,,,,,buttery,"fresh, herbal",no,Andante Dairy,,,Acapella
9,Buffalo's and cow's milk,Italy,Campania,,semi-hard,,,,firm,natural,pale yellow,sweet,"aromatic, fresh",no,Casa Madaio,,,Accasciato


In [62]:
# Move 'Name' to the front; every other column keeps its relative order.
chz_df3 = chz_df3.loc[:, [
    'Name',
    'Made from',
    'Country of origin',
    'Region',
    'Family',
    'Type',
    'Fat content (in dry matter)',
    'Fat content',
    'Calcium content',
    'Texture',
    'Rind',
    'Colour',
    'Flavour',
    'Aroma',
    'Vegetarian',
    'Producers',
    'Synonyms',
    'Alternative spellings',
]]

In [64]:
# Verify the new column order, with 'Name' leading.
chz_df3

Unnamed: 0,Name,Made from,Country of origin,Region,Family,Type,Fat content (in dry matter),Fat content,Calcium content,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Alternative spellings
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,,"semi-hard, artisan",,,,"creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,
1,Abbaye de Belval,cow's milk,France,,,semi-hard,,40-46%,,elastic,washed,ivory,,aromatic,no,,,
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,,"semi-soft, artisan, brined",,,,"creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,,semi-hard,,,,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,,"semi-soft, artisan, brined",,50%,,smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,
5,Abbot’s Gold,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,Cheddar,semi-hard,,,,"creamy, crumbly, dense and semi firm",natural,pale yellow,"mild, sweet, tangy",aromatic,yes,Wensleydale Creamery,"English Cheddar with Caramelized Onions, Caram...",
6,Abertam,sheep's milk,Czech Republic,Karlovy Vary,,"hard, artisan",,45%,,firm,natural,pale yellow,"acidic, strong, tangy",,no,,,
7,Abondance,unpasteurized cow's milk,France,"Haute-Savoie, Abondance",Tomme,"semi-hard, artisan",,48%,,"creamy, open and supple",natural,pale yellow,"acidic, buttery, fruity, sweet",nutty,no,,,Tomme d'Abondance
8,Acapella,goat's milk,United States,California,,"soft, soft-ripened",,,,,,,buttery,"fresh, herbal",no,Andante Dairy,,
9,Accasciato,Buffalo's and cow's milk,Italy,Campania,,semi-hard,,,,firm,natural,pale yellow,sweet,"aromatic, fresh",no,Casa Madaio,,


In [None]:
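# Checkpoint: save the cleaned table, including the 'Name' column, to CSV.
# Left commented out; uncomment the next line to (re)create the file that is
# read back in below.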
#chz_df3.to_csv('./data/clean_big_cheese_w_names.csv', index=False)

In [14]:
# Reload the cleaned cheese table (with names) from its CSV checkpoint into chz_df3.
# The file is only written by the commented-out to_csv call above, so it must
# already exist on disk for this cell to run.
chz_df3 = pd.read_csv('./data/clean_big_cheese_w_names.csv')

In [15]:
# Inspect the frame as loaded from the CSV checkpoint.
chz_df3

Unnamed: 0,Name,Made from,Country of origin,Region,Family,Type,Fat content (in dry matter),Fat content,Calcium content,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Alternative spellings
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,,"semi-hard, artisan",,,,"creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,
1,Abbaye de Belval,cow's milk,France,,,semi-hard,,40-46%,,elastic,washed,ivory,,aromatic,no,,,
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,,"semi-soft, artisan, brined",,,,"creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,,semi-hard,,,,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,,"semi-soft, artisan, brined",,50%,,smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,
5,Abbot’s Gold,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,Cheddar,semi-hard,,,,"creamy, crumbly, dense and semi firm",natural,pale yellow,"mild, sweet, tangy",aromatic,yes,Wensleydale Creamery,"English Cheddar with Caramelized Onions, Caram...",
6,Abertam,sheep's milk,Czech Republic,Karlovy Vary,,"hard, artisan",,45%,,firm,natural,pale yellow,"acidic, strong, tangy",,no,,,
7,Abondance,unpasteurized cow's milk,France,"Haute-Savoie, Abondance",Tomme,"semi-hard, artisan",,48%,,"creamy, open and supple",natural,pale yellow,"acidic, buttery, fruity, sweet",nutty,no,,,Tomme d'Abondance
8,Acapella,goat's milk,United States,California,,"soft, soft-ripened",,,,,,,buttery,"fresh, herbal",no,Andante Dairy,,
9,Accasciato,Buffalo's and cow's milk,Italy,Campania,,semi-hard,,,,firm,natural,pale yellow,sweet,"aromatic, fresh",no,Casa Madaio,,
