# The Digital Cheese Sommelier

## EDA and Cleaning

In [1]:
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import unicodedata

import warnings
warnings.filterwarnings('ignore')


### Importing and checking the data

In [2]:
# Read ./data/big_cheese2.csv into the 'cheese_com' DataFrame, which the rest of
# the notebook cleans and encodes.  The path is relative to the notebook's working directory.
cheese_com = pd.read_csv('./data/big_cheese2.csv')

In [3]:
# preview the first five rows to confirm the file loaded with the expected columns
cheese_com.head()

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Description,Fat content,Family,Alternative spellings,Calcium content,Fat content (in dry matter)
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,"semi-hard, artisan","creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,Abbaye de Belloc is also known as 'Abbaye Notr...,,,,,
1,Abbaye de Belval,cow's milk,France,,semi-hard,elastic,washed,ivory,,aromatic,no,,,This cheese is also known as Le Trappiste de B...,40-46%,,,,
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,"semi-soft, artisan, brined","creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,The Abbaye de Citeaux cheese comes from the Ci...,,,,,
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,semi-hard,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,"Being direct descendant of the Port du Salut, ...",,,,,
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,"semi-soft, artisan, brined",smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,The Abbaye du Mont des Cats cheese is made by ...,50%,,,,


In [4]:
# adapted from Adi Bronshtein's function for conducting exploratory data analysis;
# a blank print statement follows each check and a \n follows each label for easier readability
def eda(dataframe):
    """Print a quick exploratory summary of ``dataframe``.

    Printed, in order: missing-value counts per column, the index, the column
    dtypes, the shape, the ``describe()`` table, and finally each column name
    followed by its number of unique values.  Nothing is returned.
    """
    # each check is wrapped in a lambda so it is only evaluated right before it is printed
    checks = [
        ("missing values", lambda: dataframe.isnull().sum()),
        ("dataframe index", lambda: dataframe.index),
        ("dataframe types", lambda: dataframe.dtypes),
        ("dataframe shape", lambda: dataframe.shape),
        ("dataframe describe", lambda: dataframe.describe()),
    ]
    for label, compute in checks:
        print("{}: \n{}".format(label, compute()))
        print()

    print("number of unique values in each column: ")
    for column_name in dataframe.columns:
        print(column_name)
        print(dataframe[column_name].nunique())

In [5]:
# run the EDA summary on the raw data: missing values, index, dtypes, shape,
# describe() and the number of unique values per column
eda(cheese_com)

missing values: 
Name                              0
Milk                             55
Country of origin                17
Region                          238
Type                             24
Texture                         105
Rind                            238
Colour                          175
Flavour                          94
Aroma                           354
Vegetarian                      526
Producers                       402
Synonyms                       1452
Description                       0
Fat content                    1235
Family                         1037
Alternative spellings          1620
Calcium content                1779
Fat content (in dry matter)    1817
dtype: int64

dataframe index: 
RangeIndex(start=0, stop=1827, step=1)

dataframe types: 
Name                           object
Milk                           object
Country of origin              object
Region                         object
Type                           object
Texture            

## Cleaning and Formatting

In [6]:
# there are blank spaces at the end of every name value.  here is the code to strip them.
# At this point every feature is a string, but the helper below lets non-text columns
# pass through untouched so the cell can be re-run safely after later cells have
# converted a column (e.g. 'Fat content') to numbers.
def _strip_text(column):
    """Return ``column`` with leading/trailing whitespace removed from its strings.

    Columns that do not hold strings are returned unchanged.
    """
    try:
        return column.str.strip()
    except AttributeError:
        # raised by the .str accessor when the column does not contain string values
        return column


cheese_com[cheese_com.columns] = cheese_com.apply(_strip_text)

### Fat Content  

Notice that every row that has an entry for 'Fat content (in dry matter)' also has an entry for "Fat content".  Also notice that the 'in dry matter' value is almost twice the plain "Fat content" value.  This is because "Fat content" actually represents "butterfat content", which is what differentiates double cream, triple cream, and what I will call "normal fat" cheeses.
 
(reference:  "Cheese Primer" by Steven Jenkins p. 16)

In [7]:
# Show the rows that report 'Fat content (in dry matter)'.  Every one of them also has a
# "Fat content" entry, and the dry-matter value is almost twice the plain fat content.
# This is because "Fat Content" actually represents "butterfat content", which
# differentiates between double cream, triple cream, and what i will call "normal fat" cheeses
# reference:  "Cheese Primer" by Steven Jenkins p. 16
has_dry_matter_fat = cheese_com['Fat content (in dry matter)'].notnull()
cheese_com[has_dry_matter_fat]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Description,Fat content,Family,Alternative spellings,Calcium content,Fat content (in dry matter)
26,Aged Gouda,"pasteurized or unpasteurized cow's, goat's or ...",Netherlands,"South Holland, Gouda","hard, artisan, brined, processed","brittle, crumbly, crystalline and flaky",waxed,yellow,"burnt caramel, full-flavored","rich, ripe",no,,"Jong belegen Gouda, Belegen Gouda, Extra beleg...","Gouda, or ""How-da"" as the locals say, is a Dut...",31 g/100g,Gouda,,958 mg/100g,76%
147,Beemster 2% Milk,pasteurized cow's milk,"Canada, Denmark, France, Germany, Holland and ...",,semi-soft,smooth,,,nutty,"aromatic, floral, fruity",no,"Beemster Cheese, CONO Kaasmakers / Beemster",,Beemster 2% Milk is made using 2% milk and has...,5 g/100g,,,,8%
469,Cottage Cheese,pasteurized or unpasteurized cow's milk,United Kingdom and United States,,"soft, artisan, processed",creamy and crumbly,rindless,white,sweet,,,,,Cottage cheese has a mild flavour and is a che...,4.30 g/100g,Cottage,,68 mg/100g,4%
485,Cream Havarti,pasteurized cow's milk,Denmark,,"semi-soft, processed",smooth,rindless,pale yellow,"buttery, creamy, sweet",sweet,yes,,"Flødeis Havarti, Smoked Havarti, Havarathi","Denmark's most famous cheese, Cream Havarti is...",11.0 g/100g,Havarti,Flodeis Havarti,150 mg/100g,45%
622,Feta,pasteurized or unpasteurized goat's and sheep'...,Greece,"Macedonia, Thrace, Thessaly, Central Mainland ...","soft, brined","creamy, crumbly, grainy and open",,white,"full-flavored, salty, tangy","nutty, strong",no,CHRISTAKIS Greek Cheeses,,Feta is undoubtedly one of the most famous Gre...,21 g/100g,Feta,,493 mg/100g,16%
729,Gouda,"pasteurized or unpasteurized cow's, goat's or ...",Netherlands,"South Holland, Gouda","semi-hard, artisan, brined, processed","compact, crumbly, dense and springy",waxed,yellow,"creamy, full-flavored, nutty, sweet",pungent,no,"FrieslandCampina, Uniekaas Nederland B.V.","Boerenkass Gouda, Graskaas Gouda, Jong Gouda","Gouda, or ""How-da"" as the locals say, is a Dut...",31 g/100g,Gouda,Goudam,958 mg/100g,76%
964,Limburger,pasteurized cow's milk,"Belgium, Germany and Netherlands",Duchy of Limburg,"semi-soft, smear-ripened","creamy, crumbly, firm and smooth",washed,straw,"grassy, mild, mushroomy",stinky,no,,,"Limburger is a semi-soft, washed rind cheese t...",27 g/100g,,Limburger Kase,497 mg/100g,42%
1106,Montsalvat,pasteurized cow's milk,Germany,Landshut,"soft, blue-veined",creamy and soft,,ivory,"creamy, mild",,,Bayerische Milchindustrie eG,,Paladin Montsalvat is a German blue cheese mad...,35 g/100g,Blue,,,60%
1308,Petida,cow's milk,Germany,,"soft, brined",creamy,artificial,white,"mild, milky","clean, fresh",yes,Bergader Privatkäserei GmbH,,Petida is a Mediterranean-style soft cheese fr...,25.6 g/100g,,,190 mg/100g,55%
1425,ReginaBlu,pasteurized cow's milk,Germany,Landshut,"soft, blue-veined",creamy,,ivory,"creamy, mild",aromatic,,Bayerische Milchindustrie eG,,Paladin ReginaBlu is a German blue cheese famo...,36 g/100g,Blue,,,65%


In [8]:
# drop columns that provide no value to this project
unused_columns = [
    'Fat content (in dry matter)',
    'Synonyms',
    'Alternative spellings',
    'Colour',
    'Calcium content',
]
cheese_com.drop(columns=unused_columns, inplace=True)

In [9]:
# number of rows (cheeses) in the dataframe after dropping the unused columns
len(cheese_com)

1827

In [10]:
# list the distinct raw 'Fat content' values.
# note that these are all strings: some are single percentages, some are ranges ("40-46%"),
# and some are marked as a number of grams out of 100 grams ("31 g/100g")
cheese_com['Fat content'].unique()

array([nan, '40-46%', '50%', '45%', '48%', '52%', '12%', '14%', '55%',
       '59%', '31 g/100g', '34%', '33.5  g/100g', '40-50%', '45-60%',
       '30%', '18-20%', '46%', '30-40%', '26%', '8%', '20%',
       '54.23 g/100g', '25%', '34-48%', '40-45%', '10%', '32%', '43%',
       '69%', '28.5%', '22%', '28%', '25.22 g/100g', '20-30%', '11%',
       '25.5%', '43.3-26.5 g/100g', '62%', '5 g/100g', '45-50%', '70%',
       '15-25%', '39.6 g/100g', '32.5 g/100g', '40%', '37%', '54%',
       '6 g/100g', '31%', '43.3 g/100g', '21%', '75%', '60%', '35%',
       '7 g/100g', '8.4 g/100g', '27 g/100g', '46-60%', '8 g/100g', '67%',
       '13%', '34.2 g/100g', '27%', '9 g/100g', '24%', '33%',
       '4.30 g/100g', '65%', '11.0 g/100g', '51%', '31.0 g/100g',
       '25-30%', '15-45%', '22-28 g/100g', '34.5 g/100g', '17%', '22 %',
       '15%', '30.5%', '26-28%', '21 g/100g', '72%', '17.1 g/100g',
       '14.6 g/100g', '22 g/100g', '10.1 g/100g', '16%', '1.0%',
       '11 g/100g', '25-35%', '30-45%',

In [11]:
# change the fat content from strings to a single float for each entry

# to avoid errors, fill null values of 'Fat content' with the impossible numerical value of '99999'
# (as a string, because that is the format of 'Fat content')
# NOTE: the 99999 sentinel stays in the column after conversion, so exclude it from any statistics.
cheese_com['Fat content'] = cheese_com['Fat content'].fillna('99999')


def _parse_fat_content(raw):
    """Convert one raw 'Fat content' entry into a single float.

    The '%' sign and the 'g/100g' unit are removed (both are read as a percentage).
    When the entry is a range ("number1-number2") the average of the two numbers
    is returned; otherwise the single number is returned.

    Values that are not strings are assumed to be already converted and are
    returned as floats, so re-running the cell does not fail.

    Raises ValueError if a remaining piece of text is not a number.
    """
    if not isinstance(raw, str):
        return float(raw)

    # same clean-up order as before: '31 g/100g' -> '31 /100' -> '31 '
    cleaned = raw.replace('%', '').replace('g', '').replace('/100', '')

    # float() tolerates the surrounding blanks and, unlike eval(), cannot execute
    # arbitrary code that might be embedded in the CSV
    bounds = [float(part) for part in cleaned.split('-')]
    if len(bounds) == 1:
        return bounds[0]
    # a range: take the average of its two ends
    return sum(bounds) / 2


# replace the content in the feature 'Fat content' with the parsed numbers
# (one pass over the column; the previous nested loops re-parsed the whole list for every row)
cheese_com['Fat content'] = cheese_com['Fat content'].map(_parse_fat_content)

In [12]:
# check to see that fat content was successfully converted to floats:
# a numeric comparison only matches once the column holds numbers, so this lists the 75% cheeses
cheese_com[cheese_com['Fat content'] == 75]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,Producers,Description,Fat content,Family
232,Boursault,pasteurized cow's milk,France,Val-de-Marne,"soft, soft-ripened","creamy, smooth and spreadable",bloomy,"buttery, citrusy, nutty, salty",earthy,no,,Originating in the French region of Val-de-Mar...,75.0,Brie
255,Brillat-Savarin,pasteurized cow's milk,France,Ile de France,"semi-soft, artisan",creamy and dense,mold ripened,"buttery, nutty, sour",milky,no,"Fromagerie Lincet, Fromagerie Rouzaire",Brillat-Savarin is a triple cream dessert chee...,75.0,
486,Creamy Lancashire,pasteurized or unpasteurized cow's milk,England,Lancanshire,"fresh soft, artisan","creamy, fluffy and smooth",natural,"buttery, creamy, smooth",rich,no,"Butlers Farmhouse Cheeses, Dewlay Products Ltd...",Lancanshire cheese has been made in Lancanshir...,75.0,
538,Delice de Bourgogne,pasteurized cow's milk,France,Burgundy,"soft, soft-ripened",creamy and smooth,bloomy,"buttery, mushroomy, smooth, tangy","mushroom, pungent, strong",no,Fromagerie Lincet,Delice de Bourgogne is a French classic triple...,75.0,Brie
613,Explorateur,cow's milk,France,Île-de-France,soft,smooth and soft-ripened,bloomy,mild,,no,Fromagerie du Petit Morin,L'Explorateur is a soft-ripened French cow's m...,75.0,
631,Finn,unpasteurized cow's milk,England and United Kingdom,Herefordshire,"soft, artisan",creamy and firm,bloomy,"buttery, nutty, salty, sweet","earthy, fresh",no,Neal's Yard Dairy,"Finn is an unpasteurised, soft-white cheese pr...",75.0,
744,Grand Vatel,pasteurized cow's milk,France,Val-de-Marne,"soft, soft-ripened","creamy, smooth and spreadable",bloomy,"buttery, citrusy, nutty, salty",earthy,no,,Originating in the French region of Val-de-Mar...,75.0,Brie
1480,Saint-André,pasteurized or unpasteurized cow's milk,France,"Coutances, Normandy","soft, soft-ripened",creamy and dense,bloomy,"buttery, salty, sour, tangy","mild, rich",yes,,Saint Andre is a triple crème cow's milk chees...,75.0,Brie
1757,Vignotte,pasteurized cow's milk,France,Normandy,soft,"creamy, smooth and spreadable",bloomy,"buttery, lemony",strong,yes,,"Vignotte is a high fat, triple cream French ch...",75.0,Brie


In [13]:
# now that fat content is in numerical form, let's categorize it by the range of fat percentage,
# by way of categorical dummy columns (1 = the cheese falls in that range, 0 = it does not)

fat = cheese_com['Fat content']

# One boolean mask per dummy column.  Apart from the first and last bucket the ranges are
# half-open (lower bound included, next bucket's lower bound excluded) so that fractional
# values such as 39.6 or 3.5 are not lost in the gaps the old inclusive integer bounds
# (20-39, 40-59, ...) left between buckets.  Every value that was categorised before keeps
# its bucket; values in a former gap go to the lower bucket.
# Rows without a fat content (99999 sentinel or NaN) match no range and stay 0 everywhere.
fat_range_masks = {
    'Fat_less than 0.15%': fat <= 0.15,
    'Fat_0.16-3%': (fat > 0.15) & (fat < 4),
    'Fat_4-19%': (fat >= 4) & (fat < 20),
    'Fat_20-39%': (fat >= 20) & (fat < 40),
    'Fat_40-59%': (fat >= 40) & (fat < 60),
    'Fat_60-74%': (fat >= 60) & (fat < 75),
    'Fat_75%+': (fat >= 75) & (fat <= 100),
}

# Assign each dummy as a whole column.  The previous row-by-row chained assignment
# (cheese_com[col][i] = 1) is not guaranteed to write back to the dataframe and does
# nothing when pandas Copy-on-Write is enabled.
for dummy_column, in_range in fat_range_masks.items():
    cheese_com[dummy_column] = in_range.astype('int64')
        
        

In [14]:
# check to see that the fat content values were dummied appropriately:
# list the cheeses flagged in the 'Fat_0.16-3%' bucket and compare with their 'Fat content'
cheese_com[cheese_com['Fat_0.16-3%'] == 1]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Description,Fat content,Family,Fat_less than 0.15%,Fat_0.16-3%,Fat_4-19%,Fat_20-39%,Fat_40-59%,Fat_60-74%,Fat_75%+
697,Gammelost,cow's milk,Norway,Hardanger and Sogn,"hard, blue-veined","dense, firm and grainy",mold ripened,sharp,aromatic,no,...,"Gammelost, also spelt as Gamalost or Gammalost...",1.0,Blue,0,1,0,0,0,0,0
1184,Olomoucké Tvarůžky,pasteurized cow's milk,Czech Republic,Ostrava,"soft, soft-ripened",crumbly and soft,,"pungent, spicy",strong,,...,"Olomoucké tvarůžky is a ripped soft, yellowish...",0.6,,0,1,0,0,0,0,0
1500,Sap Sago,,,Canton of Glarus,,,natural,,,yes,...,"Schabziger, sold under the name sapsago in the...",0.8,,0,1,0,0,0,0,0


### Milk Type and Treatment

In [15]:
# function to strip strings of accents, from BartoszKP and oefe on StackOverflow
# https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string

import unicodedata
def strip_accents(s):
    """Return ``s`` with combining accent marks removed (e.g. 'crème' -> 'creme').

    The string is decomposed with Unicode NFD normalisation and every character in
    the 'Mn' (nonspacing mark) category is dropped.  Characters that do not decompose
    into a base letter plus a mark are left as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)


In [16]:
# There are some null values in the "Milk" column, so for those rows search the
# "Description" column for any mentions of milk type or treatment.

# instantiate each new dummy feature as an int at 0
milk_dummy_columns = [
    'Milk_vegan',
    'Milk_cow',
    'Milk_goat',
    'Milk_sheep',
    'Milk_buffalo',
    'Milk_camel',
    'Milk_donkey',
    'Milk_mare',
    'Milk_moose',
    'Milk_reindeer',
    'Milk_water buffalo',
    'Milk_yak',
    'Treatment_pasteurized',
    'Treatment_unpasteurized',
]
for dummy_column in milk_dummy_columns:
    cheese_com[dummy_column] = 0

# (regular expression searched for in the description, dummy column marked 1 on a match)
# NOTE(review): these are substring matches, so a keyword also matches inside a longer
# word -- spot-check the flagged rows.
description_rules = [
    (r' vegan', 'Milk_vegan'),
    (r'non dairy', 'Milk_vegan'),
    (r'dairy free', 'Milk_vegan'),
    (r'cow', 'Milk_cow'),
    (r'goat', 'Milk_goat'),
    (r'sheep', 'Milk_sheep'),
    # only 'buffalo' that is not part of 'water buffalo'; the lookbehind replaces the old
    # '"water" not in description' test, which dropped the match whenever the word
    # "water" appeared anywhere in the text
    (r'(?<!water )buffalo', 'Milk_buffalo'),
    (r'camel', 'Milk_camel'),
    (r'donkey', 'Milk_donkey'),
    (r'mare', 'Milk_mare'),
    (r'moose', 'Milk_moose'),
    (r'reindeer', 'Milk_reindeer'),
    (r'water buffalo', 'Milk_water buffalo'),
    (r'yak', 'Milk_yak'),
    (r'unpasteurized', 'Treatment_unpasteurized'),
    # only 'pasteurized' that is not part of 'unpasteurized'.  This also covers
    # "pasteurized or unpasteurized" and any other text that mentions both treatments.
    (r'(?<!un)pasteurized', 'Treatment_pasteurized'),
]

# only cheeses whose 'Milk' feature is null are filled in from the description
milk_is_missing = cheese_com['Milk'].isnull()

# normalise every description once: lower case, hyphens to spaces, accents stripped
description_text = cheese_com['Description'].map(
    lambda text: strip_accents(text.lower().replace('-', ' '))
)

for pattern, dummy_column in description_rules:
    mentioned = description_text.str.contains(pattern, regex=True, na=False)
    # .loc writes straight into the dataframe; the previous chained assignment
    # (cheese_com[col][i] = 1) does nothing when pandas Copy-on-Write is enabled
    cheese_com.loc[milk_is_missing & mentioned, dummy_column] = 1

In [17]:
# test to see that the values were dummied properly:
# list the cheeses whose description flagged them as vegan / non-dairy
cheese_com[cheese_com['Milk_vegan'] == 1]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_buffalo,Milk_camel,Milk_donkey,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized
19,Aged Cashew & Blue Green Algae Cheese,,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, mellow, oceanic, tangy",rich,yes,...,0,0,0,0,0,0,0,0,0,0
20,Aged Cashew & Brazil Nut Cheese,,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, nutty, sweet","nutty, rich",yes,...,0,0,0,0,0,0,0,0,0,0
22,Aged Cashew & Hemp Seed Cheese,,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, mild, nutty, spicy","nutty, rich",yes,...,0,1,0,0,0,0,0,0,0,0
23,Aged Cashew Nut Cheese,,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, nutty, subtle","clean, fresh, nutty",yes,...,0,0,0,0,0,0,0,0,0,0
24,Aged Cashew Nut & Kale Cheese,,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, oceanic, tangy",rich,yes,...,0,0,0,0,0,0,0,0,0,0
195,Blissful Blocks,,Canada and United States,,hard,creamy and crumbly,plastic,"creamy, savory, sharp, spicy",,yes,...,0,0,0,0,0,0,0,0,0,0
196,Blissful Toppings,,Canada and United States,,soft,crumbly,artificial,"savory, sharp",,yes,...,0,0,0,0,0,0,0,0,0,0
370,Cashew Nut Cream Cheese,,United States,Brooklyn NY,"soft, artisan","creamy, smooth, soft and spreadable",,"creamy, nutty, sweet","clean, fresh, nutty",yes,...,0,0,0,0,0,0,0,0,0,0
484,Cream Cheesy Bliss,,Canada and United States,,soft,creamy and spreadable,artificial,"creamy, garlicky, herbaceous, sweet",rich,yes,...,0,0,0,0,0,0,0,0,0,0
699,Garlic and Fine Herbs Cashew Cheese,,Canada,Ontario,"soft, artisan",creamy,,"creamy, full-flavored, herbaceous, nutty","herbal, nutty",yes,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# observe kinds of milk recorded: each value combines an optional treatment
# (pasteurized / unpasteurized) with one or more animals joined by ',', 'and' or 'or'
cheese_com['Milk'].unique()

array(["unpasteurized cow's and sheep's milk", "cow's milk",
       "unpasteurized cow's milk", "pasteurized cow's milk",
       "sheep's milk", "goat's milk", "Buffalo's and cow's milk",
       "pasteurized cow's, goat's or sheep's milk",
       "unpasteurized sheep's milk",
       "pasteurized or unpasteurized cow's milk", nan,
       "pasteurized goat's milk",
       "pasteurized or unpasteurized cow's, goat's or sheep's milk",
       "mare's milk", "pasteurized goat's and sheep's milk",
       "goat's and sheep's milk",
       "pasteurized cow's or water buffalo's milk",
       "pasteurized goat's or sheep's milk",
       "pasteurized cow's or goat's milk", "unpasteurized goat's milk",
       "unpasteurized goat's and sheep's milk",
       "pasteurized or unpasteurized cow's, goat's and sheep's milk",
       "unpasteurized cow's, goat's or sheep's milk",
       "unpasteurized cow's and goat's milk", "cow's and sheep's milk",
       "pasteurized sheep's milk",
       "unpasteurized 

In [19]:
# to avoid errors, fill null values of 'Milk' with the string value of 'unknown'
cheese_com['Milk'] = cheese_com['Milk'].fillna('unknown')

# set a dummy variable when the cheese is made from a blend of animal milk
cheese_com['Milk_blend yes'] = 0

# (regular expression searched for in the 'Milk' value, dummy column marked 1 on a match)
milk_rules = [
    # the animal from which the milk came
    (r'cow', 'Milk_cow'),
    (r'goat', 'Milk_goat'),
    (r'sheep', 'Milk_sheep'),
    # 'buffalo' that is not part of 'water buffalo'
    (r'(?<!water )buffalo', 'Milk_buffalo'),
    (r'camel', 'Milk_camel'),
    (r'donkey', 'Milk_donkey'),
    (r'mare', 'Milk_mare'),
    (r'moose', 'Milk_moose'),
    (r'reindeer', 'Milk_reindeer'),
    (r'water buffalo', 'Milk_water buffalo'),
    (r'yak', 'Milk_yak'),
    # the milk came from multiple animals (there is an 'and' in the 'Milk' column)
    (r'and', 'Milk_blend yes'),
    # the milk treatment ('pasteurized' or 'unpasteurized')
    (r'unpasteurized', 'Treatment_unpasteurized'),
    # 'pasteurized' that is not part of 'unpasteurized'; this also marks
    # "pasteurized or unpasteurized" values as pasteurized
    (r'(?<!un)pasteurized', 'Treatment_pasteurized'),
]

# normalise every 'Milk' value once: lower case, hyphens to spaces, accents stripped
milk_text = cheese_com['Milk'].map(
    lambda text: strip_accents(text.lower().replace('-', ' '))
)

# Mark the matching dummy columns with a 1.  The Milk_* / Treatment_* columns were created
# by the previous cell; rows already marked from the description keep their 1s.
for pattern, dummy_column in milk_rules:
    mentioned = milk_text.str.contains(pattern, regex=True, na=False)
    # .loc writes straight into the dataframe; the previous chained assignment
    # (cheese_com[col][i] = 1) does nothing when pandas Copy-on-Write is enabled
    cheese_com.loc[mentioned, dummy_column] = 1
        


### 'Triple Cream' and 'Double Cream' Cheeses

Two of the more distinctive types of cheese are 'Triple Cream' and 'Double Cream', both distinguished by the amount of butterfat added to the milk.  Double Cream means that the cheese has 60-74% butterfat; Triple Cream means that 75% or more butterfat has been added to the milk.

In [20]:
# instantiate a dummy for Triple Creams

# normalise every description: lower case, hyphens to spaces, accents stripped
# (so "Triple-Crème" becomes "triple creme")
description_text = cheese_com['Description'].map(
    lambda text: strip_accents(text.lower().replace('-', ' '))
)

# check the description for every cheese.  if 'Triple Cream' is mentioned, mark the dummy column 1
mentions_triple_cream = (
    description_text.str.contains('triple cream', regex=False)
    | description_text.str.contains('triple creme', regex=False)
)

# assign the whole column at once; the previous chained assignment
# (cheese_com['Triple Cream'][i] = 1) does nothing when pandas Copy-on-Write is enabled
cheese_com['Triple Cream'] = mentions_triple_cream.astype('int64')
    

In [21]:
# check the dummy: list the cheeses whose description mentions triple cream
cheese_com[cheese_com['Triple Cream'] == 1]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_donkey,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream
157,Belle Creme,pasteurized cow's milk,Canada,Québec,"soft, soft-ripened","buttery, creamy, smooth and soft-ripened",bloomy,"acidic, buttery, creamy, salty","mushroom, nutty",,...,0,0,0,0,0,0,1,0,0,1
158,Belletoile,pasteurized cow's milk,France,,"soft, soft-ripened",creamy,,"garlicky, mild, mushroomy, nutty, tangy",mild,no,...,0,0,0,0,0,0,1,0,0,1
232,Boursault,pasteurized cow's milk,France,Val-de-Marne,"soft, soft-ripened","creamy, smooth and spreadable",bloomy,"buttery, citrusy, nutty, salty",earthy,no,...,0,0,0,0,0,0,1,0,0,1
248,Brie Coco,pasteurized cow's milk,Canada,Québec,"soft, soft-ripened","buttery, creamy, smooth and soft-ripened",bloomy,"acidic, buttery, creamy, salty","mushroom, nutty",,...,0,0,0,0,0,0,1,0,0,1
255,Brillat-Savarin,pasteurized cow's milk,France,Ile de France,"semi-soft, artisan",creamy and dense,mold ripened,"buttery, nutty, sour",milky,no,...,0,0,0,0,0,0,1,0,0,1
321,Cambazola,pasteurized cow's milk,Germany,Allgäu,"soft, artisan, soft-ripened",creamy and smooth,bloomy,"nutty, savory, sharp, sweet",strong,yes,...,0,0,0,0,0,0,1,0,0,1
352,Capriny,pasteurized goat's milk,Canada,Quebec,soft,creamy,rindless,"creamy, mild, sharp, sour",goaty,,...,0,0,0,0,0,0,1,0,0,1
392,Champignon Mushrooom,pasteurized cow's milk,Germany,Allgäu,"soft, soft-ripened",creamy,natural,"creamy, mushroomy","fresh, mild",yes,...,0,0,0,0,0,0,1,0,0,1
483,Cream Cheese,pasteurized or unpasteurized cow's milk,United States,"Chester, New York","fresh soft, processed",creamy and spreadable,rindless,"creamy, mild, sweet","fresh, pleasant",yes,...,0,0,0,0,0,0,1,1,0,1
538,Delice de Bourgogne,pasteurized cow's milk,France,Burgundy,"soft, soft-ripened",creamy and smooth,bloomy,"buttery, mushroomy, smooth, tangy","mushroom, pungent, strong",no,...,0,0,0,0,0,0,1,0,0,1


In [22]:
# instantiate a dummy for Double Creams
cheese_com['Double Cream'] = 0

# check the description for every cheese.  if 'double cream' (or 'double creme') is mentioned,
# mark the dummy column 1
for i in range(len(cheese_com)):
    # normalise the description once per row: lower-case, hyphens -> spaces, accents stripped
    description = strip_accents(cheese_com['Description'][i].lower().replace('-', ' '))

    if "double cream" in description or "double creme" in description:
        # write through .loc so the assignment lands on cheese_com itself; chained indexing
        # (cheese_com[col][i] = ...) may write to a temporary copy instead
        cheese_com.loc[i, 'Double Cream'] = 1
    

In [23]:
# list every cheese whose 'Double Cream' dummy was set to 1
cheese_com.loc[cheese_com['Double Cream'] == 1]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream,Double Cream
249,Brie d'Alexis,pasteurized cow's milk,Canada,Quebec,"soft, soft-ripened","buttery, creamy and supple",bloomy,"creamy, nutty",nutty,,...,0,0,0,0,0,1,0,0,0,1
252,Brie de Portneuf Double Cream,pasteurized cow's milk,Canada,Quebec,"soft, soft-ripened","buttery, creamy and supple",bloomy,creamy,nutty,,...,0,0,0,0,0,1,0,0,0,1
298,Caboc,pasteurized cow's milk,Scotland,Tain,soft,creamy,natural,"buttery, creamy, nutty","fresh, nutty",yes,...,0,0,0,0,0,1,0,0,0,1
390,Champignon de Luxe Garlic,pasteurized cow's milk,Germany,Allgäu,"soft, soft-ripened",creamy,natural,"garlicky, herbaceous","herbal, spicy",yes,...,0,0,0,0,0,1,0,0,0,1
391,Champignon de Luxe Pepper,pasteurized cow's milk,Germany,Allgäu,"soft, soft-ripened",creamy,natural,"creamy, sharp",spicy,yes,...,0,0,0,0,0,1,0,0,0,1
483,Cream Cheese,pasteurized or unpasteurized cow's milk,United States,"Chester, New York","fresh soft, processed",creamy and spreadable,rindless,"creamy, mild, sweet","fresh, pleasant",yes,...,0,0,0,0,0,1,1,0,1,1
490,Cremet,unpasteurized cow's and goat's milk,"England, Great Britain and United Kingdom",Devon,"soft, artisan, soft-ripened","creamy, smooth, soft and soft-ripened",bloomy,"creamy, lemony","fresh, rich",yes,...,0,0,0,0,0,0,1,1,0,1
561,Doppelrhamstufel,cow's milk,Germany,,soft,,,salty,lactic,no,...,0,0,0,0,0,0,0,0,0,1
631,Finn,unpasteurized cow's milk,England and United Kingdom,Herefordshire,"soft, artisan",creamy and firm,bloomy,"buttery, nutty, salty, sweet","earthy, fresh",no,...,0,0,0,0,0,0,1,0,1,1
883,La Bonaparte,pasteurized cow's milk,Canada,Quebec,"soft, soft-ripened","creamy, soft, soft-ripened and supple",bloomy,"creamy, mild, milky","fresh, mild",,...,0,0,0,0,0,1,0,0,0,1


#### Fixing cases that were marked both 'double cream' and 'triple cream'

In [24]:
# a cheese is either a double cream or a triple cream, never both,
# yet three rows carry both flags: Cream Cheese (483), Finn (631), and Mascarpone (Australian) (1037)
mask1 = cheese_com['Triple Cream'].eq(1)
mask2 = cheese_com['Double Cream'].eq(1)
mask = mask1 & mask2

# show the rows flagged as both
cheese_com.loc[mask]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream,Double Cream
483,Cream Cheese,pasteurized or unpasteurized cow's milk,United States,"Chester, New York","fresh soft, processed",creamy and spreadable,rindless,"creamy, mild, sweet","fresh, pleasant",yes,...,0,0,0,0,0,1,1,0,1,1
631,Finn,unpasteurized cow's milk,England and United Kingdom,Herefordshire,"soft, artisan",creamy and firm,bloomy,"buttery, nutty, salty, sweet","earthy, fresh",no,...,0,0,0,0,0,0,1,0,1,1
1037,Mascarpone (Australian),pasteurized cow's milk,Australia,,"fresh soft, processed","buttery, creamy, smooth, soft and spreadable",rindless,"buttery, creamy, milky, smooth, sweet, tangy","fresh, milky, sweet",yes,...,0,0,0,0,0,1,0,0,1,1


In [25]:
# Cream Cheese is neither, but triple and double creams are both discussed in its description
# (looked up for reference; the value is not displayed because it is not the cell's last expression)
cheese_com.loc[483, 'Description']

# clear both dummies with a single .loc write; chained indexing (cheese_com[col][483] = 0)
# may assign to a temporary copy instead of cheese_com
cheese_com.loc[483, ['Double Cream', 'Triple Cream']] = 0


In [26]:
# Finn is a Triple cream, but the phrase "double creamy" is in the description
# (looked up for reference; the value is not displayed because it is not the cell's last expression)
cheese_com.loc[631, 'Description']

# clear only the false 'Double Cream' flag; .loc writes to cheese_com itself rather than
# to a possible temporary copy produced by chained indexing
cheese_com.loc[631, 'Double Cream'] = 0

In [27]:
# Mascarpone (Australian) is neither double nor triple cream
# (looked up for reference; the value is not displayed because it is not the cell's last expression)
cheese_com.loc[1037, 'Description']

# clear both dummies with a single .loc write; chained indexing (cheese_com[col][1037] = 0)
# may assign to a temporary copy instead of cheese_com
cheese_com.loc[1037, ['Double Cream', 'Triple Cream']] = 0
cheese_com['Triple Cream'][1037] = 0


### Rind Values

In [28]:
# list the cheeses that have no rind type recorded
cheese_com.loc[cheese_com['Rind'].isnull()]

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream,Double Cream
8,Acapella,goat's milk,United States,California,"soft, soft-ripened",,,buttery,"fresh, herbal",no,...,0,0,0,0,0,0,0,0,0,0
11,Acorn,unpasteurized sheep's milk,United Kingdom,Bethania,"hard, artisan",crumbly and firm,,"burnt caramel, citrusy, herbaceous",fruity,yes,...,0,0,0,0,0,0,1,0,0,0
19,Aged Cashew & Blue Green Algae Cheese,unknown,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, mellow, oceanic, tangy",rich,yes,...,0,0,0,0,0,0,0,0,0,0
20,Aged Cashew & Brazil Nut Cheese,unknown,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, nutty, sweet","nutty, rich",yes,...,0,0,0,0,0,0,0,0,0,0
21,Aged Cashew & Dulse Cheese,unknown,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, oceanic",rich,yes,...,0,0,0,0,0,0,0,0,0,0
22,Aged Cashew & Hemp Seed Cheese,unknown,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, mild, nutty, spicy","nutty, rich",yes,...,0,0,0,0,0,0,0,0,0,0
23,Aged Cashew Nut Cheese,unknown,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, nutty, subtle","clean, fresh, nutty",yes,...,0,0,0,0,0,0,0,0,0,0
24,Aged Cashew Nut & Kale Cheese,unknown,United States,Brooklyn NY,"firm, artisan",firm and spreadable,,"creamy, oceanic, tangy",rich,yes,...,0,0,0,0,0,0,0,0,0,0
27,Aggiano,pasteurized cow's milk,United States,Utah,"hard, artisan",creamy and dry,,"butterscotch, tangy",fruity,yes,...,0,0,0,0,0,1,0,0,0,0
33,Allium Piper,pasteurized goat's milk,Australia,South Australia,"fresh soft, artisan",creamy and soft,,"garlicky, spicy","fresh, garlicky, spicy",yes,...,0,0,0,0,0,1,0,0,0,0


In [29]:
# enumerate the distinct values that appear in the 'Rind' column
cheese_com.loc[:, 'Rind'].unique()

array(['natural', 'washed', nan, 'rindless', 'cloth wrapped',
       'mold ripened', 'waxed', 'bloomy', 'artificial', 'plastic',
       'ash coated', 'leaf wrapped'], dtype=object)

In [30]:
# (keyword found in the description, rind type to record)
# The rules are checked in this order and, when several keywords match, the LAST match wins.
rind_keyword_rules = [
    ("bloom", 'bloomy'),
    ("wax", 'waxed'),
    ("rindless", 'rindless'),
    ("natural", 'natural'),
    ("washed", 'washed'),
    ("no rind", 'rindless'),
    ("charcoal", 'ash coated'),
    ("orange", 'washed'),         # washed rind cheeses are almost always orange
    ("filata", 'rindless'),       # 'pasta filata' cheeses are fresh and thus never have a rind
    ("white rind", 'bloomy'),     # another name for bloomy rind cheese is 'white rind'
    ("penicill", 'bloomy'),       # bloomy rind cheeses are made with penicillium
    ("soft ripened", 'bloomy'),   # another name for bloomy rind cheese is 'soft ripened'
]

# compute the null mask once, before any value is filled in, instead of once per row
rind_missing = cheese_com['Rind'].isnull()

# look through every cheese
for i in range(len(cheese_com)):

    # only cheeses with no recorded rind type are candidates
    if not rind_missing[i]:
        continue

    # normalise the description once per row: lower-case, hyphens -> spaces, accents stripped
    description = strip_accents(cheese_com['Description'][i].lower().replace('-', ' '))

    # skip descriptions that never mention a rind at all
    if "rind" not in description:
        continue

    # collect the rind type of every matching rule, in rule order
    matched_rinds = [rind_type for keyword, rind_type in rind_keyword_rules if keyword in description]

    if matched_rinds:
        # keep the last match; write through .loc so the value lands on cheese_com itself
        # rather than on a temporary copy produced by chained indexing
        cheese_com.loc[i, 'Rind'] = matched_rinds[-1]

In [31]:
# display the dataframe (this cell follows the loop that fills missing 'Rind' values from the descriptions)
cheese_com

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream,Double Cream
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,"semi-hard, artisan","creamy, dense and firm",natural,burnt caramel,lanoline,yes,...,0,0,0,0,0,0,1,1,0,0
1,Abbaye de Belval,cow's milk,France,,semi-hard,elastic,washed,,aromatic,no,...,0,0,0,0,0,0,0,0,0,0
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,"semi-soft, artisan, brined","creamy, dense and smooth",washed,"acidic, milky, smooth","barnyardy, earthy",no,...,0,0,0,0,0,0,1,0,0,0
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,semi-hard,soft,washed,,,no,...,0,0,0,0,0,1,0,0,0,0
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,"semi-soft, artisan, brined",smooth and supple,washed,"milky, salty",floral,no,...,0,0,0,0,0,1,0,0,0,0
5,Abbot’s Gold,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,semi-hard,"creamy, crumbly, dense and semi firm",natural,"mild, sweet, tangy",aromatic,yes,...,0,0,0,0,0,1,0,0,0,0
6,Abertam,sheep's milk,Czech Republic,Karlovy Vary,"hard, artisan",firm,natural,"acidic, strong, tangy",,no,...,0,0,0,0,0,0,0,0,0,0
7,Abondance,unpasteurized cow's milk,France,"Haute-Savoie, Abondance","semi-hard, artisan","creamy, open and supple",natural,"acidic, buttery, fruity, sweet",nutty,no,...,0,0,0,0,0,0,1,0,0,0
8,Acapella,goat's milk,United States,California,"soft, soft-ripened",,,buttery,"fresh, herbal",no,...,0,0,0,0,0,0,0,0,0,0
9,Accasciato,Buffalo's and cow's milk,Italy,Campania,semi-hard,firm,natural,sweet,"aromatic, fresh",no,...,0,0,0,0,0,0,0,1,0,0


### Blue Cheeses
Another distinct type of cheese is "blue cheese".

In [32]:
# compute the null mask once, before any value is filled in, instead of once per row
family_missing = cheese_com['Family'].isnull()

# for every cheese with 'Family' marked as null, check 'Description' for indicators that it's a blue cheese
for i in range(len(cheese_com)):
    if not family_missing[i]:
        continue

    # normalise the description once per row: lower-case, hyphens -> spaces, accents stripped
    description = strip_accents(cheese_com['Description'][i].lower().replace('-', ' '))

    # 'bleu' anywhere counts as blue; 'blue vein' counts only when 'white' is not also mentioned
    mentions_bleu = "bleu" in description
    mentions_blue_vein = "blue vein" in description and "white" not in description

    if mentions_bleu or mentions_blue_vein:
        # write through .loc so the value lands on cheese_com itself rather than on a
        # temporary copy produced by chained indexing
        cheese_com.loc[i, 'Family'] = 'blue'

### Dropping a vague catch-all entry

In [33]:
# show the row(s) whose country of origin is the catch-all 'countries throughout the world'
cheese_com.loc[cheese_com["Country of origin"] == 'countries throughout the world']

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream,Double Cream
208,Blue Vein Cheese,"pasteurized or unpasteurized cow's, goat's or ...",countries throughout the world,,"semi-soft, blue-veined",creamy,,"salty, sharp, tangy","stinky, strong",no,...,0,0,0,0,0,1,1,0,0,0


In [34]:
# drop the entry for "Blue Vein Cheese" from 'countries throughout the world', 
# as it is too vague to be thought of as a single cheese.
# reset_index() returns a NEW frame, so its result must be assigned back; otherwise the
# index keeps a gap where the dropped row was and label-based loops over range(len(...)) break
cheese_com = cheese_com[cheese_com["Country of origin"] != 'countries throughout the world'].reset_index(drop=True)

# display the re-indexed frame
cheese_com

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,...,Milk_mare,Milk_moose,Milk_reindeer,Milk_water buffalo,Milk_yak,Treatment_pasteurized,Treatment_unpasteurized,Milk_blend yes,Triple Cream,Double Cream
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,"semi-hard, artisan","creamy, dense and firm",natural,burnt caramel,lanoline,yes,...,0,0,0,0,0,0,1,1,0,0
1,Abbaye de Belval,cow's milk,France,,semi-hard,elastic,washed,,aromatic,no,...,0,0,0,0,0,0,0,0,0,0
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,"semi-soft, artisan, brined","creamy, dense and smooth",washed,"acidic, milky, smooth","barnyardy, earthy",no,...,0,0,0,0,0,0,1,0,0,0
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,semi-hard,soft,washed,,,no,...,0,0,0,0,0,1,0,0,0,0
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,"semi-soft, artisan, brined",smooth and supple,washed,"milky, salty",floral,no,...,0,0,0,0,0,1,0,0,0,0
5,Abbot’s Gold,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,semi-hard,"creamy, crumbly, dense and semi firm",natural,"mild, sweet, tangy",aromatic,yes,...,0,0,0,0,0,1,0,0,0,0
6,Abertam,sheep's milk,Czech Republic,Karlovy Vary,"hard, artisan",firm,natural,"acidic, strong, tangy",,no,...,0,0,0,0,0,0,0,0,0,0
7,Abondance,unpasteurized cow's milk,France,"Haute-Savoie, Abondance","semi-hard, artisan","creamy, open and supple",natural,"acidic, buttery, fruity, sweet",nutty,no,...,0,0,0,0,0,0,1,0,0,0
8,Acapella,goat's milk,United States,California,"soft, soft-ripened",,,buttery,"fresh, herbal",no,...,0,0,0,0,0,0,0,0,0,0
9,Accasciato,Buffalo's and cow's milk,Italy,Campania,semi-hard,firm,natural,sweet,"aromatic, fresh",no,...,0,0,0,0,0,0,0,1,0,0


In [35]:
# do EDA again: re-run the eda() helper defined near the top of the notebook on the
# current frame (its report starts with the per-column missing-value counts)
eda(cheese_com)

missing values: 
Name                          0
Milk                          0
Country of origin            17
Region                      237
Type                         24
Texture                     105
Rind                        213
Flavour                      94
Aroma                       354
Vegetarian                  526
Producers                   401
Description                   0
Fat content                   0
Family                     1027
Fat_less than 0.15%           0
Fat_0.16-3%                   0
Fat_4-19%                     0
Fat_20-39%                    0
Fat_40-59%                    0
Fat_60-74%                    0
Fat_75%+                      0
Milk_vegan                    0
Milk_cow                      0
Milk_goat                     0
Milk_sheep                    0
Milk_buffalo                  0
Milk_camel                    0
Milk_donkey                   0
Milk_mare                     0
Milk_moose                    0
Milk_reindeer          

In [36]:
#save progress in csv
#cheese_com.to_csv('./data/model_testing2.csv', index=False)

In [37]:
# read model_testing.csv into 'cheese_com' (this replaces the in-memory frame built above)
# NOTE(review): the commented-out save in the previous cell targets './data/model_testing2.csv',
# not this file -- confirm which checkpoint is meant to be loaded here
cheese_com = pd.read_csv('./data/model_testing.csv')

In [38]:
# check to make sure progress is saving: preview the first rows of the frame just read from csv
cheese_com.head()

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Flavour,Aroma,Vegetarian,Producers,Description,Fat content,Family,Triple Cream,Double Cream
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,"semi-hard, artisan","creamy, dense and firm",natural,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye de Belloc is also known as 'Abbaye Notr...,99999.0,,0,0
1,Abbaye de Belval,cow's milk,France,,semi-hard,elastic,washed,,aromatic,no,,This cheese is also known as Le Trappiste de B...,43.0,,0,0
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,"semi-soft, artisan, brined","creamy, dense and smooth",washed,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,The Abbaye de Citeaux cheese comes from the Ci...,99999.0,,0,0
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,semi-hard,soft,washed,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,"Being direct descendant of the Port du Salut, ...",99999.0,,0,0
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,"semi-soft, artisan, brined",smooth and supple,washed,"milky, salty",floral,no,Abbaye du Mont des Cats,The Abbaye du Mont des Cats cheese is made by ...,50.0,,0,0
