# Analysis of cheese colors

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Load data

In [100]:
# Read the cheese catalogue from disk and preview its first rows.
cheeses_csv = "cheeses.csv"
df = pd.read_csv(cheeses_csv)
df.head()

Unnamed: 0,cheese,url,milk,country,region,family,type,fat_content,calcium_content,texture,rind,color,flavor,aroma,vegetarian,vegan,synonyms,alt_spellings,producers
0,Aarewasser,https://www.cheese.com/aarewasser/,cow,Switzerland,,,semi-soft,,,buttery,washed,yellow,sweet,buttery,False,False,,,Jumi
1,Abbaye de Belloc,https://www.cheese.com/abbaye-de-belloc/,sheep,France,Pays Basque,,"semi-hard, artisan",,,"creamy, dense, firm",natural,yellow,burnt caramel,lanoline,True,False,Abbaye Notre-Dame de Belloc,,
2,Abbaye de Belval,https://www.cheese.com/abbaye-de-belval/,cow,France,,,semi-hard,40-46%,,elastic,washed,ivory,,aromatic,False,False,,,
3,Abbaye de Citeaux,https://www.cheese.com/abbaye-de-citeaux/,cow,France,Burgundy,,"semi-soft, artisan, brined",,,"creamy, dense, smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",False,False,,,
4,Abbaye de Tamié,https://www.cheese.com/tamie/,cow,France,Savoie,,"soft, artisan",,,"creamy, open, smooth",washed,white,"fruity, nutty","perfumed, pungent",False,False,,"Tamié, Trappiste de Tamie, Abbey of Tamie",


## Prepare data

In [None]:
# Clean and standardise countries.
#
# Keys are literal substrings of the raw `country` text; each one is replaced
# by its value. Replacements are applied in insertion order, so the order of
# the entries matters: the longer comma-separated combinations come before the
# single names they contain, and the final entry collapses any
# "United Kingdom, United Kingdom" left over from the earlier substitutions.
#
# The mapping is deliberately NOT called `dict`: that name would shadow the
# built-in for every later cell in the kernel.
country_replacements = {
    "England, Great Britain, United Kingdom": "United Kingdom",
    "England, Scotland, United Kingdom": "United Kingdom",
    "Great Britain, United Kingdom, Wales": "United Kingdom",
    "Great Britain, Scotland, United Kingdom": "United Kingdom",
    "England, Scotland, Wales": "United Kingdom",
    "United Kingdom, Wales": "United Kingdom",
    "Scotland, United Kingdom": "United Kingdom",
    "Great Britain": "United Kingdom",
    "Wales": "United Kingdom",
    "Scotland": "United Kingdom",
    "England": "United Kingdom",
    "Holland": "Netherlands",
    "Cyprus, Egypt, Israel, Jordan, Lebanon, Middle East": "Cyprus, Egypt, Israel, Jordan, Lebanon",
    "Lebanon, Middle East": "Lebanon",
    "Middle East": "",
    "Mexico and Caribbean": "Mexico",
    "China, Nepal, Tibet": "China",
    "China, Tibet": "China",
    "United Kingdom, United Kingdom": "United Kingdom",
}

# Countries with fewer cheeses than this are lumped together as "Other".
MIN_CHEESES_PER_COUNTRY = 5

df["country_dv"] = df["country"]
for raw_text, standard_text in country_replacements.items():
    # regex=False: the keys are plain text, and the default for `regex`
    # differs between pandas 1.x and 2.x, so state it explicitly.
    df["country_dv"] = df["country_dv"].str.replace(raw_text, standard_text, regex=False)
# A value that was replaced by the empty string carries no country information.
df.loc[df["country_dv"] == "", "country_dv"] = np.nan

# Summarise countries into 'Multiple' and 'Other' for small producers.
df["country_dv2"] = df["country_dv"]
# A remaining comma means more than one country is listed; na=False keeps
# missing values out of the mask instead of comparing the result with True.
df.loc[df["country_dv2"].str.contains(",", regex=False, na=False), "country_dv2"] = "Multiple"
country_counts = df["country_dv2"].value_counts()
less_than_five = country_counts[country_counts < MIN_CHEESES_PER_COUNTRY].index
df.loc[df["country_dv2"].isin(less_than_five), "country_dv2"] = "Other"

df["country_dv2"].value_counts()

United States     305
France            169
United Kingdom    168
Italy             141
Canada             65
Australia          53
Ireland            36
Other              35
Multiple           35
Germany            25
Netherlands        25
Spain              24
Switzerland        16
Austria            14
Sweden              9
Belgium             8
Mexico              8
Denmark             7
New Zealand         7
Portugal            7
Greece              6
India               6
Brazil              5
Name: country_dv2, dtype: int64

In [159]:
# Clean the colour labels, then attach a hex code and a coarse colour group.

# Fold the rarer labels into a neighbouring colour, one substitution at a time.
recoded_color = df["color"]
for old_label, new_label in (
    ("red", "orange"),
    ("pink and white", "white"),
    ("blue-grey", "blue"),
):
    recoded_color = recoded_color.str.replace(old_label, new_label)
df["color_dv"] = recoded_color

# One row per cleaned colour: (label, hex code, colour group).
# Both lookup dicts below are derived from this single table, so their keys
# always stay in step.
color_table = [
    ("yellow", "#FFE714", "yellow"),
    ("ivory", "#FFFFE5", "white"),
    ("white", "#FFFFFF", "white"),
    ("pale yellow", "#FFEE8F", "light yellow"),
    ("blue", "#2C6682", "other"),
    ("orange", "#FFA333", "orange/brown"),
    ("cream", "#FFFDC7", "light yellow"),
    ("brown", "#C58A43", "orange/brown"),
    ("green", "#668655", "other"),
    ("golden yellow", "#FFD000", "yellow"),
    ("pale white", "#F2F2F2", "white"),
    ("straw", "#E4D96F", "yellow"),
    ("brownish yellow", "#E2B95A", "yellow"),
    ("golden orange", "#FFB400", "orange/brown"),
]
color_hex = {label: hex_code for label, hex_code, _group in color_table}
color_group = {label: group for label, _hex_code, group in color_table}

# Labels missing from the table map to NaN in both derived columns.
df["color_hex"] = df["color_dv"].map(color_hex)
df["color_group"] = df["color_dv"].map(color_group)
df["color_group"].value_counts()

white           434
light yellow    356
yellow          191
orange/brown     37
other            27
Name: color_group, dtype: int64

## Explore