# Data Profiling with Python

In [1]:
# @BEGIN CS513-Team15-FinalProject
# @PARAM db_pth
# @PARAM fmodel
# @IN a
# @IN b
# @OUT result

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Check null columns

In [2]:
# @BEGIN simple_process
# @IN a
# @IN b
# @OUT result
def check_null_columns(file_name):
    df = pd.read_csv(file_name)
#     columns_with_all_null = df.columns[df.isnull().all()]
    print('###############################')
    print('{}'.format(file_name))
#     print('Drop columns where all values are NA {}'.format(columns_with_all_null.values))
#     df.drop(columns_with_all_null, axis='columns', inplace=True)

    ### Identify columns with too many missing values
    for col in df.columns:
        count_nan = df[col].isnull().sum()
        percentage_missing = count_nan / df.size
        print('{:30s}\t{}\t{:.2f}%\t({:.10f}%)'.format(col, count_nan, percentage_missing*100, percentage_missing*100))
    return df

menu = check_null_columns('Menu.csv')
menu_page = check_null_columns('MenuPage.csv')
menu_item = check_null_columns('MenuItem.csv')
dish = check_null_columns('Dish.csv')

# @END simple_process

###############################
Menu.csv
id                            	0	0.00%	(0.0000000000%)
name                          	14348	4.09%	(4.0884481678%)
sponsor                       	1561	0.44%	(0.4448053798%)
event                         	9393	2.68%	(2.6765259019%)
venue                         	9428	2.69%	(2.6864991167%)
place                         	9424	2.69%	(2.6853593207%)
physical_description          	2782	0.79%	(0.7927281017%)
occasion                      	13756	3.92%	(3.9197583633%)
notes                         	6932	1.98%	(1.9752664273%)
call_number                   	1562	0.45%	(0.4450903288%)
keywords                      	17547	5.00%	(5.0000000000%)
language                      	17547	5.00%	(5.0000000000%)
date                          	586	0.17%	(0.1669801106%)
location                      	0	0.00%	(0.0000000000%)
location_type                 	17547	5.00%	(5.0000000000%)
currency                      	11091	3.16%	(3.1603692939%)
currency_symbol               	1

# Remove un-necessary columns
We will remove un-ncessary columns so that we can do our operations more efficiently. One caveate is that dish_id has 0.00% null values, but actually it has null values (0.0020061582%). This rows may violate constraints or really do not have dish items. We are not sure about this, so we will remove these rows.

In [3]:
menu = menu.drop(['keywords', 'language', 'location_type', 'name', 'sponsor', 'event', 'venue',
                  'physical_description', 'occasion', 'notes', 'call_number', 'page_count', 'dish_count'], axis='columns')
menu_page = menu_page.drop(['page_number', 'image_id', 'full_height', 'full_width', 'uuid'], axis='columns')
menu_item = menu_item.drop(['high_price', 'created_at', 'updated_at', 'xpos', 'ypos'], axis='columns')
dish = dish.drop(['description', 'menus_appeared', 'times_appeared', 'first_appeared', 'last_appeared', 'lowest_price', 'highest_price'], axis='columns')

before = menu_item.shape[0]
menu_item.dropna(subset=['dish_id'], inplace=True)
after = menu_item.shape[0]
print('MenuItem na value delete: {}'.format(before-after))
menu_item["dish_id"] = menu_item["dish_id"].astype(np.int64)

menu.to_csv('./NYPL_only_necessary_columns/Menu.csv', index=False)
menu_page.to_csv('./NYPL_only_necessary_columns/MenuPage.csv', index=False)
menu_item.to_csv('./NYPL_only_necessary_columns/MenuItem.csv', index=False)
dish.to_csv('./NYPL_only_necessary_columns/Dish.csv', index=False)

MenuItem na value delete: 241


In [4]:
!rm -f NYPL_only_necessary_columns.zip
!zip NYPL_only_necessary_columns.zip ./NYPL_only_necessary_columns/*.csv

  adding: NYPL_only_necessary_columns/Dish.csv (deflated 63%)
  adding: NYPL_only_necessary_columns/Menu.csv (deflated 78%)
  adding: NYPL_only_necessary_columns/MenuItem.csv (deflated 71%)
  adding: NYPL_only_necessary_columns/MenuPage.csv (deflated 79%)
  adding: NYPL_only_necessary_columns/test.csv (deflated 1%)


# Make After_OpenRefine.zip in OpenRefine
Now we are ready to do OpenRefine operations with NYPL_only_necessary_columns.zip. Once OpenRefine job finished, export refined data to After_OpenRefine.zip file again in order to do more operations in Python.

# Read Cleaned Dataset from Ater_OpenRefine.zip

In [None]:
menu = pd.read_csv('After_OpenRefine/Menu.csv')
menu_page = pd.read_csv('After_OpenRefine/MenuPage.csv')
menu_item = pd.read_csv('After_OpenRefine/MenuItem.csv')
dish = pd.read_csv('After_OpenRefine/Dish.csv')

# Cicken detection

In [10]:
# chicken finding test
# Chicken: Chick, (In Franch) Poulette,  Coquille, Poulet, Poussin, Poularde, Coq, Geline, Supreme, Aileron, Cuisse
poulet = dish.name.str.contains('poulet')
dish.name[poulet]

1865                                          a la poulette
1969                                 Scallops a la poulette
1970               Scallops a la poulette in a chafing dish
2611                             Calf's head, a la poulette
2621                            Calf's brains a la poulette
                                ...                        
414600                           Ailes de poulet à la Patti
419786                      little neck clams à la poulette
420129               Chaud-froid de poulet à La Norvégienne
420191    poulet a la creole  - saute puis recouvert d'u...
420198                                      poulet sandwich
Name: name, Length: 280, dtype: object

In [11]:
# List of chicken dishes: https://en.wikipedia.org/wiki/List_of_chicken_dishes
chicken_dishes = pd.read_csv("chicken_dishes.txt")
chicken_idx = chicken_dishes["name"].str.lower().str.contains("chicken")
chicken_variants = chicken_dishes[~chicken_idx]
chicken_var_words = chicken_variants.name.str.lower().str.split(expand=True).stack().value_counts()
chicken_words = chicken_var_words[chicken_var_words > 1]
chicken_words = chicken_words.drop(['à', 'au', 'wing',
                                    'shish',    # skewer (a long piece of wood or metal used for holding pieces of food)
                                    'biryani',  # Indian fried rice
                                    'adobo',    # Philippines's cooking technique, not a dish name
                                    'nasi',     # Indonesian rice
                                    'kai',      # Thailand soup
                                    ], axis=0)  # delete the rows with some labels
print(chicken_words)
chicken_words = chicken_words.append(pd.Series({'chicken': 100, 'coq':1, 'poulet':1, 'poussin': 1, 'poulette': 1, 'karaage': 1, 'yassa': 1}))
chicken_words = chicken_words.sort_values(ascending=False)
#print(chicken_words)
# print(chicken_words.index.values)


ayam       14
pollo       3
manok       3
galinha     2
taouk       2
dtype: int64


In [12]:
# https://stackoverflow.com/questions/53350793/how-to-check-if-pandas-column-has-value-from-list-of-string
chicken_checks = dish.name.apply(lambda x: any([k in str(x) for k in chicken_words.index.values]))
chicken_rows = dish.name[chicken_checks == True]
chicken_rows.describe()

count                                   3715
unique                                  3715
top       Baked chicken pie, individual dish
freq                                       1
Name: name, dtype: object

In [13]:
chicken_rows.head()

679                              1/4 chicken
891                          Grilled chicken
1271    Broiled half spring chicken on toast
1301             Broiled spring chicken half
1303                           Roast chicken
Name: name, dtype: object

In [14]:
dish_after_openrefine = pd.read_csv('./After_OpenRefine/Dish.csv')

dish_after_openrefine["is_chicken"] = chicken_checks
dish_after_openrefine["is_chicken"] = dish_after_openrefine["is_chicken"].replace({True: 'Y', False: 'N'})

dish_after_openrefine.to_csv('./afterAddingCurrAndNyc/Dish.csv', index=False)

# NYC detection

In [15]:
# load from After_OpenRefine.zip
menu_after_openrefine = pd.read_csv('./After_OpenRefine/Menu.csv')


In [16]:
menu_after_openrefine["is_in_nyc"] = menu_after_openrefine['place'].str.lower().str.contains(r'\b(ny|nyc|new york)\b',regex=True) \
| menu_after_openrefine['location'].str.lower().str.contains(r'\b(ny|nyc|new york)\b',regex=True)
menu_after_openrefine["is_in_nyc"] = menu_after_openrefine["is_in_nyc"].replace({True: 'Y', False: 'N'})
menu_after_openrefine["is_in_nyc"].head(n=20)

  return func(self, *args, **kwargs)


0     N
1     N
2     N
3     N
4     N
5     N
6     Y
7     N
8     N
9     N
10    Y
11    Y
12    Y
13    N
14    N
15    N
16    N
17    N
18    N
19    N
Name: is_in_nyc, dtype: object

# Convert currency rates

In [17]:
#print(menu_after_openrefine.currency.value_counts())

currency_map = {
    'dollars': 1,
    'deutsche marks': 0.63,
    'francs': 0.18,
    'canadian dollars':0.793,
    'swiss francs':1.087,
    'shillings':0.085,
    'swedish kronor (sek/kr)':0.115,
    'italian lire':0.0006,
    'cents':0.01,
    'uk pounds':1.377,
    'belgian francs':0.029,
    'mexican pesos':0.05,
    'dutch guilders':0.535,
    'austrian schillings':0.085,
    'danish kroner':0.158,
    'yen':0.009,
    'pesetas':0.007,
    'euros':1.181,
    'escudos':0.005,
    'austro-hungarian kronen':0.021,
    'hungarian forint':0.003,
    'drachmas':0.003,
    'israeli lirot (1948-1980)':0.304,
    'norwegian kroner':0.114,
    'icelandic krónur':0.008,
    'quetzales':0.129,
    'argentine peso':0.01,
    'finnish markka':0.198,
    'lats':1.68,
    'sol':0.25,
    'cuban pesos':0.041,
    'złoty':0.257,
    'brazilian cruzeiros':0.00007131,
    'uruguayan pesos':0.022,
    'qatari riyal':0.274,
    'australian dollars':0.74,
    'new taiwan dollar':0.036,
    'bermudian dollars':1,
    'moroccan dirham':0.111,
    'monégasque francs':0.18,
    'pence':0.01387,
    'straits dollar (1904-1939)':0.256,
    np.nan: 1
}

menu_after_openrefine["to_dollar_rate"] = menu_after_openrefine.currency.apply(lambda x: currency_map[x])
menu_after_openrefine.to_csv('./afterAddingCurrAndNyc/Menu.csv', index=False)

In [18]:
# @END CS513-Team15-FinalProject