# Overview
This notebook prepares ramen-ratings.csv for exploration. The code here has been added to wrangle.py as a script.

# Implementation

In [1]:
# imports
import pandas as pd

# read the raw ratings file into a dataframe and preview its first three rows
df = pd.read_csv(filepath_or_buffer='ramen-ratings.csv')
df.head(n=3)

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,


In [2]:
# remove the rows whose rating is the literal string 'Unrated'
unrated_rows = df.index[df['Stars'] == 'Unrated']  # labels of the unrated rows
df = df.drop(index=unrated_rows)

In [3]:
# convert the rating column to floating-point numbers
stars_as_float = df['Stars'].astype(float)
df['Stars'] = stars_as_float

In [4]:
# add a boolean column that is True where the rating is exactly 5
df = df.assign(five_stars=(df['Stars'] == 5))

In [5]:
# show work: how many rows are / are not five-star
df.five_stars.value_counts()

False    2191
True      386
Name: five_stars, dtype: int64

In [6]:
# remove the review number, rating, and top-ten columns
unused_columns = ['Review #', 'Stars', 'Top Ten']
df = df.drop(unused_columns, axis='columns')

In [7]:
# check nulls: compare each column's non-null count with the total number of entries
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2577 entries, 0 to 2579
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Brand       2577 non-null   object
 1   Variety     2577 non-null   object
 2   Style       2575 non-null   object
 3   Country     2577 non-null   object
 4   five_stars  2577 non-null   bool  
dtypes: bool(1), object(4)
memory usage: 103.2+ KB


In [8]:
# discard the rows that have no value in the Style column
df = df.dropna(subset=['Style'])

In [9]:
# remove rows that repeat an earlier row in every column, keeping the first occurrence
df = df.drop_duplicates(subset=None, keep='first')

In [10]:
# check work: after dropping null styles and duplicates, every column's
# non-null count should equal the number of entries
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2564 entries, 0 to 2579
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Brand       2564 non-null   object
 1   Variety     2564 non-null   object
 2   Style       2564 non-null   object
 3   Country     2564 non-null   object
 4   five_stars  2564 non-null   bool  
dtypes: bool(1), object(4)
memory usage: 102.7+ KB


In [11]:
# rename columns to lowercase; Variety becomes product and Style becomes package
# note: 'product' is also the name of a DataFrame method, so attribute access
# (df.product) returns that method -- use df['product'] to read the column
column_names = {
    'Brand': 'brand',
    'Variety': 'product',
    'Style': 'package',
    'Country': 'country',
}
df = df.rename(columns=column_names)

In [12]:
# check package for low-count values
# (left as the cell's last expression so the notebook displays it, matching
# the other inspection cells, instead of routing it through print)
df['package'].value_counts()

Pack    1522
Bowl     479
Cup      447
Tray     108
Box        6
Can        1
Bar        1
Name: package, dtype: int64


In [13]:
# remove the rare package types Box, Can and Bar (8 rows)
rare_packages = ['Box', 'Can', 'Bar']
mask = df['package'].isin(rare_packages)  # True for rows to remove
df = df.loc[~mask]

In [14]:
# count rows per country to spot rare or inconsistently spelled values
df['country'].value_counts()

Japan            348
USA              319
South Korea      306
Taiwan           223
Thailand         189
China            167
Malaysia         152
Hong Kong        137
Indonesia        124
Singapore        109
Vietnam          108
UK                69
Philippines       47
Canada            41
India             29
Germany           27
Mexico            25
Australia         22
Netherlands       15
Myanmar           14
Nepal             14
Pakistan           9
Hungary            9
Bangladesh         7
Colombia           6
Brazil             5
Cambodia           5
Fiji               4
Holland            4
Poland             4
Finland            3
Sarawak            3
Sweden             3
Dubai              3
Ghana              2
Estonia            2
Nigeria            1
United States      1
Name: country, dtype: int64

In [15]:
# merge United States and USA
# A boolean-mask assignment relabels every matching row and does nothing when
# there is none, so this cell is safe to re-run. (Looking the row up with
# .index.item() raises ValueError unless exactly one row matches.)
# NOTE(review): 'Holland' in the counts above may be another spelling of
# 'Netherlands'; with fewer than 5 rows it is removed by the low-count filter
# instead of being merged -- confirm whether it should be relabelled here too.
df.loc[df['country'] == 'United States', 'country'] = 'USA'

In [16]:
# get the row labels of countries with fewer than MIN_COUNTRY_ROWS rows in the dataframe
MIN_COUNTRY_ROWS = 5  # a country needs at least this many rows to be kept
country_counts = df['country'].value_counts()  # computed once, reused for the filter below
low_count_countries = country_counts[country_counts < MIN_COUNTRY_ROWS].index.tolist()  # country names

# gather the labels country by country, in the order of low_count_countries
low_count_indices = [
    row_label
    for cntry in low_count_countries
    for row_label in df.index[df['country'] == cntry].tolist()
]

# check work
print(low_count_indices)
print('Number of rows to drop:', len(low_count_indices))

[1574, 1592, 1629, 1637, 1568, 1587, 2036, 2052, 184, 194, 1681, 1683, 1026, 1063, 1093, 863, 872, 883, 675, 816, 856, 1783, 1838, 1860, 26, 44, 1480, 1481, 78]
Number of rows to drop: 29


In [17]:
# remove the rows collected in low_count_indices (the low-count countries)
df = df.drop(index=low_count_indices)

In [18]:
# check work: show the remaining row count, column names and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2527 entries, 0 to 2579
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   brand       2527 non-null   object
 1   product     2527 non-null   object
 2   package     2527 non-null   object
 3   country     2527 non-null   object
 4   five_stars  2527 non-null   bool  
dtypes: bool(1), object(4)
memory usage: 101.2+ KB
