In [61]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import plotly.express as px

import re # regular expression
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

import tensorflow as tf

In [62]:
os.chdir('/content/drive/MyDrive')
os.getcwd()

'/content/drive/MyDrive'

In [87]:
data = pd.read_csv('dataset/ramen-ratings.csv')

In [88]:
data

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,
...,...,...,...,...,...,...,...
2575,5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,
2576,4,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,
2577,3,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,
2578,2,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,


In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Review #  2580 non-null   int64 
 1   Brand     2580 non-null   object
 2   Variety   2580 non-null   object
 3   Style     2578 non-null   object
 4   Country   2580 non-null   object
 5   Stars     2580 non-null   object
 6   Top Ten   41 non-null     object
dtypes: int64(1), object(6)
memory usage: 141.2+ KB


# Preprocessing: Remove Unnecessary Data

In [90]:
data.isna().sum() # This

Unnamed: 0,0
Review #,0
Brand,0
Variety,0
Style,2
Country,0
Stars,0
Top Ten,2539


In [91]:
data['Top Ten'].unique()

array([nan, '2016 #10', '2016 #1', '2016 #8', '2016 #5', '2016 #9',
       '2016 #7', '2015 #10', '2015 #7', '2015 #4', '2015 #9', '2015 #6',
       '2015 #1', '2013 #10', '2015 #8', '2014 #7', '2014 #4', '2014 #9',
       '2014 #10', '2014 #8', '2014 #5', '2014 #6', '2014 #1', '2013 #1',
       '2013 #2', '2013 #4', '\n', '2013 #9', '2013 #3', '2012 #10',
       '2012 #7', '2012 #5', '2012 #3', '2012 #6', '2012 #9', '2012 #1',
       '2012 #2', '2013 #6', '2012 #4'], dtype=object)

In [92]:
data['Top Ten'] = data['Top Ten'].replace('\n', np.NaN) # replace /n with null value (NaN)
data['isTopTen'] = data['Top Ten'].apply(lambda x: 0 if str(x) == 'nan' else 1) # repalce NaN with 0, else 1
data = data.drop('Top Ten', axis=1) # Remove the original Top Ten as it no longer useful
data = data.drop('Review #', axis=1) # Remove Review # as it is not useful

In [93]:
data.isna().sum()

Unnamed: 0,0
Brand,0
Variety,0
Style,2
Country,0
Stars,0
isTopTen,0


In [94]:
data = data.dropna(axis=0).reset_index(drop=True) # Drop any row that have any value

In [95]:
data

Unnamed: 0,Brand,Variety,Style,Country,Stars,isTopTen
0,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,0
1,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,0
2,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,0
3,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,0
4,Ching's Secret,Singapore Curry,Pack,India,3.75,0
...,...,...,...,...,...,...
2573,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,0
2574,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,0
2575,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,0
2576,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,0


Next: Turn Seperate Word into Features

In [97]:
# Understanding why Stars is a Object not int
data['Stars'].astype(np.float)

AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [98]:
# Check for Noise: Unrated data and Unusual data
data.query("Stars == 'Unrated'")

Unnamed: 0,Brand,Variety,Style,Country,Stars,isTopTen
32,Ottogi,Plain Instant Noodle No Soup Included,Pack,South Korea,Unrated,0
122,Samyang Foods,Sari Ramen,Pack,South Korea,Unrated,0
993,Mi E-Zee,Plain Noodles,Pack,Malaysia,Unrated,0


In [100]:
data['Stars'] = data['Stars'].replace('Unrated', np.NaN).astype(float) # Turn Object datatype to float for calc
data['Stars'] = data['Stars'].fillna(data['Stars'].mean())

In [101]:
# "missing values" refer to cells in your dataframe that do not contain any data.
print(f'Total Missing values: {data.isna().sum().sum()}')

Total Missing values: 0


# Engineerign Variety Features

In [103]:
ramen_names = data.loc[:, 'Variety'] # extract Variety col only
ramen_names # print extracted result

Unnamed: 0,Variety
0,T's Restaurant Tantanmen
1,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...
2,Cup Noodles Chicken Vegetable
3,GGE Ramen Snack Tomato Flavor
4,Singapore Curry
...,...
2573,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty..."
2574,Oriental Style Instant Noodles
2575,Tom Yum Shrimp
2576,Tom Yum Chili Flavor


In [114]:
# Steming each word to increase efficiency
# Stemming reduces words to their root form (e.g., "running" becomes "run")
ps = PorterStemmer()

def process_name(name):
  new_name = name.lower() # Make lowercase
  new_name = re.sub(r'[^a-z0-9\s]', '', new_name) # Remove punctuation
  new_name = re.sub(r'[0-9]+', 'number', new_name) # Change numerical word to "number"
  new_name = new_name.split(" ") # Make string into a list of words
  new_name = list(map(lambda x: ps.stem(x), new_name)) # stemmed each word
  new_name = list(map(lambda x: x.strip(), new_name)) # remove whitespace from each stemmed word

  # remove empty string if any existed
  if '' in new_name:
    new_name.remove('')
  return new_name

In [121]:
ramen_names = ramen_names.apply(process_name)
ramen_names

Unnamed: 0,Variety
0,"[ts, restaur, tantanmen]"
1,"[noodl, spici, hot, sesam, spici, hot, sesam, ..."
2,"[cup, noodl, chicken, veget]"
3,"[gge, ramen, snack, tomato, flavor]"
4,"[singapor, curri]"
...,...
2573,"[hu, tiu, nam, vang, phnom, penh, style, asian..."
2574,"[orient, style, instant, noodl]"
2575,"[tom, yum, shrimp]"
2576,"[tom, yum, chili, flavor]"


In [125]:
# Getting the number of unique words in our list of ramen names
vocabulary = set()

for name in ramen_names: # for each variety list
  for word in name: # for each ramen name in variety list
    if word not in vocabulary:
      vocabulary.add(word)

vocab_length = len(vocabulary)

# get the maximum length of a single ramen name
max_seq_length = max(ramen_names.apply(lambda x: len(x)))

In [129]:
print(f'  Vocabulary Length: {vocab_length} total unique word')
print(f'Max Sequence Length: {max_seq_length} is the maximum length of a ramen name')

  Vocabulary Length: 1372 total unique word
Max Sequence Length: 13 is the maximum length of a ramen name
