# Foodie Project

**Business case:** There will be a foodie fair in your city next summer, and the organisers have contacted us because they are interested in an analysis of worldwide cuisines. They have obtained a dataset of different cuisines from all over the world, together with the list of the most common ingredients.

Some of our strongest geographic and cultural associations are tied to a region's local foods, so the organisers want to learn more about these cuisines in order to organise the different stands.

In [1]:
import json
import numpy as np 
import pandas as pd 

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# Other imports
from collections import Counter
import requests
import imageio
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from matplotlib import pyplot as plt
from PIL import Image


In [10]:
import sys

# Raw string: the Windows path contains backslashes, and sequences such as
# "\D", "\P" and "\F" are invalid escapes in a normal string literal.
sys.path.insert(0, r'D:\Data Course\Projects\Food\Food-Project')

from my_functions_EDA import *
from fx_MLRegression import *

SyntaxError: invalid syntax (my_functions_EDA.py, line 493)

In [None]:
# Load the recipes dataset. The context manager closes the file handle as
# soon as the JSON has been parsed, and the encoding is stated explicitly so
# the result does not depend on the platform default.
with open(r'../dataset/Ex 6.1. foodie_dataset.json', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)



In [None]:
df = pd.DataFrame(data)

In [None]:
df

In [None]:
df.ingredients

In [None]:
df.cuisine.value_counts()

In [None]:
# Missing values per column
print(df.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Exploring cuisine column

In [None]:
# Global seaborn/matplotlib styling for the cuisine frequency chart.
sns.set(rc={'figure.figsize': (30, 15),
            'axes.labelsize': 15,
            'font.size': 15,
            'legend.fontsize': 12,
            'axes.titlesize': 15,
            'figure.facecolor': 'white',
            'font.family': ['sans-serif'],
            'legend.fancybox': True,
            'lines.color': 'C0',
            'xaxis.labellocation': 'center',
            'xtick.alignment': 'center',
            'legend.title_fontsize': 15,
            'legend.edgecolor': '0.9',
            'animation.ffmpeg_path': 'ffmpeg',
           })

# Count once and pass x / y by keyword: seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments.
cuisine_counts = df['cuisine'].value_counts()
sns.barplot(x=cuisine_counts.index, y=cuisine_counts, palette='viridis')

In [None]:
df.head()

In [None]:
# Flatten each ingredient list into one comma-separated string, then turn
# every comma into the ' hola' sentinel; other cells split on / replace this
# sentinel to recover the ingredient boundaries.
joined_ingredients = df.ingredients.apply(', '.join)
df['clean'] = joined_ingredients.str.replace(',', ' hola')

In [None]:
type(df.clean[0])

In [None]:
from collections import Counter

# Count how often each ingredient occurs across all recipes.
# Splitting on the 'hola' sentinel leaves the neighbouring blanks attached
# ('salt ', ' salt ', ' salt'), so every piece is stripped first; otherwise
# one ingredient is tallied under several keys depending on its position in
# the recipe. Empty pieces are skipped.
vocab = Counter()
for recipe in df['clean']:
    vocab.update(
        ingredient
        for ingredient in (piece.strip() for piece in recipe.split('hola'))
        if ingredient
    )

In [None]:
vocab.most_common(100)

In [None]:
# Bag-of-words matrix over the 'clean' column; max_features caps the
# vocabulary at 1500 terms.
# NOTE(review): 'clean' still contains the 'hola' separator token, so it is
# presumably counted as a term here — confirm whether that is intended.
count_vectorizer = CountVectorizer(max_features = 1500)
train_vectors = count_vectorizer.fit_transform(df['clean'])
#pred_vectors = count_vectorizer.transform(pred["text2"])
train_vectors
#pred_vectors

In [None]:
df.clean

In [None]:
# No need to remove stopwords here because the TfidfVectorizer will do it for us
def clean_text(text):
    """Lower-case the alphabetic items of *text* and join them with spaces.

    Args:
        text: iterable of strings (e.g. the ingredient list of one recipe).

    Returns:
        A single string with the kept items lower-cased and separated by one
        space. Items containing digits, punctuation or other non-letter
        characters are dropped.
    """
    # Blanks are ignored for the alphabetic test so multi-word ingredients
    # ("romaine lettuce") are kept; str.isalpha() alone is False for any
    # string that contains a space.
    words = [item.lower() for item in text if item.replace(" ", "").isalpha()]
    return " ".join(words)

In [None]:
def tokenize(text):
    """Split *text* on runs of non-word characters.

    Args:
        text: string to tokenise.

    Returns:
        list of tokens. As with any ``re.split``, a leading or trailing
        separator yields an empty string at that end of the list.
    """
    # Raw string: "\W" is not a valid escape sequence in a normal literal.
    tokens = re.split(r'\W+', text)
    return tokens

In [None]:
# Replace each recipe's ingredient collection with its cleaned, joined text.
df['ingredients'] = df['ingredients'].apply(clean_text)

In [None]:
# Lower-case every 'clean' recipe string and split it into word tokens.
lowered_recipes = df['clean'].str.lower()
df['tokenized_text'] = lowered_recipes.apply(tokenize)
df.head()

In [None]:
def remove_stopwords(text):
    """Return the tokens of *text* that are not English stop words.

    Args:
        text: iterable of word tokens.

    Returns:
        list of the tokens, in their original order, that do not appear in
        NLTK's English stop-word list.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every token. The local name also avoids
    # shadowing the ``stopwords`` module imported at the top of the file.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in english_stopwords]


In [None]:
# Token lists with the English stop words filtered out.
df['w_stop'] = df['tokenized_text'].apply(remove_stopwords)

In [None]:
df

In [None]:
import nltk


from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatizer2(tokenized_text):
    """Return a list holding the WordNet lemma of every token in *tokenized_text*."""
    return list(map(lemmatizer.lemmatize, tokenized_text))

In [None]:
# Lemmatise the stop-word-free token list of every recipe.
df['lemmatizer_text'] = df['w_stop'].apply(lemmatizer2)
df.head()

In [None]:
type(df.lemmatizer_text[35563])

In [None]:
df['final'] = df.lemmatizer_text.apply(' '.join)


In [None]:
df[df.final.str.contains('chicken')]

In [None]:
df['final']= df['final'].str.replace('hola' ,',')
df.head()

In [None]:
df.info()

In [None]:
data= df.copy()

In [None]:
data.drop(axis = 1, columns=["clean", "tokenized_text","w_stop","lemmatizer_text"], inplace= True)

In [None]:
data.head()

In [None]:
data.to_csv('Final_DATA.csv')

# Working with the new Data 

In [None]:
filename = r'Final_DATA.csv'
df_1 = pd.read_csv(filename)

In [None]:
# checking shape ...
print("The dataset has {} rows and {} columns.".format(*df_1.shape))

# ... and duplicates
# NOTE(review): duplicated() compares whole rows, and at this point the frame
# still carries the 'Unnamed: 0' column written by to_csv (it is dropped in a
# later cell), so this count is presumably always 0 — confirm whether the
# check was meant to run on a subset of columns.
print("It contains {} duplicates.".format(df_1.duplicated().sum()))

In [None]:
df_1.drop(axis = 1, columns=['Unnamed: 0'], inplace= True)

df_1.cuisine.value_counts()

In [None]:
df_1.cuisine.unique()

## Seeing the Length of the Recipes

In [None]:
df_1

In [None]:
df_1['Len'] = df_1['final'].apply(lambda row: len(row.split(', ')))

In [None]:
df_1.Len.value_counts().sort_values(ascending=False)

In [None]:
# Global seaborn/matplotlib styling for the recipe-length chart.
sns.set(rc={'figure.figsize': (15, 7),
            'axes.labelsize': 15,
            'font.size': 20,
            'legend.fontsize': 12,
            'axes.titlesize': 15,
            'figure.facecolor': 'white',
            'font.family': ['sans-serif'],
            'legend.fancybox': True,
            'lines.color': 'C0',
            'xaxis.labellocation': 'center',
            'xtick.alignment': 'center',
            'legend.title_fontsize': 15,
            'legend.edgecolor': '0.9',
            'animation.ffmpeg_path': 'ffmpeg',
           })

# Count once and pass x / y by keyword: seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments.
len_counts = df_1['Len'].value_counts()
sns.barplot(x=len_counts.index, y=len_counts,
            palette='inferno').set(title='Len Distribution')

In [None]:
a = df_1.groupby(['cuisine']).agg({'Len': [np.count_nonzero,np.min,np.max,np.mean,]})
a

### Reduce the Size

In [None]:
# Keep only recipes with fewer than 16 ingredients. .copy() gives df_less its
# own data, so columns added to it later are not written to a slice of df_1
# (which triggers pandas' SettingWithCopyWarning).
df_less = df_1[df_1.Len < 16].copy()
print("We have", df_less.shape[0] , "recipes with less than 16 elements, and We had " , df_1.shape[0])

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize=(8,5))

# histplot is axes-level, so the three cuisines are overlaid on the figure
# created above and share one legend (displot is figure-level: it opens a new
# figure per call and leaves this one empty). The mask is built from df_less
# itself so it lines up with the rows being plotted.
cuisine_styles = [
    ("italian", 'green', 'Italian Food'),
    ("mexican", 'coral', 'Mexican Food'),
    ("southern_us", 'blue', 'Southern US Food'),
]
for cuisine_name, colour, legend_label in cuisine_styles:
    sns.histplot(df_less.Len[df_less.cuisine == cuisine_name],
                 kde=True, bins=15, color=colour, label=legend_label)
plt.legend()

# The chart shows recipe counts per recipe length, so the labels say so.
plt.title('Distribution Plot for Recipe Length\n')

plt.xlabel('\nRecipe Length')
plt.ylabel('Number of Recipes\n');

In [None]:
df_less.groupby(['cuisine']).agg({'Len': [np.min,np.max,np.mean, np.count_nonzero]} )

In [None]:
df_less[df_less.Len < 4]

## Filtering Meat - NoMeat // Sugar - NotSugar

In [None]:
import re
def pattern_searcher(search_str:str, search_list:str):
    """Return the first part of *search_str* matched by the regex *search_list*.

    ``search_list`` is used as a regular-expression pattern (for example a set
    of alternatives joined with ``|``). The leftmost match is returned as a
    string; when nothing matches, the literal 'NA' is returned.
    """
    found = re.search(search_list, search_str)
    if found is None:
        return 'NA'
    return found.group(0)


In [None]:
# Keywords whose presence marks a recipe as containing meat or fish; they are
# joined below into a single regex alternation.
# NOTE(review): the alternation matches substrings, so short entries can fire
# inside unrelated words (e.g. 'ear' inside 'pear', 'chop' inside 'chopped'),
# while ' liver ' and ' ham ' only match with a blank on both sides — confirm
# whether word-boundary matching was intended.
# NOTE(review): 'burger' is listed twice.
meat = [' liver ', 'thigh', 'shrimp' , 'steak',  'chorizo', 'sausage', 'pork', 'kosher', 'turkey', 'fillet', 
        'fryer',  'fish', 'loin',  'serrano', 'crab', 'beef',  'burger', 'blood', 'prawn',  'chop', 'cod',  
        'duck',  'lamb', 'oyster', 'tenderloin', 'lard', 'cutlet', 'clam', 'heart', 'rabe', 'shell',  'meat',  
        'salmon', 'ear', 'trout',  "chicken", "pepperoni", "bacon", " ham " ,"veal",  "salami" ,"drumstick" , 
        "breast" ,"goose", "burger",  "lobster", "poultry", "tripe","mutton"]


# Regex alternation: matches if any one of the keywords occurs.
pattern = '|'.join(meat)

In [None]:
sweet = ['sugar', "chocolate", 'honey' ]

pattern2 = '|'.join(sweet)

In [None]:
# Tag every recipe with the first meat keyword found in its text ('NA' when
# none matches). The search runs on df_less itself, so no work is spent on the
# df_1 rows that df_less filtered out.
df_less['meat'] = df_less['final'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern))


In [None]:
# Ten most frequent meat keywords, leaving out the 'NA' (no match) bucket.
# The counts are sliced as a Series: wrapping them in a list and slicing that
# one-element list with [1:11] always gives an empty list.
a = df_less.meat.value_counts().drop('NA', errors='ignore').head(10)
a

In [None]:
# Global seaborn/matplotlib styling for the meat frequency chart.
sns.set(rc={'figure.figsize': (15, 10),
            'axes.labelsize': 15,
            'font.size': 20,
            'legend.fontsize': 12,
            'axes.titlesize': 15,
            'figure.facecolor': 'white',
            'font.family': ['sans-serif'],
            'legend.fancybox': True,
            'lines.color': 'C0',
            'xaxis.labellocation': 'center',
            'xtick.alignment': 'center',
            'legend.title_fontsize': 15,
            'legend.edgecolor': '0.9',
            'animation.ffmpeg_path': 'ffmpeg',
           })

# Drop the 'NA' bucket by label instead of assuming it is always the most
# frequent value (what slicing with [1:11] relied on), then keep the top ten.
# x / y are passed by keyword because seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments.
top_meats = df_less['meat'].value_counts().drop('NA', errors='ignore').head(10)
sns.barplot(x=top_meats.index, y=top_meats,
            palette='viridis').set(title='Ten Meets most frequency')

In [None]:
# Tag every recipe with the first sweet keyword found in its text ('NA' when
# none matches). The search runs on df_less itself, so no work is spent on the
# df_1 rows that df_less filtered out.
df_less['sugar'] = df_less['final'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern2))



In [None]:
nomeat = df_less[df_less['meat'] == "NA"]
nomeat.head(3)

In [None]:
# Global seaborn/matplotlib styling for the no-meat chart.
sns.set(rc={'figure.figsize': (30, 15),
            'axes.labelsize': 15,
            'font.size': 15,
            'legend.fontsize': 12,
            'axes.titlesize': 15,
            'figure.facecolor': 'white',
            'font.family': ['sans-serif'],
            'legend.fancybox': True,
            'lines.color': 'C0',
            'xaxis.labellocation': 'center',
            'xtick.alignment': 'center',
            'legend.title_fontsize': 15,
            'legend.edgecolor': '0.9',
            'animation.ffmpeg_path': 'ffmpeg',
           })

# Count once and pass x / y by keyword: seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments.
nomeat_counts = nomeat['cuisine'].value_counts()
sns.barplot(x=nomeat_counts.index, y=nomeat_counts,
            palette='viridis').set(title='Distribution of the NoMeet recipes by cuisine')

In [None]:
meat = df_less[df_less['meat'] != "NA"]
meat.head(3)

In [None]:
# Global seaborn/matplotlib styling for the meat-by-cuisine chart.
sns.set(rc={'figure.figsize': (30, 15),
            'axes.labelsize': 15,
            'font.size': 15,
            'legend.fontsize': 12,
            'axes.titlesize': 15,
            'figure.facecolor': 'white',
            'font.family': ['sans-serif'],
            'legend.fancybox': True,
            'lines.color': 'C0',
            'xaxis.labellocation': 'center',
            'xtick.alignment': 'center',
            'legend.title_fontsize': 15,
            'legend.edgecolor': '0.9',
            'animation.ffmpeg_path': 'ffmpeg',
           })

# Count once and pass x / y by keyword: seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments.
meat_counts = meat['cuisine'].value_counts()
sns.barplot(x=meat_counts.index, y=meat_counts,
            palette='viridis').set(title='Distribution of the Meet recipes by cuisine')

# Tagging the ingredients

In [None]:
import nltk
# nltk.download('averaged_perceptron_tagger')
def preprocess(sent):
    """Tokenise *sent* with NLTK and return its part-of-speech tagged tokens."""
    return nltk.pos_tag(nltk.word_tokenize(sent))



In [None]:
# Add a new column with the preprocess
df_less["prepros"] = df_less["final"].apply(lambda x: preprocess(x))


In [None]:
# Collect the token of every (token, tag) pair that contains 'JJ'. The check
# is tuple membership, so only an exact 'JJ' entry counts (not 'JJR'/'JJS').
jj = [
    tagged_token[0]
    for tagged_recipe in df_less["prepros"]
    for tagged_token in tagged_recipe
    if 'JJ' in tagged_token
]
jj

In [None]:
# Collect the token of every (token, tag) pair that contains 'NN'. The check
# is tuple membership, so only an exact 'NN' entry counts (not 'NNS'/'NNP').
nn = [
    tagged_token[0]
    for tagged_recipe in df_less["prepros"]
    for tagged_token in tagged_recipe
    if 'NN' in tagged_token
]
nn

In [None]:
vocabjj = Counter(jj)
vocabnn = Counter(nn)

In [None]:
vocabjj.most_common(15)

In [None]:
vocabnn.most_common(15)

In [None]:
df_less

# Searching for Brands

In [None]:
import spacy
from spacy import displacy
from collections import Counter

## Spanish corpus (https://spacy.io/models/es#es_core_news_md)
# python -m spacy download es
import es_core_news_sm
nlp_sp = es_core_news_sm.load()
nlp_sp

## English corpus
#python -m spacy download en
import en_core_web_sm
nlp = en_core_web_sm.load()
nlp

In [None]:
df_less["prepros2"] = df_less["final"].apply(lambda x: nlp(x))


In [None]:
# Run spaCy NER over each recipe's ingredient text and collect
# (entity text, entity label) pairs.
# dropna(): an ingredient string that was empty when written to CSV is read
# back as NaN, which nlp() cannot process.
a = []
for row in df_less["ingredients"].dropna():
    doc = nlp(row)
    a.extend((X.text, X.label_) for X in doc.ents)


In [None]:
a

In [None]:
# Show the (text, label) pairs that contain 'ORG'.
org_entities = [entity for entity in a if 'ORG' in entity]
print(org_entities)

# Vectorizer and Clustering

## Clustering without a Cuisine Type

In [None]:
# Bag-of-words counts over the final recipe texts (full vocabulary).
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df_less['final'])

train_vectors

# Densify the count matrix so it can be wrapped in a DataFrame.
# NOTE(review): this materialises the whole matrix in memory — confirm it
# fits for the full dataset.
a = train_vectors.todense()

unidades_datos = np.array(a)
unidades_datos


In [None]:
# I transform the array into a DF 

In [None]:
new_df = pd.DataFrame(unidades_datos)

In [None]:
new_df

In [None]:
from sklearn.cluster import KMeans
inertia =[]

In [None]:


# Elbow method: fit KMeans for k = 1..9 and record each model's inertia.

for i in range(1,10):
    kmeans = KMeans(n_clusters= i,random_state = 43)
    kmeans.fit(new_df)
    inertia.append(kmeans.inertia_)

In [None]:
from kneed import KneeLocator
kl = KneeLocator(range(1, 10), inertia, curve="convex", direction="decreasing")
print('The elbow point of your model is:', kl.elbow)

In [None]:
kmeansmodel = KMeans(n_clusters= 4, random_state=0)
y_kmeans= kmeansmodel.fit_predict(new_df)

In [None]:
df_less['cluster_id'] = y_kmeans
df_less

In [None]:
df_less['cluster_id'].value_counts()

In [None]:
c2= df_less[df_less['cluster_id'] == 2]

In [None]:
c2['cuisine'].value_counts()

In [None]:
c2.groupby(['cuisine'])["Len"].describe()

In [None]:
c0= df_less[df_less['cluster_id'] == 0]

In [None]:
c0.groupby(['cuisine'])["Len"].describe()

In [None]:
c0['cuisine'].value_counts()

In [None]:
c1= df_less[df_less['cluster_id'] == 1]

In [None]:
c1.groupby(['cuisine'])["Len"].describe()

In [None]:
c1= df_less[df_less['cluster_id'] == 1]
c2= df_less[df_less['cluster_id'] == 2]
c3= df_less[df_less['cluster_id'] == 3]
c0= df_less[df_less['cluster_id'] == 0]

In [None]:
# One word cloud per cluster, each drawn with its own colour map.
# The recipe texts are joined explicitly: str() on a NumPy array with more
# than 1000 elements abbreviates it with '...', which would leave only the
# first and last few recipes in the cloud.
cluster_colormaps = [(c0, 'gray'), (c1, 'hot'), (c2, 'winter'), (c3, 'Greens')]
english_stopwords = stopwords.words('english')
for cluster_recipes, colormap in cluster_colormaps:
    cluster_text = ' '.join(cluster_recipes.final.astype(str))
    wordcloud = WordCloud(max_font_size=200, max_words=20, background_color="white",
                          width= 300, height = 200,
                          stopwords = english_stopwords).generate(cluster_text)

    plot_wordcloud(wordcloud.recolor(colormap=colormap, random_state=17), '\nWords')

## Clustering with cuisine

In [None]:
from sklearn.preprocessing import LabelEncoder
# Step 1. Instantiate the model (label encoding)
lb_make = LabelEncoder() 

# Step 2. Fit the variable to the instantiated model
new_df['Country'] = lb_make.fit_transform(df_less['cuisine'])

new_df

In [None]:
new_df.Country.value_counts()

In [None]:
from sklearn import preprocessing

# new_df mixes integer column names (the count-vector positions) with the
# string 'Country'; scikit-learn >= 1.2 rejects frames whose column names are
# not all strings. A shallow copy shares the data with new_df but carries its
# own column index, so new_df keeps its original labels.
X = new_df.copy(deep=False)
X.columns = X.columns.astype(str)

# Rescale every feature to the [0, 1] range.
scaler = preprocessing.MinMaxScaler()

df_sc = scaler.fit_transform(X)

df_sc

In [None]:
from sklearn.cluster import KMeans
inertia =[]

# Elbow method: fit KMeans for k = 1..9 and record each model's inertia.
# The models are fitted on the scaled matrix (df_sc), the same data the final
# KMeans model is fitted on, rather than on the unscaled new_df.
for i in range(1,10):
    kmeans = KMeans(n_clusters= i,random_state = 43)
    kmeans.fit(df_sc)
    inertia.append(kmeans.inertia_)

In [None]:
df_sc = pd.DataFrame(df_sc, columns = X.columns) 

In [None]:
from kneed import KneeLocator
kl = KneeLocator(range(1, 10), inertia, curve="convex", direction="decreasing")
print('The elbow point of your model is:', kl.elbow)

In [None]:
kmeansmodel = KMeans(n_clusters= 6, random_state=0)
y_kmeans= kmeansmodel.fit_predict(df_sc)

In [None]:
df_less['cluster_id'] = y_kmeans
df_less

In [None]:
# Cuisine frequencies inside the c2 subset (method name is value_counts).
c2.cuisine.value_counts()

# Topic Modeling

In [None]:
text_list = [i.split(' ') for i in df_less.final if i != ',']

In [None]:
text_list

In [None]:
# Remove the ',' separator tokens (and empty strings) from every tokenised
# recipe. The filter has to look at the tokens inside each document: a
# condition such as `i != ',' or ', '` is always true, because the non-empty
# string ', ' is truthy.
text_list = [[token for token in doc if token not in ('', ',')] for doc in text_list]

In [None]:
# Step 1. Build our own dictionary, and save it for future use
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus,
# where every unique term is assigned an index.
dictionary = corpora.Dictionary(text_list)
# Persist the dictionary to disk so later cells can reload it.
dictionary.save('dictionary.dict')
print (dictionary)

In [None]:
# Step 2. Vectorize the characters

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_list]


In [None]:
# Step 2.2. Create and save the gensim Corpus from the processed dictionary
corpora.MmCorpus.serialize('corpus_s.mm', doc_term_matrix)

print (len(doc_term_matrix))
print (doc_term_matrix[100])

In [None]:
# Step 3. Perform the LDA model
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix
# (10 topics, 50 passes over the corpus).
# NOTE(review): no random_state is passed, so the topics presumably differ
# between runs — confirm whether reproducibility matters here.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, 
               passes=50)

In [None]:
ldamodel.print_topics(num_topics= 10, num_words = 19)

In [None]:
## Let's save the model for the future 
ldamodel.save('topic.model')
## load saved model
from gensim.models import LdaModel
model_loaded = LdaModel.load('topic.model')

In [None]:
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
import pyLDAvis.gensim
import gensim
pyLDAvis.enable_notebook()

# Reload the dictionary, corpus and model that earlier cells saved to disk.
d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus_s.mm')
lda = gensim.models.LdaModel.load('topic.model')


# NOTE(review): this rebinds `data`, which earlier cells used for the raw JSON
# and for the cleaned DataFrame copy.
data = pyLDAvis.gensim.prepare(lda, c, d)

In [None]:
data

# WordCloud

In [None]:
italian = df_less[df_less.cuisine == "italian"]
mexican = df_less[df_less.cuisine == "mexican"]
southern_us = df_less[df_less.cuisine == "southern_us"]
indian = df_less[df_less.cuisine == "indian"]
chinese = df_less[df_less.cuisine == "chinese"]
french = df_less[df_less.cuisine == "french"]
cajun_creole = df_less[df_less.cuisine == "cajun_creole"]
thai  = df_less[df_less.cuisine == "thai"]
japanese= df_less[df_less.cuisine == "japanese"]
greek = df_less[df_less.cuisine == "greek"]
spanish = df_less[df_less.cuisine == "spanish"]
korean = df_less[df_less.cuisine == "korean"]
vietnamese = df_less[df_less.cuisine == "vietnamese"]
moroccan = df_less[df_less.cuisine == "moroccan"]
british = df_less[df_less.cuisine == "british"]
filipino = df_less[df_less.cuisine == "filipino"]
irish  = df_less[df_less.cuisine == "irish"]
jamaican = df_less[df_less.cuisine == "jamaican"]
russian = df_less[df_less.cuisine == "russian"]
brazilian = df_less[df_less.cuisine == "brazilian"] 

In [None]:
import io


def _load_mask(url, timeout=30):
    """Download the image at *url* and return it as a NumPy array.

    Args:
        url: address of the image file.
        timeout: seconds to wait for the server before giving up.

    Returns:
        numpy.ndarray with the decoded image.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    # response.content is the fully downloaded, content-decoded body; the
    # .raw stream is not decoded and depends on the connection staying open.
    return np.array(Image.open(io.BytesIO(response.content)))


# Country silhouettes used as word-cloud masks.
ita = _load_mask('https://img2.freepng.es/20180404/azq/kisspng-italy-vector-map-the-seven-wonders-5ac55fcd89ce32.5470960615228845575645.jpg')
mex = _load_mask('https://cdn1.vectorstock.com/i/1000x1000/55/10/mexico-solid-black-silhouette-map-of-country-vector-21655510.jpg')
sou = _load_mask('https://st2.depositphotos.com/2567911/8292/v/950/depositphotos_82920302-stock-illustration-black-silhouette-map-of-united.jpg')


ind = _load_mask('https://w7.pngwing.com/pngs/693/612/png-transparent-india-blank-map-line-art-monochrome-india-world.png')
chi = _load_mask('https://cdn2.vectorstock.com/i/1000x1000/63/06/map-of-china-vector-20546306.jpg')
#french = df_1[df_1.cuisine == "french"]
#cajun_creole = df_1[df_1.cuisine == "cajun_creole"]
#thai  = df_1[df_1.cuisine == "thai"]
#japanese= df_1[df_1.cuisine == "japanese"]
#greek = df_1[df_1.cuisine == "greek"]
#spanish = df_1[df_1.cuisine == "spanish"]
#korean = df_1[df_1.cuisine == "korean"]
#vietnamese = df_1[df_1.cuisine == "vietnamese"]
#moroccan = df_1[df_1.cuisine == "moroccan"]
#british = df_1[df_1.cuisine == "british"]
#filipino = df_1[df_1.cuisine == "filipino"]
#irish  = df_1[df_1.cuisine == "irish"]
#jamaican = df_1[df_1.cuisine == "jamaican"]
#russian = df_1[df_1.cuisine == "russian"]
#brazilian = = np.array(Image.open(requests.get('https://i.pinimg.com/originals/f2/94/4f/f2944f132ac16b8dedc74d3f77249420.jpg', stream=True).raw))

In [None]:
from wordcloud import WordCloud, STOPWORDS 

In [None]:
def plot_wordcloud(wordcloud, language):
    """Display *wordcloud* on a new 12x10 figure without axes.

    The figure title is *language* followed by ' Comments' and a newline,
    drawn in bold at font size 18.
    """
    title_text = language + ' Comments\n'
    title_style = {'fontsize': 18, 'fontweight': 'bold'}

    plt.figure(figsize=(12, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title_text, **title_style)
    plt.show()

In [None]:
# Build the cloud from every Italian recipe. The texts are joined explicitly:
# str() on a NumPy array with more than 1000 elements abbreviates it with
# '...', which would leave only the first and last few recipes in the cloud.
italian_text = ' '.join(italian.final.astype(str))
wordcloud = WordCloud(max_font_size=200, max_words=50, background_color="white",
                      mask=ita ,width= 3000, height = 2000,
                      stopwords = stopwords.words('english')).generate(italian_text)

plot_wordcloud(wordcloud.recolor( colormap= 'RdYlGn' , random_state=17), '\nWords')

In [None]:
# Build the cloud from every Mexican recipe. The texts are joined explicitly:
# str() on a NumPy array with more than 1000 elements abbreviates it with
# '...', which would leave only the first and last few recipes in the cloud.
mexican_text = ' '.join(mexican.final.astype(str))
wordcloud = WordCloud(max_font_size=200, max_words=50, background_color="white",
                      mask=mex ,width= 3000, height = 2000,
                      stopwords = stopwords.words('english')).generate(mexican_text)

plot_wordcloud(wordcloud.recolor( colormap= 'YlGn' , random_state=17), '\nWords')

In [None]:
# Build the cloud from every Southern US recipe. The texts are joined
# explicitly: str() on a NumPy array with more than 1000 elements abbreviates
# it with '...', which would leave only the first and last few recipes.
southern_us_text = ' '.join(southern_us.final.astype(str))
wordcloud = WordCloud(max_font_size=200, max_words=50, background_color="white",
                      mask=sou ,width= 3000, height = 2000,
                      stopwords = stopwords.words('english')).generate(southern_us_text)

plot_wordcloud(wordcloud.recolor( colormap= 'RdBu' , random_state=17), '\nWords')

In [None]:
# Build the cloud from every Indian recipe. The texts are joined explicitly:
# str() on a NumPy array with more than 1000 elements abbreviates it with
# '...', which would leave only the first and last few recipes in the cloud.
indian_text = ' '.join(indian.final.astype(str))
wordcloud = WordCloud(max_font_size=200, max_words=50, background_color="white",
                      mask=ind ,width= 3000, height = 2000,
                      stopwords = stopwords.words('english')).generate(indian_text)

plot_wordcloud(wordcloud.recolor( colormap= 'RdYlGn' , random_state=17), '\nWords')

In [None]:
# Build the cloud from every Chinese recipe. The texts are joined explicitly:
# str() on a NumPy array with more than 1000 elements abbreviates it with
# '...', which would leave only the first and last few recipes in the cloud.
chinese_text = ' '.join(chinese.final.astype(str))
wordcloud = WordCloud(max_font_size=200, max_words=50, background_color="white",
                      mask=chi ,width= 3000, height = 2000,
                      stopwords = stopwords.words('english')).generate(chinese_text)

plot_wordcloud(wordcloud.recolor( colormap= 'hot' , random_state=17), '\nWords')

# Bag of words