<h1>Load the DataFrame</h1>

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display

PATH = "/home/gak5128/Documents/KaggleProject/"
df = pd.read_csv(PATH + "train.csv", index_col="RecipeId")
df.target = df.Target.astype('category')
df.head()

Unnamed: 0_level_0,Target,Ingredients
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1
5106,7,whole kernel corn drain jalapeno chilies sour ...
5107,7,yellow peppers roasted red peppers green onion...
5108,7,avocado fresh lime juice chile powder tabasco ...
5109,7,black beans frozen chopped spinach frozen corn...
5110,1,bacon boneless beef chuck roast onions baby ca...


<h1>Output most common words by cuisine

In [10]:
import operator

ITEMS = 15
cuisines=["","French","Italian","Indian","Chinese","Thai","Greek","Mexican"]
v = CountVectorizer(ngram_range=(1,3),max_features=ITEMS)
for i in range(1,8):
    df_x = v.fit_transform(df[df.Target == i].Ingredients.values.astype('U'))
    print("\nTop",ITEMS,"Ingredients for",cuisines[i],"Recipes")
    n = 1
    for ingr in sorted(v.vocabulary_.items(), key=operator.itemgetter(1)):
        print(n,ingr[0])
        n += 1


Top 15 Ingredients for French Recipes
1 butter
2 cheese
3 cream
4 eggs
5 flour
6 french
7 fresh
8 garlic
9 ground
10 oil
11 olive
12 olive oil
13 pepper
14 salt
15 sugar

Top 15 Ingredients for Italian Recipes
1 black
2 cheese
3 chicken
4 fresh
5 garlic
6 ground
7 italian
8 oil
9 olive
10 olive oil
11 parsley
12 pepper
13 red
14 salt
15 tomatoes

Top 15 Ingredients for Indian Recipes
1 chili
2 coriander
3 cumin
4 fresh
5 garlic
6 ginger
7 ground
8 masala
9 oil
10 onions
11 pepper
12 powder
13 red
14 salt
15 seeds

Top 15 Ingredients for Chinese Recipes
1 chicken
2 fresh
3 garlic
4 ginger
5 oil
6 onions
7 pepper
8 rice
9 salt
10 sauce
11 sesame
12 sesame oil
13 soy
14 soy sauce
15 sugar

Top 15 Ingredients for Thai Recipes
1 chicken
2 coconut
3 fish
4 fish sauce
5 fresh
6 garlic
7 lime
8 oil
9 pepper
10 red
11 rice
12 salt
13 sauce
14 sugar
15 thai

Top 15 Ingredients for Greek Recipes
1 black
2 cheese
3 feta
4 feta cheese
5 fresh
6 garlic
7 ground
8 lemon
9 oil
10 olive
11 olive oil
1

<h1>Order ingredients by the [number of times they appear in a recipe] times [inverse of the number of recipes they appear in]</h1>

In [11]:
ITEMS = 50
v = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,3),max_features=ITEMS)
df_x = v.fit_transform(df.Ingredients.values.astype('U'))
idf = v.idf_

topIngredients = pd.DataFrame({"Ingredient": v.get_feature_names(),"TfIdf":idf})
print("Top",ITEMS,"Ingredients")
topIngredients.sort_values(by="TfIdf",ascending=False)

Top 50 Ingredients


Unnamed: 0,Ingredient,TfIdf
39,sesame,3.315497
23,leaf,3.184574
24,leaves,3.178932
2,boneless,3.169029
45,vegetable oil,3.133259
10,coriander,3.126255
49,wine,3.124205
14,eggs,3.116451
25,lemon,3.058413
12,cream,3.030796


<h1>Perform same analysis after stemming words</h1>

In [12]:
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('punkt') # Download the required nltk data files

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

ITEMS = 50
v = TfidfVectorizer(sublinear_tf=True, tokenizer=tokenize, stop_words='english', ngram_range=(1,3),max_features=ITEMS)
df_x = v.fit_transform(df.Ingredients.values.astype('U'))
idf = v.idf_

topIngredients = pd.DataFrame({"Ingredient": v.get_feature_names(),"TfIdf":idf})
print("Top",ITEMS,"Ingredients")
topIngredients.sort_values(by="TfIdf",ascending=False)

[nltk_data] Downloading package punkt to /home/gak5128/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Top 50 Ingredients


Unnamed: 0,Ingredient,TfIdf
39,seed,3.256742
25,leav,3.136986
45,veget oil,3.133259
11,coriand,3.126255
49,wine,3.124205
26,lemon,3.058413
13,cream,3.029677
6,chicken breast,3.007561
28,milk,2.998127
17,flour,2.996322
