In [None]:
import pandas as pd
import json
import collections as cl
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from gensim.utils import simple_preprocess
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import xgboost
import warnings

In [None]:
# Load the training recipes from JSON into a DataFrame and display it as the cell output.
df = pd.read_json('../input/train.json')
df

In [None]:
# Boolean frame marking null cells; the unique values of each column show
# whether any entry is missing (a lone False means none are).
missing_values = pd.isnull(df)
print("MISSING VALUES IN ID: ", missing_values.id.unique())
print("MISSING VALUES IN CUISINE:", missing_values.cuisine.unique())
print("MISSING VALUES IN INGREDIENTS: ", missing_values.ingredients.unique())
print("------------------------------------------------------------")
# dtype pandas inferred for each column.
print("DATA TYPE OF ID:", df.id.dtype)
print("DATA TYPE OF Cuisine:", df.cuisine.dtype)
print("DATA TYPE OF Ingredients:", df.ingredients.dtype)

In [None]:
# ---- Top 10 ingredients across all recipes, drawn as a donut chart ----
# Count every ingredient occurrence over all recipes.
cnt = cl.Counter(ingredient for recipe in df['ingredients'] for ingredient in recipe)
# most_common() yields (ingredient, count) pairs ordered by descending count.
list_keyval = cnt.most_common()
top_10_ingredients = list_keyval[:10]
keys = [name for name, _ in top_10_ingredients]
values = [count for _, count in top_10_ingredients]
print(top_10_ingredients)

# Offset only the first (largest) slice. Sized from the data so the chart
# still renders when fewer than ten distinct ingredients exist.
explode = [0.1 if position == 0 else 0 for position in range(len(values))]
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(aspect="equal"))
ax.pie(values, explode=explode, labels=keys,
       autopct='%1.1f%%', shadow=True, startangle=140)
# A white disc over the centre turns the pie into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(my_circle)
ax.set_title("Plot for Top 10 ingredients used")
plt.show()

# ---- Top 5 cuisines by number of recipes ----
print('Top 5 Cuisines')
count_cu = cl.Counter(df['cuisine'])
list_cui = count_cu.most_common(5)
keys_cui = [name for name, _ in list_cui]
values_cui = [count for _, count in list_cui]
print(keys_cui)
print(values_cui)

In [None]:
# Recipes per cuisine: bar chart first, then the same counts as the cell's output.
cuisine_counts = df['cuisine'].value_counts()
ax = cuisine_counts.plot(kind='bar')

ax.set_xlabel("CUISINES")
ax.set_ylabel("FREQUENCY OF CUISINES")
plt.show()
cuisine_counts

In [None]:
def strip_punctuation(s):
    """Return `s` with every character listed in `string.punctuation` removed."""
    return ''.join(filter(lambda ch: ch not in punctuation, s))
# English stop words held in a set for quick membership tests during cleaning.
stop_words = set(stopwords.words('english'))
# Shared WordNet lemmatizer used by process() below.
lemmatizer = WordNetLemmatizer()
def process(x):
    """Lemmatize every space-separated token of each string in `x`.

    Each input string is split on single spaces, every token is passed through
    the module-level `lemmatizer`, and the tokens are re-joined with single
    spaces. Returns a new list with one output string per input string, in the
    same order.
    """
    return [
        " ".join(lemmatizer.lemmatize(token) for token in phrase.split(" "))
        for phrase in x
    ]

# Build one cleaned record per recipe and persist the whole list as JSON.
cleaned_records = []
for recipe_id, recipe_cuisine, raw_ingredients in zip(df.id, df.cuisine, df.ingredients):
    filtered_ingredients = []
    for w in raw_ingredients:
        # Lower-case first so the stop-word test is case-insensitive.
        w = w.lower()
        # NOTE(review): the test compares the whole ingredient string, so
        # multi-word ingredients that merely contain a stop word pass through
        # unchanged - confirm whether token-level filtering was intended.
        if w in stop_words:
            continue
        w = strip_punctuation(w)
        # Drop entries that are purely numeric once punctuation is removed.
        if w.isdigit():
            continue
        filtered_ingredients.append(w)
    si = process(filtered_ingredients)
    cleaned_records.append({
        # int() converts the pandas scalar to a plain int that json can serialise.
        'id': int(recipe_id),
        'cuisine': recipe_cuisine,
        'ingredients': raw_ingredients,
        'stemmedingredients': si,
    })
with open('cleaned_train.json', mode='w') as json_file:
    json.dump(cleaned_records, json_file)

In [None]:
# Read the cleaned records back from 'cleaned_train.json' into a DataFrame and display it.
df_clean = pd.read_json('cleaned_train.json')
df_clean

In [None]:
# For every cuisine (in order of first appearance) pool the stemmed ingredients
# of all its recipes and record how many distinct ingredients it uses.
cuisine = pd.unique(df_clean.cuisine)
cuisine_ingredients = []
freq = []
for cs in cuisine:
    # One boolean-mask selection per cuisine instead of a scalar lookup per row.
    recipes_for_cs = df_clean.loc[df_clean.cuisine == cs, 'stemmedingredients']
    ingredient = [item for recipe in recipes_for_cs for item in recipe]
    cuisine_ingredients.append(ingredient)
    freq.append(len(set(ingredient)))

In [None]:
warnings.filterwarnings('ignore')
plt.figure(figsize = (12, 8))
# x/y are passed by keyword: current seaborn releases reject positional data vectors.
# A single `color` replaces the one-entry palette (palette without hue is deprecated).
g = sns.barplot(x = cuisine, y = freq, color = "blue", alpha = 0.7)
g.set_xticklabels(cuisine, rotation = 90, fontsize = 10, fontname = 'DejaVu Sans')
g.set(xlabel = 'Cuisines', ylabel = 'Number of different Ingredients')
g.set_title('Number of distinct Ingredients used in different Cuisines',fontname = 'fantasy', fontsize = 'xx-large', color = 'midnightblue', fontweight = 'bold')
plt.show()

In [None]:
# Number of recipes per cuisine, used as the denominator below.
cuisine_recipes = df_clean.cuisine.value_counts()
# For each cuisine: occurrences of every ingredient divided by that cuisine's
# recipe count, ordered by descending count (value_counts order).
freq_list = [
    pd.Series(pooled).value_counts().divide(cuisine_recipes[name])
    for name, pooled in zip(cuisine, cuisine_ingredients)
]

In [None]:
# Print the five leading entries of each cuisine's frequency Series and keep
# their values (g1) and labels (g2) for the plotting cell that follows.
g1 = []
g2 = []
for name, fl in zip(cuisine, freq_list):
    print("\nCuisine: ", name.upper(), '\n')
    g1.append(list(fl[0:5]))
    g2.append(list(fl.index[0:5]))
    for rank in range(1, 6):
        print(rank, '.', fl.index[rank - 1].upper())

In [None]:
warnings.filterwarnings('ignore')
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
plt.figure(figsize = (20, 20))
# One horizontal-bar panel per cuisine on a fixed 5x4 grid; anything beyond
# the 20 available panels is not drawn.
n_rows, n_cols = 5, 4
panels = list(zip(g1, g2))[:n_rows * n_cols]
for j, (shares, names) in enumerate(panels):
    plt.subplot(n_rows, n_cols, j+1)
    # Data vectors are passed by keyword: current seaborn rejects positional x/y.
    g = sns.barplot(x = [100*x for x in shares], y = names, color = "yellowgreen", alpha = 0.3)
    g.set(yticklabels = [])
    # Write each ingredient name inside its own bar instead of on the y axis.
    for i, label in enumerate(names):
        g.text(5, i+0.1, s = label, fontname = 'fantasy', color = "midnightblue", fontstyle = 'oblique', fontsize = 'x-large')
    g.set_title(cuisine[j].upper(), fontsize = 15, fontname = 'fantasy', fontweight = "bold")
plt.suptitle("MOST USED INGREDIENTS", fontname = 'fantasy', fontsize = 'xx-large', color = 'mediumvioletred', fontweight = 'bold')
plt.show()

In [None]:
# Turn each cuisine's ingredient shares into specificity percentages:
# share in this cuisine divided by the ingredient's summed share over all cuisines.
t = pd.DataFrame(pd.concat(freq_list))
usage = t.groupby(t.index).sum()
# `usage` has a single value column; take it as a Series indexed by ingredient.
total_usage = usage.iloc[:, 0]
for i, shares in enumerate(freq_list):
    # Label-aligned, vectorised division - avoids integer-position access on a
    # string-indexed Series, which pandas has deprecated.
    specificity = shares / total_usage.reindex(shares.index)
    # Ingredients whose share in this cuisine is below 0.1 are zeroed out.
    specificity = specificity.where(shares >= 0.1, 0)
    freq_list[i] = 100 * specificity.sort_values(ascending = False)
freq_list

In [None]:
# Same listing as for the most-used ingredients, now reading whatever
# freq_list currently holds (in notebook order: the specificity percentages).
g1 = []
g2 = []
for name, fl in zip(cuisine, freq_list):
    print("\nCuisine: ", name.upper(), '\n')
    g1.append(list(fl[0:5]))
    g2.append(list(fl.index[0:5]))
    for rank in range(1, 6):
        print(rank, '.', fl.index[rank - 1].upper())

In [None]:
warnings.filterwarnings('ignore')
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
plt.figure(figsize = (20, 20))
# One horizontal-bar panel per cuisine on a fixed 5x4 grid; anything beyond
# the 20 available panels is not drawn.
n_rows, n_cols = 5, 4
panels = list(zip(g1, g2))[:n_rows * n_cols]
for j, (scores, names) in enumerate(panels):
    plt.subplot(n_rows, n_cols, j+1)
    # Data vectors are passed by keyword: current seaborn rejects positional x/y.
    g = sns.barplot(x = scores, y = names, color = "mediumvioletred", alpha = 0.3)
    g.set(yticklabels = [])
    # Write each ingredient name inside its own bar instead of on the y axis.
    for i, label in enumerate(names):
        g.text(5, i+0.1, s = label, fontname = 'fantasy', color = "midnightblue", fontstyle = 'oblique', fontsize = 'x-large')
    g.set_title(cuisine[j].upper(), fontsize = 15, fontname = 'fantasy', fontweight = "bold")
plt.suptitle("MOST CUISINE-SPECIFIC INGREDIENTS", fontname = 'fantasy', fontsize = 'xx-large', color = 'mediumvioletred', fontweight = 'bold')
plt.show()

In [None]:
# Distinct stemmed ingredients over all recipes, de-duplicated in one pass.
# The resulting list order is arbitrary (it comes from a set).
unique_ingredients = list({
    item
    for recipe in df_clean['stemmedingredients']
    for item in recipe
})

**Data Transformation using TfidfVectorizer**

In [None]:
# One document per recipe: every stemmed ingredient followed by a single space,
# so each non-empty document ends with a trailing space.
docs = [
    "".join(ingredient + " " for ingredient in ingredients)
    for ingredients in df_clean.stemmedingredients
]

In [None]:
# Fit a TfidfVectorizer (default settings) on the recipe documents and encode them as the feature matrix.
vectorizer = TfidfVectorizer()
X_transformed = vectorizer.fit_transform(docs)

In [None]:
# Hold out 30% of the TF-IDF rows for evaluation. Labels are taken from df_clean,
# the frame the features were built from, and the split is seeded so re-runs
# evaluate on the same partition.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, df_clean.cuisine, test_size = 0.30, random_state = 42)

In [None]:
# Linear-kernel SVM on the TF-IDF features (SVC comes from the notebook's import cell).
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

In [None]:
# Score the fitted SVM on the held-out rows: overall accuracy, then the per-class report.
y_pred = svclassifier.predict(X_test)  
print("SVM ACCURACY:",accuracy_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

In [None]:
def model_train(classifier, train_vector, label, train_valid, is_neural_net=False, valid_label=None):
    """Fit `classifier` and return its accuracy on a validation set.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``.
    train_vector : training features passed to ``fit``.
    label : training labels passed to ``fit``.
    train_valid : validation features passed to ``predict``.
    is_neural_net : bool, default False
        When True the predictions are reduced with ``argmax(axis=-1)`` before scoring.
    valid_label : optional
        True labels for `train_valid`. When omitted, the notebook-level
        ``y_test`` is used, which keeps existing four-argument calls working.

    Returns
    -------
    The value of ``accuracy_score(valid_label, predictions)``.
    """
    if valid_label is None:
        # Legacy fallback: score against whatever `y_test` is bound in the notebook
        # at call time. Pass `valid_label` explicitly to avoid depending on that.
        valid_label = y_test

    # fit the training dataset on the classifier
    classifier.fit(train_vector, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(train_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return accuracy_score(valid_label, predictions)

In [None]:
# Multinomial Naive Bayes (fit_prior=False) trained and scored through model_train on the current split.
accuracy = model_train(naive_bayes.MultinomialNB(fit_prior = False), X_train, y_train, X_test)
print("Naive Baye's Accuracy ", accuracy)

In [None]:
# Random forest of 500 trees with max_features=10, fitted on the current training split
# and scored on the held-out rows.
forest=RFC(n_estimators=500, max_features = 10)
forest.fit(X_train,y_train)
output=forest.predict(X_test)
print("RANDOM FORESTS ACCURACY:",accuracy_score(y_test, output))

In [None]:
# XGBoost with default settings, trained and scored through model_train; the feature
# matrices are converted with .tocsc() before being handed over.
# NOTE(review): y_train holds the cuisine labels as read from the data; recent xgboost
# releases reportedly require integer-encoded class labels - confirm against the
# installed version.
warnings.filterwarnings('ignore')
accuracy = model_train(xgboost.XGBClassifier(), X_train.tocsc(), y_train, X_test.tocsc())
print ("XG Boost Accuracy: ", accuracy)

**Data Transformation using CountVectorizer**

In [None]:
# Count-based features capped at max_features=1000. Each document is a recipe's stemmed
# ingredients joined by single spaces. This rebinds `docs` and `X_transformed`.
vect = CountVectorizer(max_features = 1000)
train_ingr = df_clean['stemmedingredients']
docs = [' '.join(x) for x in train_ingr]
X_transformed = vect.fit_transform(docs)

In [None]:
# Hold out 30% of the count-vector rows for evaluation. Labels are taken from df_clean,
# the frame the features were built from, and the split is seeded so re-runs
# evaluate on the same partition.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, df_clean.cuisine, test_size = 0.30, random_state = 42)

In [None]:
# Random forest of 500 trees fitted on the current training split.
forest = RFC(n_estimators = 500)
forest = forest.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted forest on the held-out rows, printed as a percentage.
y_pred = forest.predict(X_test)
print("Random Forest Accuracy using CountVectorizer-",100 * accuracy_score(y_test, y_pred),"%")

In [None]:
# Multi-layer perceptron with one hidden layer of 15 units (lbfgs solver, random_state=1),
# fitted on the current training split and scored on the held-out rows.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Neural Net accuracy using CountVectorizer:",100 * accuracy_score(y_test, y_pred),'%')

In [None]:
# Per-class report for the current y_pred (in notebook order: the MLP predictions).
print(classification_report(y_test,y_pred))

In [None]:
# Linear-kernel SVM fitted on the current training split; predictions are kept in y_pred.
svclassifier = SVC(kernel='linear')  
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

In [None]:
# Overall accuracy and per-class report for the current y_pred (in notebook order: the SVM predictions).
print("Accuracy using SVM: ",accuracy_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

**Data Transformation using Doc2Vec**

In [None]:
# Flatten each recipe's stemmed ingredients into one string (every ingredient
# preceded by a single space) and pair it, by index, with the cuisine label.
recipes = pd.Series(
    ["".join(" " + r_ingredient for r_ingredient in recipe)
     for recipe in df_clean.stemmedingredients]
).rename("ingredients")
data = pd.DataFrame(df_clean.cuisine).join(recipes)
data

In [None]:
def tag_docs(docs):
    """Convert each row of `docs` into a gensim TaggedDocument.

    For every row, the 'ingredients' text is tokenised with `simple_preprocess`
    and the row's `cuisine` value becomes the document's only tag. Returns the
    result of the row-wise ``DataFrame.apply``.
    """
    def _to_tagged(row):
        tokens = simple_preprocess(row['ingredients'])
        return TaggedDocument(words=tokens, tags=[row.cuisine])

    return docs.apply(_to_tagged, axis=1)

In [None]:
def train_doc2vec_model(tagged_docs):
    """Train and return a Doc2Vec model on the given tagged documents.

    `tagged_docs` must expose ``.values`` yielding the TaggedDocument objects
    (e.g. the Series returned by `tag_docs`). The model is built with
    vector_size=650, window=1, epochs=20, dm=1 and four worker threads.
    """
    sents = tagged_docs.values
    # `workers` (plural) is the Doc2Vec keyword for the number of training threads.
    doc2vec_model = Doc2Vec(sents, vector_size=650, window=1, epochs=20, dm=1, workers=4)
    return doc2vec_model

In [None]:
def vec_for_learning(doc2vec_model, tagged_docs):
    """Infer a vector for every tagged document and return labels and features.

    Parameters
    ----------
    doc2vec_model : trained model exposing ``infer_vector``.
    tagged_docs : object exposing ``.values`` with documents that have
        ``.words`` and ``.tags``; the first tag is used as the target.

    Returns
    -------
    (targets, regressors) : two tuples of equal length - the first tag of each
    document and the vector inferred for it.
    """
    sents = tagged_docs.values
    # `epochs` is the inference-iteration keyword; gensim 4 no longer accepts
    # the older `steps` spelling.
    targets, regressors = zip(*[(doc.tags[0], doc2vec_model.infer_vector(doc.words, epochs=200)) for doc in sents])
    return targets, regressors

In [None]:
# 70/30 split of the (cuisine, ingredients) frame, seeded so re-runs use the same partition.
train_data, test_data = train_test_split(data, test_size = 0.3, random_state = 42)
train_tagged = tag_docs(train_data)
test_tagged = tag_docs(test_data)
# Doc2Vec is trained on the training portion only; vectors are then inferred for both portions.
model = train_doc2vec_model(train_tagged)
y_train, X_train = vec_for_learning(model, train_tagged)
y_test, X_test = vec_for_learning(model, test_tagged)

In [None]:
# Multinomial logistic regression (newton-cg solver) fitted on the current training split
# and scored on the held-out rows; accuracy is printed as a percentage.
logreg = LogisticRegression(solver = 'newton-cg', multi_class = 'multinomial')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Accuracy for Logistic Regression: ", accuracy_score(y_test, y_pred) * 100, "%")

In [None]:
# Per-class report for the current y_pred (in notebook order: the logistic-regression predictions).
print("Logistic Regression")
print(classification_report(y_test,y_pred)) 

In [None]:
# Linear-kernel SVM fitted on the current training split and scored on the held-out rows;
# accuracy is printed as a percentage.
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("Accuracy for SVM: ", accuracy_score(y_test, y_pred) * 100, "%")

In [None]:
# Per-class report for the current y_pred (in notebook order: the SVM predictions).
print("Support Vector Machine")
print(classification_report(y_test,y_pred)) 

In [None]:
# Random forest of 1000 trees (max_features='log2', gini criterion) fitted on the current
# training split and scored on the held-out rows; accuracy is printed as a percentage.
forest = RFC(n_estimators = 1000, max_features = 'log2', criterion = 'gini')
forest.fit(X_train,y_train)
y_pred = forest.predict(X_test)
print("Accuracy for Random Forest: ", accuracy_score(y_test, y_pred) * 100, "%")

In [None]:
# Per-class report for the current y_pred (in notebook order: the random-forest predictions).
print("Random Forest Classifier")
print(classification_report(y_test,y_pred)) 