In [2]:
################################ Setup #####################################

model_name = "model_1_2"

import pickle
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import pprint
import sklearn.preprocessing as pp
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option("max_colwidth",500)
pd.set_option("display.max_rows",None)

df = pickle.load(open("df_bckup3.p","rb"))

In [3]:
############################## Preprocessing functions #####################
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def clean(l):
    return [[re.sub(r"^\s",r"",re.sub(r"\W"," ",re.sub(r"(.*)((.*)oz.(.*)\))(.*)",r"\1\5",i))).lower() for i in r]\
                         for r in l]

def make_ingredients_list(l):
    ingredients = [item for sublist in l for item in sublist]
    
def remove_empty(l):
    if type(l[0]) == list:
        return [list(filter(None,i)) for i in l]
    else:
        return [i.replace(',,',',').replace(', ,',', ') for i in l]

def extract_nouns(text):
    return " ".join([word \
                     for word,tag in nltk.pos_tag(text.split()) \
                     if(tag=="NN" or tag=="NNP" or tag=="NNS" or tag=="NNPS" or tag=="FW")])

def extract_nouns_in_list(l):
    df["noun_ing"] = [extract_nouns(i) for i in l]
    return df["noun_ing"]
    
def re_process(a):
    def re_p(text):
        text = re.sub(r"(.*)\b(chicken|salt)\b(.*)",r"\2",text)                                        #only keep keywords
        text = re.sub(r"(.*)\b(leaves|large|fresh|shredded|\
        plain|crushed|medium|ground)\b(.*)",r"\1\3",text)  #remove common adjectives
        #text = re.sub(r"(.*)\b(cheese|flour|milk|chilies|salt|oil\
        #|chicken|rice|wine|onion|beans|sugar)\b(.*)",r"\2",text)    #only keep keywords
        #text = re.sub("^(water|salt|pepper|oil|butter)$","",text)                               #remove common ingredients
        #text = re.sub(r"(.*)\
        #(ground|low fat|saturated|fresh|medium|flakes|low sodium|juice|dark|black|refried\
        #shredded|grated|extract|pitted|all-purpose|powder|juice|large|green|red|seedless\
        #blanched|sliced|crushed|wedgie|sharp|whole|wholesome|freshly|plain|and)\
        #\s(.*)",r"\1\3",text)                                                                   #remove common adjectives
        text = re.sub(r"(.*)(lime)(.*)",r"(\1)(lemon)(\2)",text)                                #replace synonymes ? (lemmatize)
        return text
    def list_re(b):
        return [re_p(i) for i in b]
    def str_re(b):
        return ",".join([re_p(i) for i in b.split(",")])
    if type(a[0])==list:
        return a.apply(list_re)
    else:
        return a.apply(str_re)
    
def make_str_ing(a):
    return a.apply(",".join)     #string ingredients
    
def make_und_ing(ing_list,ing_str=None):
    if type(ing_str) != pd.core.series.Series:
        ing_str = make_str_ing(ing_list)
    return [i.replace(" ","_").replace(","," ") for i in ing_str]

def make_dtm(a):
    vect = CountVectorizer(input="content",strip_accents="ascii",binary=True)
    vect.fit(list(a))
    pickle.dump(vect,open(model_name+"_vect.p","wb"))
    return vect.transform(list(a))

def stem(b,stemmer=None):
    def stemm(a):
        if type(a) == list:
            return [stemmer.stem(i) for i in a]
        else:
            return ",".join([stemmer.stem(i) for i in a.split(",")])
    def lemmatizer(a):
        if type(a) == list:
            return [WordNetLemmatizer().lemmatize(i) for i in a]
        else:
            return ",".join([WordNetLemmatizer().lemmatize(i) for i in a.split(",")])
    if stemmer==None:
        return b.apply(lemmatizer)
    else:
        return b.apply(stemm)

In [4]:
########################### Preprocessing ###############################
df = pickle.load(open("df_bckup3.p","rb"))
df["ing"] = re_process(df["nouns_ing"])
#df["ing"] = stem(df["ing"],LancasterStemmer())
df["ing"] = clean(df["ing"])
df["ing"] = remove_empty(df["ing"])
df["ing"] = make_und_ing(df["ing"])

dtm = make_dtm(df["ing"])

In [20]:
###################### For full train data ##################################
x_train = dtm
y_train = np.array(df["cuisine"])

In [5]:
#################### Split train data and cv data ############################
from sklearn.cross_validation import train_test_split

x_train,x_cv,y_train,y_cv = train_test_split(dtm,np.array(df["cuisine"]),test_size=0.3,random_state=1234)

In [6]:
#################### Setting Training and Testing Data #####################
from sklearn import tree
from sklearn import linear_model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from time import time

def classify(model,name):
    start_time = time()
    model = Pipeline([('feature_selection',LinearSVC(penalty="l1",dual=False)),('classification',model)])
    model.fit(x_train,y_train)
    acc = round(model.score(x_cv,y_cv),4)*100
    end_time = round(time()-start_time,0)
    print(name,acc,end_time)
    with open("log_classifiers.txt","a") as text:
        print(model_name+","+name+","+str(acc)+","+str(end_time),file=text)
    return acc,model

In [7]:
###################### Classifying single-iteration classifiers ####################
clfs = [
        (LinearSVC(loss='l2',penalty='l1',dual=False,tol=1e-3),"SVM1")
        #,(LinearSVC(loss='l2',penalty='l2',dual=False,tol=1e-3),"SVM2")
        #,(linear_model.LogisticRegression(C=1e5),"Logistic Regression")
        ]

best_acc = 0
best_name = ""
for clf,name in clfs:
    a,c = classify(clf,name)
    pickle.dump(c,open(model_name+"_"+name+".p","wb"))
    if a > best_acc:
        best_acc = a
        best_name = name

print(best_name,best_acc)



SVM1 76.67 46.0
SVM1 76.67


In [None]:
###################### Classifying multiple-iteration classifiers #################
clfs = [
        #(tree.DecisionTreeClassifier(),"Decision Tree")
        #,(RandomForestClassifier(n_estimators=500,n_jobs=5),"RF1")
        #,(AdaBoostClassifier((DecisionTreeClassifier(max_depth=3)),n_estimators=500,learning_rate=1),"ADA 1")
        #,(AdaBoostClassifier((DecisionTreeClassifier(max_depth=5)),n_estimators=500,learning_rate=1),"ADA 2")
        #,(AdaBoostClassifier((DecisionTreeClassifier(max_depth=5)),n_estimators=1000,learning_rate=1),"ADA 3")
        #,(AdaBoostClassifier((DecisionTreeClassifier(max_depth=5)),n_estimators=1400,learning_rate=1),"ADA 4")
        (AdaBoostClassifier((DecisionTreeClassifier(max_depth=7)),n_estimators=100,learning_rate=1),"ADA 5")
        #,(AdaBoostClassifier((DecisionTreeClassifier(max_depth=9)),n_estimators=1200,learning_rate=1),"ADA 6")
        ,(RandomForestClassifier(n_estimators=700,n_jobs=5),"RF2")
        ]

n = 8

best_acc = 0
best_name = ""
for clf,name in clfs:
    a = [0] * n
    c = [0] * n
    for i in range(0,n):
        a[i],c[i] = classify(clf,name)
        if a[i] > best_acc:
            best_acc = a[i]
            best_name = name
    pickle.dump(c[a.index(max(a))],open(model_name+"_"+name+".p","wb"))

print(best_name,best_acc)

In [28]:
######### Percentage errors in individual cuisines reults for cv #########
from collections import Counter
a=Counter(c.predict(x_cv))
b=Counter(y_cv)
d=pd.DataFrame([(i[0],i[1],b[i[0]] - i[1],round((b[i[0]] - i[1])/i[1]*100)) for i in a.items()]).sort_values(by=3,ascending=False)
d.columns = ["cuisine","total","absolute false negatives","percentage false negatives"]
with open("error_cuisines.txt","w") as text:
    print(d,file=text)
d



Unnamed: 0,cuisine,total,absolute false negatives,percentage false negatives
13,brazilian,101,57,56
17,russian,99,54,55
0,spanish,208,102,49
16,irish,141,61,43
18,vietnamese,194,65,34
2,british,184,56,30
14,japanese,353,55,16
8,filipino,193,28,15
12,korean,238,33,14
10,jamaican,139,16,12


In [None]:
#################### make a data frame which shows similarity between cuisines ################
common = []
cuisines = set(y_cv)
for i in cuisines:
    for j in cuisines:
        if i==j:
            continue
        i1 = len(list(y_cv[y_cv==i]))                          #Total tuples in original dataset
        i2 = len((pred_cv[y_cv==i])[pred_cv[y_cv==i]==i])      #Total tuples in predicted
        i3 = 100 - round(i2/i1 * 100)                          #% inaccuracy
        i4 = len((pred_cv[y_cv==i])[pred_cv[y_cv==i]==j])      #Actually in cuisine 1 but predicted in cuisine 2
        common.append([i,j,i1,i2,i3,i4])
common = pd.DataFrame(common)
common.columns = ["cuisine 1","cuisine 2","total","pred","%","v"]
common = common.sort_values(by="v",ascending=False)
with open("cuisines_similarity.txt","w") as text:
    print(common,file=text)
    
common = pd.pivot_table(common,values = "v",\
                        columns=["cuisine 1","total","pred","%"], index = "cuisine 2")
with open("cuisines_similarity.txt","a") as text:
    print(common,file=text)

In [None]:
################################ Intro #####################################

#clean : (oz.),keep alpha_num,lowercase
#extract nouns
#regex

#cv : train_test_split - 30%