In [1]:
################################ Setup #####################################

# Prefix used for every artifact this notebook writes (pickled vectorizer and
# models) and as the first field of each row appended to log_classifiers.txt.
model_name = "model_2_N_0"

import pickle
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import pprint
import sklearn.preprocessing as pp
from sklearn.feature_extraction.text import CountVectorizer
# Display options: allow long cell contents (up to 500 characters), print every
# row, and keep wide frames on a single line instead of wrapping them.
pd.set_option("display.max_colwidth",500)
pd.set_option("display.max_rows",None)
pd.set_option("display.expand_frame_repr", False)

# Load the backed-up DataFrame. The file is opened in a `with` block so the
# handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load backups you created.
with open("df_bckup3.p","rb") as backup_file:
    df = pickle.load(backup_file)

In [2]:
############################## Preprocessing functions #####################
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def clean(l):
    """Normalise every ingredient string in a sequence of ingredient lists.

    Each string goes through three regex passes and is then lowercased:
      1. remove the span that starts at an "oz" (followed by any one
         character) and ends at the last ")" in the string;
      2. replace every non-word character with a space;
      3. drop a single leading whitespace character, if present.

    Parameters
    ----------
    l : iterable of iterables of str
        One inner iterable per recipe.

    Returns
    -------
    list of list of str
        Same nesting as the input, with cleaned strings.
    """
    def _tidy(item):
        without_oz = re.sub(r"(.*)((.*)oz.(.*)\))(.*)", r"\1\5", item)
        spaced = re.sub(r"\W", " ", without_oz)
        trimmed = re.sub(r"^\s", r"", spaced)
        return trimmed.lower()

    cleaned = []
    for recipe in l:
        cleaned.append([_tidy(item) for item in recipe])
    return cleaned

def make_ingredients_list(l):
    """Flatten a sequence of per-recipe ingredient lists into one flat list.

    Parameters
    ----------
    l : iterable of iterables
        One inner iterable of ingredients per recipe.

    Returns
    -------
    list
        Every ingredient, in recipe order and then in-recipe order.
        (The flattened list used to be built but never returned.)
    """
    return [item for sublist in l for item in sublist]
    
def remove_empty(l):
    """Drop empty ingredients left behind by earlier cleaning steps.

    The first element decides how the input is treated:
      * list elements  -> falsy entries (e.g. "") are filtered out of each list;
      * anything else  -> each element is treated as a comma-separated string
        and ",," is collapsed to "," and ", ," to ", ".

    Parameters
    ----------
    l : iterable of list or iterable of str
        May be a plain list or a pandas Series; the first element is taken
        positionally, so a Series does not need a label ``0`` in its index.

    Returns
    -------
    list
        A new list; an empty input yields an empty list.
    """
    rows = list(l)
    if not rows:
        return []
    if isinstance(rows[0], list):
        return [list(filter(None, row)) for row in rows]
    return [row.replace(',,', ',').replace(', ,', ', ') for row in rows]

def extract_nouns(text):
    """Return the tokens of ``text`` whose POS tag is noun-like, space-joined.

    ``text`` is split on whitespace and tagged with ``nltk.pos_tag``; a token
    is kept when its tag is one of NN, NNP, NNS, NNPS or FW.
    """
    kept_tags = ("NN", "NNP", "NNS", "NNPS", "FW")
    tagged_tokens = nltk.pos_tag(text.split())
    nouns = [token for token, tag in tagged_tokens if tag in kept_tags]
    return " ".join(nouns)

def extract_nouns_in_list(l):
    """Run ``extract_nouns`` over every text in ``l`` and store the result.

    Side effect: the results are written to the module-level ``df`` as column
    "noun_ing", and that column is returned.
    NOTE(review): the preprocessing cell reads a column named "nouns_ing";
    confirm which spelling is intended.
    """
    noun_texts = []
    for text in l:
        noun_texts.append(extract_nouns(text))
    df["noun_ing"] = noun_texts
    return df["noun_ing"]
    
def re_process(a):
    """Apply keyword, adjective and synonym regex rules to every ingredient.

    Rules applied to each individual ingredient string, in order:
      1. if it contains the whole word "chicken" or "salt", the ingredient is
         reduced to just that keyword;
      2. one common adjective is removed (the patterns are greedy, so only
         the last matching occurrence is affected);
      3. "lime" is rewritten to "lemon".

    Parameters
    ----------
    a : pandas.Series
        Each row is either a list of ingredient strings or one
        comma-separated string of ingredients. The first row decides which.

    Returns
    -------
    pandas.Series
        Same row layout as the input, with the rules applied.
    """
    # Built by implicit string concatenation. The previous version continued a
    # raw string with a backslash, which embedded a literal newline plus the
    # indentation in front of "plain", so that adjective could never match.
    adjectives = (r"leaves|large|fresh|shredded|"
                  r"plain|crushed|medium|ground")
    adjective_pattern = r"(.*)\b(" + adjectives + r")\b(.*)"

    def re_p(text):
        # only keep keywords
        text = re.sub(r"(.*)\b(chicken|salt)\b(.*)", r"\2", text)
        # remove common adjectives
        text = re.sub(adjective_pattern, r"\1\3", text)
        # replace synonyms: keep the text around "lime" and swap in "lemon".
        # (The old replacement emitted literal parentheses and re-inserted
        # group 2, i.e. "lime" itself, instead of the trailing text.)
        text = re.sub(r"(.*)(lime)(.*)", r"\g<1>lemon\g<3>", text)
        return text

    def list_re(b):
        return [re_p(i) for i in b]

    def str_re(b):
        return ",".join([re_p(i) for i in b.split(",")])

    # Positional lookup so an empty Series or a non-default index does not
    # raise before any work is done.
    if len(a) > 0 and isinstance(a.iloc[0], list):
        return a.apply(list_re)
    return a.apply(str_re)
    
def make_str_ing(a):
    """Turn each row's list of ingredients into one comma-separated string.

    ``a`` must provide ``.apply`` (e.g. a pandas Series of lists of str);
    the result of that ``apply`` call is returned.
    """
    def _join_row(items):
        return ",".join(items)

    return a.apply(_join_row)
    
def make_und_ing(ing_list,ing_str=None):
    """Build one whitespace-separated token string per recipe.

    Inside each ingredient, spaces become underscores; the commas between
    ingredients then become spaces, so every ingredient ends up as a single
    token.

    Parameters
    ----------
    ing_list : passed to ``make_str_ing`` when ``ing_str`` is not a Series.
    ing_str : pandas.Series of str, optional
        Already comma-joined ingredient strings; used as-is when given.

    Returns
    -------
    list of str
    """
    already_joined = type(ing_str) == pd.core.series.Series
    joined_rows = ing_str if already_joined else make_str_ing(ing_list)
    tokenised = []
    for row in joined_rows:
        tokenised.append(row.replace(" ", "_").replace(",", " "))
    return tokenised

def make_dtm(a):
    """Fit a binary bag-of-words vectorizer on ``a`` and return the matrix.

    Side effect: the fitted vectorizer is pickled to
    ``<model_name>_vect.p`` in the working directory.

    Parameters
    ----------
    a : iterable of str
        One document (token string) per recipe.

    Returns
    -------
    The document-term matrix produced by the fitted ``CountVectorizer``.
    """
    docs = list(a)
    vect = CountVectorizer(input="content",strip_accents="ascii",binary=True)
    # Single pass instead of fit() followed by transform() on the same data.
    dtm = vect.fit_transform(docs)
    # Context manager closes the file even if pickling fails.
    with open(model_name+"_vect.p","wb") as vect_file:
        pickle.dump(vect,vect_file)
    return dtm

def stem(b,stemmer=None):
    """Stem (or, without a stemmer, lemmatize) every ingredient in ``b``.

    Parameters
    ----------
    b : object with ``.apply`` (e.g. pandas.Series)
        Each row is either a list of strings or one comma-separated string.
    stemmer : object with a ``stem(word)`` method, optional
        When omitted, a ``WordNetLemmatizer`` is used instead.

    Returns
    -------
    The result of ``b.apply``: rows keep their layout (list stays list,
    comma-separated string stays comma-separated string).
    """
    # Resolve the per-word function once instead of building a new
    # lemmatizer for every single token.
    if stemmer is None:
        word_fn = WordNetLemmatizer().lemmatize
    else:
        word_fn = stemmer.stem

    def _per_row(a):
        if isinstance(a, list):
            return [word_fn(i) for i in a]
        return ",".join([word_fn(i) for i in a.split(",")])

    return b.apply(_per_row)

In [3]:
########################### Preprocessing ###############################
# Reload the backup so this cell always starts from the same frame when re-run.
# NOTE: pickle.load can execute arbitrary code; only load backups you created.
with open("df_bckup3.p","rb") as backup_file:
    df = pickle.load(backup_file)

# NOTE(review): this reads column "nouns_ing", whereas extract_nouns_in_list
# writes "noun_ing" — confirm the column stored in the backup.
df["ing"] = re_process(df["nouns_ing"])
#df["ing"] = stem(df["ing"],LancasterStemmer())
df["ing"] = clean(df["ing"])
df["ing"] = remove_empty(df["ing"])
# One underscore-joined token per ingredient, tokens separated by spaces.
df["ing"] = make_und_ing(df["ing"])

# Document-term matrix; make_dtm also pickles the fitted vectorizer.
dtm = make_dtm(df["ing"])

In [4]:
###################### For full train data ##################################
# Train on the complete document-term matrix (no hold-out split in this cell).
x_train = dtm
# np.array copies the column by default, so the in-place relabelling that the
# classifier cells perform on y_train does not write back into df["cuisine"].
y_train = np.array(df["cuisine"])

In [5]:
#################### Setting Training and Testing Data #####################
from sklearn import tree
from sklearn import linear_model
from sklearn import cross_validation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from time import time

def classify(model,name):
    """Cross-validate and fit ``model`` behind an L1 LinearSVC selection step.

    Reads the module-level ``x_train`` / ``y_train`` and ``model_name``.

    Parameters
    ----------
    model : scikit-learn estimator
        Final classifier of the pipeline. It is cloned, so the caller's
        instance is never fitted and repeated calls with the same object
        return independent fitted pipelines.
    name : str
        Label used in the printed summary and in the log file.

    Returns
    -------
    (mean_accuracy, pipeline)
        ``mean_accuracy`` is the mean of the 2-fold scores, each rounded to
        four decimals and expressed in percent; ``pipeline`` is fitted on the
        full training data.

    Side effects: prints a summary line and appends
    ``model_name,name,mean_accuracy,seconds`` to ``log_classifiers.txt``.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    start_time = time()
    # NOTE(review): a non-final Pipeline step must implement ``transform``;
    # confirm the installed scikit-learn's LinearSVC still provides it.
    pipeline = Pipeline([
        ('feature_selection',LinearSVC(penalty="l1",dual=False)),
        ('classification',clone(model)),
    ])
    acc = cross_validation.cross_val_score(pipeline,x_train,y_train,cv=2)
    acc = [round(i,4)*100 for i in acc]
    mean_acc = np.mean(acc)
    pipeline.fit(x_train,y_train)
    # Elapsed time covers both cross-validation and the final fit.
    end_time = round(time()-start_time,0)
    print(name,acc,":",mean_acc,end_time)
    with open("log_classifiers.txt","a") as text:
        print(model_name+","+name+","+str(mean_acc)+","+str(end_time),file=text)
    return mean_acc,pipeline

In [6]:
# Quick sanity check of the training inputs.
# This cell previously also printed np.in1d(y_train, combo), but `combo` is
# only bound inside the loops of a later cell, so on a fresh kernel it raised
# NameError. That line (and its stale traceback) has been removed.
print(type(x_train), x_train.shape)

In [None]:
###################### 2-level single-iteration classifiers ####################
# Deterministic classifiers: one cross-validated run each is enough.
# NOTE(review): loss='l2' is the older spelling of the squared hinge loss;
# confirm it is still accepted by the installed scikit-learn.
clfs = [
        (LinearSVC(loss='l2',penalty='l1',dual=False,tol=1e-3),"SVM1")
        ,(LinearSVC(loss='l2',penalty='l2',dual=False,tol=1e-3),"SVM2")
        ,(linear_model.LogisticRegression(C=1e5),"Logistic Regression")
        ]

# Label groups for level 1. Single-cuisine groups keep their own label; the
# multi-cuisine group is merged into one label and separated again at level 2.
combos = ['brazilian','british','cajun_creole','chinese',\
                               'filipino','greek','indian','irish',\
                               'jamaican','japanese','korean','mexican','moroccan','russian',\
                               'southern_us','spanish','thai','vietnamese']
combos = [[i] for i in combos]
combos.append(["french","italian"])

### Level 1 ####
# Keep untouched copies; classify() reads the globals x_train / y_train.
# Object dtype is used on purpose: a fixed-width string array would silently
# truncate the longer merged label (e.g. "french_italian") on assignment.
x_train_org = x_train
y_train_org = np.array(y_train,dtype=object)
y_train = y_train_org.copy()
for combo in combos:
    new = "_".join(combo)
    for cuisine in combo:
        y_train[y_train==cuisine] = new
best_acc = 0
best_name = ""
for clf,name in clfs:
    a,c = classify(clf,name)
#    pickle.dump(c,open(model_name+"_"+name+".p","wb"))
    if a > best_acc:
        best_acc = a
        best_name = name
print(best_name,best_acc)

#### Level 2 ####
for combo in combos:
    if len(combo)==1:
        continue
    # Build the row mask once from the ORIGINAL labels and apply the same mask
    # to both arrays. Previously the mask for x_train was recomputed from the
    # already-filtered y_train, so its length no longer matched x_train.
    in_combo = np.in1d(y_train_org,combo)
    y_train = y_train_org[in_combo]
    x_train = x_train_org[in_combo]
    print(combo)
    best_acc = 0
    best_name = ""
    for clf,name in clfs:
        run_name = name+"___"+("_".join(combo))
        a,c = classify(clf,run_name)
        # Context manager closes the model file after writing.
        with open(model_name+"_"+run_name+".p","wb") as model_file:
            pickle.dump(c,model_file)
        if a > best_acc:
            best_acc = a
            best_name = name
    print(best_name,best_acc,"\n\n")
# Restore the full, un-merged training data for later cells.
y_train = y_train_org
x_train = x_train_org

In [None]:
###################### Classifying multiple-iteration classifiers #################
# Randomised classifiers: each is run n times and the best run is kept.
# Every entry ends with a trailing comma so lines can be commented in or out
# without leaving a leading comma (which was a SyntaxError).
clfs = [
        #(tree.DecisionTreeClassifier(),"Decision Tree"),
        (RandomForestClassifier(n_estimators=500,n_jobs=5),"RF1"),
        #(AdaBoostClassifier((DecisionTreeClassifier(max_depth=3)),n_estimators=500,learning_rate=1),"ADA 1"),
        #(AdaBoostClassifier((DecisionTreeClassifier(max_depth=5)),n_estimators=500,learning_rate=1),"ADA 2"),
        #(AdaBoostClassifier((DecisionTreeClassifier(max_depth=5)),n_estimators=1000,learning_rate=1),"ADA 3"),
        #(AdaBoostClassifier((DecisionTreeClassifier(max_depth=5)),n_estimators=1400,learning_rate=1),"ADA 4"),
        #(AdaBoostClassifier((DecisionTreeClassifier(max_depth=7)),n_estimators=100,learning_rate=1),"ADA 5"),
        #(AdaBoostClassifier((DecisionTreeClassifier(max_depth=7)),n_estimators=300,learning_rate=1),"ADA 6"),
        (AdaBoostClassifier((DecisionTreeClassifier(max_depth=9)),n_estimators=100,learning_rate=1),"ADA 7"),
        (RandomForestClassifier(n_estimators=700,n_jobs=5),"RF2"),
        ]

# Number of repeated runs per classifier.
n = 8

# This cell reuses `combos` from the single-iteration cell above.
# NOTE(review): every run passes the same `clf` object to classify(); if
# classify() wraps it without copying, all entries of `c` share one estimator.

### Level 1 ####
# Object dtype on purpose: a fixed-width string array would silently truncate
# the longer merged label (e.g. "french_italian") on assignment.
x_train_org = x_train
y_train_org = np.array(y_train,dtype=object)
y_train = y_train_org.copy()
for combo in combos:
    new = "_".join(combo)
    for cuisine in combo:
        y_train[y_train==cuisine] = new
best_acc = 0
best_name = ""
for clf,name in clfs:
    a = [0] * n
    c = [0] * n
    for i in range(0,n):
        a[i],c[i] = classify(clf,name)
        if a[i] > best_acc:
            best_acc = a[i]
            best_name = name
    # Persist the run with the highest accuracy for this classifier.
    with open(model_name+"_"+name+".p","wb") as model_file:
        pickle.dump(c[a.index(max(a))],model_file)
    print(best_name,best_acc)

#### Level 2 ####
for combo in combos:
    if len(combo)==1:
        continue
    # Mask from the ORIGINAL labels: after level 1, y_train holds the merged
    # label, so matching it against the individual cuisines selects nothing,
    # and a mask taken from the filtered y_train no longer lines up with
    # x_train_org.
    in_combo = np.in1d(y_train_org,combo)
    y_train = y_train_org[in_combo]
    x_train = x_train_org[in_combo]
    print(combo)
    best_acc = 0
    best_name = ""
    for clf,name in clfs:
        # Body indented under the loop (it was left at the loop's own level).
        run_name = name+"___"+("_".join(combo))
        a = [0] * n
        c = [0] * n
        for i in range(0,n):
            a[i],c[i] = classify(clf,run_name)
            if a[i] > best_acc:
                best_acc = a[i]
                best_name = name
        with open(model_name+"_"+run_name+".p","wb") as model_file:
            pickle.dump(c[a.index(max(a))],model_file)
        print(best_name,best_acc)
# Restore the full, un-merged training data for later cells.
y_train = y_train_org
x_train = x_train_org

In [None]:
#################### make a data frame which shows similarity between cuisines ################
# Build a long-form confusion table: one row per ordered pair of distinct
# cuisines (cuisine 1 = true label, cuisine 2 = predicted label).
# NOTE(review): y_cv and pred_cv are not defined in the visible cells; they are
# assumed to be aligned arrays of true and predicted labels — confirm.
rows = []
cuisines = set(y_cv)
for i in cuisines:
    # These counts depend only on the true cuisine, so compute them once per i.
    pred_for_i = pred_cv[y_cv==i]
    i1 = len(y_cv[y_cv==i])                      # samples whose true label is i
    i2 = len(pred_for_i[pred_for_i==i])          # of those, predicted correctly as i
    i3 = 100 - round(i2/i1 * 100)                # % of i that was misclassified
    for j in cuisines:
        if i==j:
            continue
        i4 = len(pred_for_i[pred_for_i==j])      # truly i but predicted as j
        rows.append([i,j,i1,i2,i3,i4])
common = pd.DataFrame(rows)
common.columns = ["cuisine 1","cuisine 2","total","pred","%","v"]
common = common.sort_values(by="v",ascending=False)

# The pivot gets its own name so `common` stays the long-form table that the
# following cells index by "cuisine 1" / "cuisine 2" / "v".
common_pivot = pd.pivot_table(common,values = "v",\
                              columns=["cuisine 1","total","pred","%"], index = "cuisine 2")

# Same file content as before: long-form table first, then the pivot.
with open("cuisines_similarity.txt","w") as text:
    print(common,file=text)
    print(common_pivot,file=text)

In [None]:
# Write the cuisine pairs whose confusion count "v" is below a cutoff.
# TODO(review): the original comparison had no right-hand operand
# (`common["v"]<`), so the intended cutoff is unknown; 10 is a placeholder.
# Requires `common` to be the long-form table with columns
# "cuisine 1", "cuisine 2" and "v" (not the pivoted view).
v_cutoff = 10
# Opened for writing: print(..., file=...) cannot write to a read-only handle.
with open("test.txt","w") as text:
    print(common.loc[common["v"] < v_cutoff, ["cuisine 1","cuisine 2"]],file=text)

In [None]:
#common["v"].plot()
#plt.show()
#y_train.shape
# Distinct labels in the preserved copy of the training targets.
set(y_train_org)

In [None]:
# Scratch check: masked assignment on a copy must leave the source array
# untouched (the same pattern is used to relabel y_train).
i = np.array([1, 2, 2, 3, 2, 3, 4, 5])
i1 = i.copy()
is_two = i1 == 2
i1[is_two] = 100
i == i1

In [None]:
################################ Intro #####################################

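# Pipeline overview
#   clean          : drop "(... oz. ...)" quantity fragments, keep only alphanumeric characters, lowercase
#   extract nouns  : keep only noun tokens from each ingredient
#   regex          : keyword, adjective and synonym rules (see re_process)

#   cv             : k-fold cross-validation
#   classification : 2-level (merged cuisine groups first, then within each merged group)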