In [1]:
# Load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
%matplotlib inline

## Import Data 

In [2]:
# Create URL
test_csv = "~/COMP30027_2021_Project2_datasets/recipe_test.csv"
train_csv = "~/COMP30027_2021_Project2_datasets/recipe_train.csv"

In [3]:
# Load Dataset 
test_df = pd.read_csv(test_csv )
train_df = pd.read_csv(train_csv)

In [4]:
# Load CountVec model for name 
import pickle
name_vectorizer = pickle.load(open("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_name_countvectorizer.pkl", "rb"))
name_dict = name_vectorizer.vocabulary_



In [5]:
# Load CountVec model for steps
import pickle
step_vectorizer = pickle.load(open("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_steps_countvectorizer.pkl", "rb"))
step_dict = step_vectorizer.vocabulary_

In [6]:
# Load CountVec model for ingredients
import pickle
ingr_vectorizer = pickle.load(open("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_ingr_countvectorizer.pkl", "rb"))
ingr_dict = ingr_vectorizer.vocabulary_

In [7]:
# training set 
X_train = train_df[['n_steps', 'n_ingredients']]
y_train = train_df['duration_label']
X_test = test_df[['n_steps', 'n_ingredients']]

## CountVec Name

In [8]:
# Extract Features from name column using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
name_train = train_df['name'].values # Create an array of name for training set
name_test = test_df['name'].values # Create an array of name for testing set
X_train_name = name_vectorizer.fit_transform(name_train)
X_test_name = name_vectorizer.transform(name_test)

In [9]:
print(X_train_name.shape, X_test_name.shape)

(40000, 10892) (10000, 10892)


In [10]:
# Are there any documents in X_test whose values are all 0? Why might this happen?
print(len(X_test_name.sum(axis=1)==0))
# This is hypothetically possible - if every word in one of the test documents had never appeared in the training data.
# For long documents, this is exceedingly unlikely due to the appearance of grammatical "words" such as _the_, _is_, and so on.

10000


### Feature Selection

In [11]:
# Choose best attributes
# Find out what the best 10 features were for your name dataset, according to  𝜒2
from sklearn.feature_selection import SelectKBest, chi2

x2 = SelectKBest(chi2, k=10)
X_train_name_x2 = x2.fit_transform(X_train_name,y_train) # Create a sparse matrix for CountVectorizer
X_test_name_x2 = x2.transform(X_test_name)

for feat_num in x2.get_support(indices=True):
    print(name_vectorizer.get_feature_names()[feat_num])

beef
cake
casserole
cooker
crock
crockpot
pot
roast
salad
slow


In [12]:
X_train_name_x2

<40000x10 sparse matrix of type '<class 'numpy.int64'>'
	with 7495 stored elements in Compressed Sparse Row format>

These seem like words that could be relevant to trying to distinguish between duration classification where salad could be made relatively fast and beef could be made relatively slow

Perhaps suprisingly are words like slow, cooker, crock and crockpot which are indicative of the cooking utensils not the food itself (and perhaps not of the problem more generally). It's difficult to determine the rare/common distinction here, but it becomes a little clearer as we look further down the ranking.

In [13]:
# Do the same thing for Mutual Information, instead of  𝜒2 (note that you want the classification version, not the regression version).
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_name_mi = mi.fit_transform(X_train_name,y_train)
X_test_name_mi = mi.transform(X_test_name)

print(X_train_name_mi.shape, X_test_name_mi.shape)

for feat_num in mi.get_support(indices=True):
    print(name_vectorizer.get_feature_names()[feat_num])

(40000, 10) (10000, 10)
beef
cake
casserole
chicken
cooker
crock
pot
roast
salad
slow


Here we see more evidence of MI choosing frequently-occuring features, such as slow.

## Modelling

Build a classifier on the training dataset, and evaluate its Accuracy on the test set. Consider k-NN, and perhaps Naive Bayes or Decision Trees

It’s likely that the dataset is still small enough that you can build a model on the entire feature set (after the CountVectorizer, but before the SelectKBest) without crashing your computer. How well do these models predict the test data, using all of the features?

How does this compare with 1000 features, or just the top 10 features?

Try some different values for the cut-off for SelectKBest — is it possible to improve upon the Accuracy observed for the models which use the entire feature set? Is this more true for some learners than others? Does your choice between χ 2 and Mutual Information make a difference?

In [14]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Models to train on 
models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1)]
#          KNeighborsClassifier(n_neighbors=1),
#          KNeighborsClassifier(n_neighbors=5),
#          DecisionTreeClassifier(max_depth=None)]
#          svm.LinearSVC(C=C),
#          svm.SVC(kernel='rbf', gamma=0.7, C=C),
#          svm.SVC(kernel='poly', degree=3, C=C)]

# Model Titles
titles = ['GNB',
          'MNB',
          'one-r']
#          '1-nearest neighbour',
#          '5-nearest neighbour',
#          'Decision Tree']
#          'LinearSVC',
#          'SVM with a cubic kernel',
#          'SVM with an RBF kernel']

# Select Number of Features
k = 1000

# Chi-square
x2 = SelectKBest(chi2, k=k)
x2.fit(X_train_name,y_train)
X_train_name_x2 = x2.transform(X_train_name)
X_test_name_x2 = x2.transform(X_test_name)

# Mutual Information 
mi = SelectKBest(score_func=mutual_info_classif, k=k)
mi.fit(X_train_name,y_train)
X_train_name_mi = mi.transform(X_train_name)
X_test_name_mi = mi.transform(X_test_name)

# Fit the model and test the model 
Xs = [(X_train_name, X_test_name), (X_train_name_x2, X_test_name_x2), (X_train_name_mi, X_test_name_mi)]
X_names = ['complete', 'x2', 'mi']
for title, model in zip(titles, models):
    print('\n',title, '(with k=',k,'features):')
    for X_name, X in zip(X_names, Xs):
        X_train_t, X_test_t = X
        # convert the variable into a matrix and train it with the selected model 
        model.fit(X_train_t.todense(), y_train)
        acc = model.score(X_train_t.todense(), y_train)
        print(X_name, 'acc',  acc)


 GNB (with k= 1000 features):
complete acc 0.419725
x2 acc 0.5769
mi acc 0.2231

 MNB (with k= 1000 features):
complete acc 0.756375
x2 acc 0.69515
mi acc 0.70625

 one-r (with k= 1000 features):
complete acc 0.5185
x2 acc 0.5185
mi acc 0.5185


Here, we could observe that Multinomial Naive Bayes is the best model and Mutual Information is the best feature extraction method for the feature name in the training set

## CountVec Steps

In [15]:
# Extract Features from steps column using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
step_train = train_df['steps'].values # Create an array of name for training set
step_test = test_df['steps'].values # Create an array of name for testing set
X_train_step = step_vectorizer.fit_transform(step_train)
X_test_step = step_vectorizer.transform(step_test)

In [16]:
print(X_train_step.shape, X_test_step.shape)

(40000, 17967) (10000, 17967)


### Feature Selection

In [17]:
# Choose best attributes
# Find out what the best 10 features were for your name dataset, according to  𝜒2
from sklearn.feature_selection import SelectKBest, chi2

x2 = SelectKBest(chi2, k=10)
X_train_step_x2 = x2.fit_transform(X_train_step,y_train) # Create a sparse matrix for CountVectorizer
X_test_step_x2 = x2.transform(X_test_step)

for feat_num in x2.get_support(indices=True):
    print(step_vectorizer.get_feature_names()[feat_num])

30
bake
baking
cooker
crock
crockpot
hours
minutes
oven
slow


The best attributes for step column is similar to name column. This might due to the fact that words that describe time such as hours, minutes, slow indicate specific duration labels 

In [18]:
# Do the same thing for Mutual Information, instead of  𝜒2 (note that you want the classification version, not the regression version).
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_step_mi = mi.fit_transform(X_train_step,y_train)
X_test_step_mi = mi.transform(X_test_step)

print(X_train_step_mi.shape, X_test_step_mi.shape)

for feat_num in mi.get_support(indices=True):
    print(step_vectorizer.get_feature_names()[feat_num])

(40000, 10) (10000, 10)
30
350
45
bake
baking
crock
hours
minutes
oven
preheat


Here, we could see that MI provide more words that describe time such as hours, minutes, 30, 350 and 45. 

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Models to train on 
models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1)]
#          KNeighborsClassifier(n_neighbors=1),
#          KNeighborsClassifier(n_neighbors=5),
#          DecisionTreeClassifier(max_depth=None)]
#          svm.LinearSVC(C=C),
#          svm.SVC(kernel='rbf', gamma=0.7, C=C),
#          svm.SVC(kernel='poly', degree=3, C=C)]

# Model Titles
titles = ['GNB',
          'MNB',
          'one-r']
#          '1-nearest neighbour',
#          '5-nearest neighbour',
#          'Decision Tree']
#          'LinearSVC',
#          'SVM with a cubic kernel',
#          'SVM with an RBF kernel']

# Select Number of Features
k = 1000

# Chi-square
x2 = SelectKBest(chi2, k=k)
x2.fit(X_train_step,y_train)
X_train_step_x2 = x2.transform(X_train_step)
X_test_step_x2 = x2.transform(X_test_step)

# Mutual Information 
mi = SelectKBest(score_func=mutual_info_classif, k=k)
mi.fit(X_train_step,y_train)
X_train_step_mi = mi.transform(X_train_step)
X_test_step_mi = mi.transform(X_test_step)

# Chi-square -> (X_train_step_x2, X_test_step_x2)
# Mutual Information -> (X_train_step_mi, X_test_step_mi) 

# Fit the model and test the model 
Xs = [(X_train_step, X_test_step), (X_train_step_x2, X_test_step_x2), (X_train_step_mi, X_test_step_mi)]
X_names = ['complete', 'x2', 'mi']
for title, model in zip(titles, models):
    print('\n',title, '(with k=',k,'features):')
    for X_name, X in zip(X_names, Xs):
        X_train_t, X_test_t = X
        # convert the variable into a matrix and train it with the selected model 
        model.fit(X_train_t.todense(), y_train)
        acc = model.score(X_train_t.todense(), y_train)
        print(X_name, 'acc',  acc)


 GNB (with k= 1000 features):
complete acc 0.40335
x2 acc 0.650025
mi acc 0.5815

 MNB (with k= 1000 features):
complete acc 0.751825
x2 acc 0.711875
mi acc 0.716325

 one-r (with k= 1000 features):
complete acc 0.6481
x2 acc 0.6481
mi acc 0.6481


Here, we could observe that Multinomial Naive Bayes is the best model and Mutual Information is the best feature extraction method for the feature step in the training set

## CountVec Ingredients

In [20]:
# Extract Features from name column using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
ingr_train = train_df['ingredients'].values # Create an array of name for training set
ingr_test = test_df['ingredients'].values # Create an array of name for testing set
X_train_ingr = ingr_vectorizer.fit_transform(ingr_train)
X_test_ingr = ingr_vectorizer.transform(ingr_test)

In [21]:
print(X_train_ingr.shape, X_test_ingr.shape)

(40000, 2906) (10000, 2906)


### Feature Selection

In [22]:
# Choose best attributes
# Find out what the best 10 features were for your name dataset, according to  𝜒2
from sklearn.feature_selection import SelectKBest, chi2

x2 = SelectKBest(chi2, k=10)
X_train_ingr_x2 = x2.fit_transform(X_train_ingr,y_train) # Create a sparse matrix for CountVectorizer
X_test_ingr_x2 = x2.transform(X_test_ingr)

for feat_num in x2.get_support(indices=True):
    print(ingr_vectorizer.get_feature_names()[feat_num])

baking
beef
butter
chicken
eggs
flour
potatoes
roast
soup
stew


In the ingredients colum, we could observe that both the cooking materials and cooking methods are selected as among the best attributes

In [23]:
# Do the same thing for Mutual Information, instead of  𝜒2 (note that you want the classification version, not the regression version).
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_ingr_mi = mi.fit_transform(X_train_ingr,y_train)
X_test_ingr_mi = mi.transform(X_test_ingr)

print(X_train_ingr_mi.shape, X_test_ingr_mi.shape)

for feat_num in mi.get_support(indices=True):
    print(ingr_vectorizer.get_feature_names()[feat_num])

(40000, 10) (10000, 10)
baking
beef
butter
eggs
flour
onion
potatoes
roast
salt
sugar


Similar to chi-square. Both cooking methods and cooking materials are among the best attribites

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Models to train on 
models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1)]
#          KNeighborsClassifier(n_neighbors=1),
#          KNeighborsClassifier(n_neighbors=5),
#          DecisionTreeClassifier(max_depth=None)]
#          svm.LinearSVC(C=C),
#          svm.SVC(kernel='rbf', gamma=0.7, C=C),
#          svm.SVC(kernel='poly', degree=3, C=C)]

# Model Titles
titles = ['GNB',
          'MNB',
          'one-r']
#          '1-nearest neighbour',
#          '5-nearest neighbour',
#          'Decision Tree']
#          'LinearSVC',
#          'SVM with a cubic kernel',
#          'SVM with an RBF kernel']

# Select Number of Features
k = 1000

# Chi-square
x2 = SelectKBest(chi2, k=k)
x2.fit(X_train_ingr,y_train)
X_train_ingr_x2 = x2.transform(X_train_ingr)
X_test_ingr_x2 = x2.transform(X_test_ingr)

# Mutual Information 
mi = SelectKBest(score_func=mutual_info_classif, k=k)
mi.fit(X_train_ingr,y_train)
X_train_ingr_mi = mi.transform(X_train_ingr)
X_test_ingr_mi = mi.transform(X_test_ingr)

# Chi-square -> (X_train_ingr_x2, X_test_ingr_x2)
# Mutual Information -> (X_train_ingr_mi, X_test_ingr_mi)

# Fit the model and test the model 
Xs = [(X_train_ingr, X_test_ingr), (X_train_ingr_x2, X_test_ingr_x2) , (X_train_ingr_mi, X_test_ingr_mi)]
X_names = ['complete', 'x2', 'mi']
for title, model in zip(titles, models):
    print('\n',title, '(with k=',k,'features):')
    for X_name, X in zip(X_names, Xs):
        X_train_t, X_test_t = X
        # convert the variable into a matrix and train it with the selected model 
        model.fit(X_train_t.todense(), y_train)
        acc = model.score(X_train_t.todense(), y_train)
        print(X_name, 'acc',  acc)


 GNB (with k= 1000 features):
complete acc 0.189275
x2 acc 0.51075
mi acc 0.23255

 MNB (with k= 1000 features):
complete acc 0.638575
x2 acc 0.627375
mi acc 0.62705

 one-r (with k= 1000 features):
complete acc 0.537825
x2 acc 0.537825
mi acc 0.537825


Here, we could observe that Multinomial Naive Bayes is the best model and Mutual Information is the best feature extraction method for the feature ingredients in the training set

# Joining Training Features for CountVec

In [342]:
# Use columns with string
new_train_df = X_train

In [343]:
# Create a sparse matrix for name
mi_matrix = X_train_name_mi.todense()
# Convert matrix to list
mi_list = mi_matrix.tolist()
# Convert list to dataframe
mi_df = pd.DataFrame(mi_list)
mi_df.shape

(40000, 1000)

In [344]:
# Save Bag of Words of name in training set
# mi_df.to_csv('mi_train_name_countvec.csv', index = False)

In [345]:
# Join the name features to the training set
join_train_df = X_train.join(mi_df)
join_train_df.shape

(40000, 1002)

In [346]:
mi_matrix = X_train_step_mi.todense()
mi_list = mi_matrix.tolist()

mi_step_df = pd.DataFrame(mi_list)

mi_step_df = mi_step_df.add_suffix('step')
mi_step_df.shape

(40000, 1000)

In [347]:
# Save Bag of Words of steps in training set
# mi_step_df.to_csv('mi_train_step_countvec.csv', index = False)

In [348]:
join_train_df = join_train_df.join(mi_step_df)
join_train_df.shape

(40000, 2002)

In [349]:
mi_matrix = X_train_ingr_mi.todense()
mi_list = mi_matrix.tolist()

mi_ingr_df = pd.DataFrame(mi_list)

mi_ingr_df = mi_ingr_df.add_suffix('ingr')

mi_ingr_df.shape

(40000, 1000)

In [350]:
# Save Bag of Words of steps in training set
# mi_ingr_df.to_csv('mi_train_ingr_countvec.csv', index = False)

In [351]:
join_train_df = join_train_df.join(mi_ingr_df)
join_train_df.shape

(40000, 3002)

In [35]:
# Save All Bag of Words in training set
# join_train_df.to_csv('join_train_countvec.csv', index = False)

# Joining Test Features For CountVec

In [36]:
mi_matrix = X_test_name_mi.todense()
mi_list = mi_matrix.tolist()

mi_name_df = pd.DataFrame(mi_list)

In [37]:
# Save Bag of Words of name in testing set
# mi_name_df.to_csv('mi_test_name_countvec.csv', index = False)

In [38]:
join_test_df = X_test.join(mi_name_df)
join_test_df.shape

(10000, 1002)

In [39]:
# MI for step
mi_matrix = X_test_step_mi.todense()
mi_list = mi_matrix.tolist()

mi_step_df = pd.DataFrame(mi_list)
mi_step_df = mi_step_df.add_suffix('step')

In [40]:
# Save Bag of Words of steps in testing set
# mi_step_df.to_csv('mi_test_step_countvec.csv', index = False)

In [41]:
join_test_df = join_test_df.join(mi_step_df)
join_test_df.shape

(10000, 2002)

In [42]:
# MI for ingr
mi_matrix = X_test_ingr_mi.todense()
mi_list = mi_matrix.tolist()

mi_ingr_df = pd.DataFrame(mi_list)
mi_ingr_df = mi_ingr_df.add_suffix('ingr')

In [43]:
# Save Bag of Words of ingr in testing set
# mi_ingr_df.to_csv('mi_test_ingr_countvec.csv', index = False)

In [44]:
join_test_df = join_test_df.join(mi_ingr_df)
join_test_df.shape

(10000, 3002)

In [45]:
# Save All Bag of Words in testing set
# join_test_df.to_csv('join_test_countvec.csv', index = False)

In [46]:
'''
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1),
          #KNeighborsClassifier(),
          LogisticRegression()]
titles = ['GNB',
          'MNB',
          'one-r',
          #'KNN',
          'Logistic Regression']

for title, model in zip(titles, models):
    model.fit(join_train_df,y_train)
    acc = model.score(join_train_df, y_train)
    print(title, "Accuracy:",acc)
''''

SyntaxError: EOL while scanning string literal (<ipython-input-46-d7201a8039fa>, line 21)

## Evaluation - Holdout or Cross Validation

### Hold out Strategy

In [47]:
# Split the training set into train and test set
from sklearn.model_selection import train_test_split
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(join_train_df, y_train, test_size=0.33, random_state=88)

In [48]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(X_train_split,y_train_split)
acc = lgr.score(X_test_split, y_test_split)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.7999242424242424


### Cross Validation Strategy

In [49]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lgr = LogisticRegression()
lgr.fit(join_train_df, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
acc = np.mean(cross_val_score(lgr, join_train_df, y_train, cv=5))
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Accuracy: 0.79975


Both Holdout and Cross-validation get similar accuracy score. However, holdout is significantly faster than cross validation and for this reason, holdout strategy will be chosen 

## Model Selection 

In [51]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_split, y_train_split)
acc = gnb.score(X_test_split, y_test_split)
print("Accuracy:",acc)

Accuracy: 0.36984848484848487


In [52]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_split, y_train_split)
acc = mnb.score(X_test_split, y_test_split)
print("Accuracy:",acc)

Accuracy: 0.7175


In [53]:
from sklearn.tree import DecisionTreeClassifier
one_r =DecisionTreeClassifier(max_depth=1)
one_r.fit(X_train_split, y_train_split)
acc = one_r.score(X_test_split, y_test_split)
print("Accuracy:",acc)

Accuracy: 0.6552272727272728


In [54]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_split, y_train_split)
acc = dt.score(X_test_split, y_test_split)
print("Accuracy:",acc)

Accuracy: 0.7320454545454546


# Training

In [308]:
# Split the training set into train and test set
from sklearn.model_selection import train_test_split
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(join_train_df, y_train, test_size=0.33, random_state=88)

In [77]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(X_train_split,y_train_split)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [78]:
acc = lgr.score(X_test_split, y_test_split)
print("Accuracy:",acc)

Accuracy: 0.7999242424242424


# Ensemble - Stacking

In [66]:
from sklearn.metrics import accuracy_score

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


classifiers = [#DummyClassifier(strategy='most_frequent'),
                LogisticRegression(),
                #KNeighborsClassifier(),
                GaussianNB(),
                MultinomialNB(),
                DecisionTreeClassifier()]
titles = [#'Zero_R',
          'Logistic Regression',
          #'KNN',
          'Gaussian NB',  
          'Multinomial NB',
          'Decision Tree']



meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)

In [67]:
stacker_lr.fit(X_train_split, y_train_split)
print('\nStacker Accuracy (Logistic Regression):', stacker_lr.score(X_test_split, y_test_split))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Stacker Accuracy (Logistic Regression): 0.7303787878787878


## Prediction

In [79]:
ybar = lgr.predict(join_test_df)

In [80]:
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('predict.csv', index=False)

# Doc2Vec

## Import Doc2Vec data

In [90]:
# Load Doc2Vec Dataset 
train_name_doc2vec100 = pd.read_csv("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_name_doc2vec100.csv", header=None)
test_name_doc2vec100 = pd.read_csv("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/test_name_doc2vec100.csv", header=None)
train_steps_doc2vec100 = pd.read_csv("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv", header=None)
test_steps_doc2vec100 = pd.read_csv("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/test_steps_doc2vec100.csv", header=None)
train_ingr_doc2vec100 = pd.read_csv("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv", header=None)
test_ingr_doc2vec100 = pd.read_csv("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv", header=None)

In [91]:
print(train_name_doc2vec100.shape, train_steps_doc2vec100.shape, X_train.shape)

(40000, 100) (40000, 100) (40000, 2)


## Joining Training Features for Doc2Vec

In [403]:
train_name_doc2vec100 = train_name_doc2vec100.add_suffix('name_doc2vec')
join_train_doc2vec100 = X_train.join(train_name_doc2vec100)
train_steps_doc2vec100 = train_steps_doc2vec100.add_suffix('step_doc2vec')
join_train_doc2vec100 = join_train_doc2vec100.join(train_steps_doc2vec100)
train_ingr_doc2vec100 = train_ingr_doc2vec100.add_suffix('ingr_doc2vec')
join_train_doc2vec100 = join_train_doc2vec100.join(train_ingr_doc2vec100)
join_train_doc2vec100.shape

(40000, 302)

In [404]:
# Split the training set into train and test set
from sklearn.model_selection import train_test_split
X_train_doc2vec100_split, X_test_doc2vec100_split, y_train_doc2vec100_split, y_test_doc2vec100_split = train_test_split(join_train_doc2vec100, y_train, test_size=0.33, random_state=88)

In [95]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(X_train_doc2vec100_split, y_train_doc2vec100_split)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [96]:
acc = lgr.score(X_test_doc2vec100_split, y_test_doc2vec100_split)
print("Accuracy:",acc)

Accuracy: 0.7272727272727273


The accuracy trained using Doc2Vec is 0.727

# Neural Network

Train a neural network on X_train_split

In [169]:
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical

In [391]:
from sklearn import preprocessing

X_train_split_neural = X_train_split.drop(columns = ['n_steps', 'n_ingredients'])
X_test_split_neural = X_test_split.drop(columns = ['n_steps', 'n_ingredients'])

# Create Scalar
scaler = preprocessing.StandardScaler()

# Transform the feature
X_train_split_neural = scaler.fit_transform(X_train_split_neural)
X_test_split_neural = scaler.fit_transform(X_test_split_neural)

In [392]:
# Set the number of featuers we want
number_of_features = 3000

In [393]:
y_train_split_neural = y_train_split.values - 1
y_test_split_neural = y_test_split.values - 1

In [394]:
# One-hot encode target vector to create a target matrix
y_train_split_cat = to_categorical(y_train_split_neural)
y_test_split_cat = to_categorical(y_test_split_neural)

In [395]:
print(y_train_split_cat.shape)

(26800, 3)


In [396]:
# Start neural network
network = models.Sequential()

In [397]:
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=300,
                        activation = "relu",
                        input_shape=(number_of_features,)))

In [398]:
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=300, activation="relu"))

In [399]:
#Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=3, activation="softmax"))

In [400]:
#Compile neural network
network.compile(loss="categorical_crossentropy", #Cross-entropy
               optimizer = "rmsprop", #Root Mean Square Propagation
               metrics=["accuracy"]) # Accuracy performance metric

In [389]:
# Train neural network
history = network.fit(X_train_split_neural, #Features
                     y_train_split_cat, #Target
                     epochs=3, #Three epochs
                     verbose=1, #No output
                     batch_size=100, #Number of observations per batch
                     validation_data=(X_test_split_neural, y_test_split_cat)) # test data

Train on 26800 samples, validate on 13200 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [410]:
# join_train_doc2vec100, join_train_df
join_train_all = join_train_df.drop(columns = ['n_steps', 'n_ingredients'])
join_train_all = join_train_all.join(join_train_doc2vec100)
join_train_all.shape

(40000, 3302)

In [411]:
join_test_all = join_test_df.drop(columns = ['n_steps', 'n_ingredients'])
join_test_all = join_test_all.join(join_train_doc2vec100)
join_test_all.shape

(10000, 3302)

In [407]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(join_all, y_train)
acc = lgr.score(join_train_all, y_train)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.83175


In [412]:
ybar = lgr.predict(join_test_all)

In [413]:
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('predict.csv', index=False)