In [1]:
# Load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
%matplotlib inline

## Import Data 

In [2]:
# Create URL
test_csv = "~/COMP30027_2021_Project2_datasets/recipe_test.csv"
train_csv = "~/COMP30027_2021_Project2_datasets/recipe_train.csv"

In [3]:
# Load Dataset 
test_df = pd.read_csv(test_csv )
train_df = pd.read_csv(train_csv)

In [4]:
# Load CountVec model
import pickle
name_vectorizer = pickle.load(open("/Users/hesterlim/COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_name_countvectorizer.pkl", "rb"))
name_dict = name_vectorizer.vocabulary_



In [5]:
# training set 
X_train = train_df[['n_steps', 'n_ingredients']]
y_train = train_df['duration_label']

## CountVec Name

In [6]:
# Extract Features from name column using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
name_train = train_df['name'].values # Create an array of name for training set
name_test = test_df['name'].values # Create an array of name for testing set
X_train_name = name_vectorizer.fit_transform(name_train)
X_test_name = name_vectorizer.transform(name_test)

In [7]:
print(X_train_name.shape, X_test_name.shape)

(40000, 10892) (10000, 10892)


In [8]:
# Are there any documents in X_test whose values are all 0? Why might this happen?
print(len(X_test_name.sum(axis=1)==0))
# This is hypothetically possible - if every word in one of the test documents had never appeared in the training data.
# For long documents, this is exceedingly unlikely due to the appearance of grammatical "words" such as _the_, _is_, and so on.

10000


### Feature Selection

In [9]:
# Choose best attributes
# Find out what the best 10 features were for your name dataset, according to  𝜒2
from sklearn.feature_selection import SelectKBest, chi2

x2 = SelectKBest(chi2, k=10)
X_train_x2 = x2.fit_transform(X_train_name,y_train) # Create a sparse matrix for CountVectorizer
X_test_x2 = x2.transform(X_test_name)

for feat_num in x2.get_support(indices=True):
    print(name_vectorizer.get_feature_names()[feat_num])

beef
cake
casserole
cooker
crock
crockpot
pot
roast
salad
slow


In [20]:
X_train_x2

<40000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 81901 stored elements in Compressed Sparse Row format>

These seem like words that could be relevant to trying to distinguish between duration classification where salad could be made relatively fast and beef could be made relatively slow

Perhaps suprisingly are words like slow, cooker, crock and crockpot which are indicative of the cooking utensils not the food itself (and perhaps not of the problem more generally). It's difficult to determine the rare/common distinction here, but it becomes a little clearer as we look further down the ranking.

In [10]:
# Do the same thing for Mutual Information, instead of  𝜒2 (note that you want the classification version, not the regression version).
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_name_mi = mi.fit_transform(X_train_name,y_train)
X_test_name_mi = mi.transform(X_test_name)

print(X_train_name_mi.shape, X_test_name_mi.shape)

for feat_num in mi.get_support(indices=True):
    print(name_vectorizer.get_feature_names()[feat_num])

(40000, 10) (10000, 10)
beef
cake
casserole
chicken
cooker
crock
pot
roast
salad
slow


Here we see more evidence of MI choosing frequently-occuring features, such as slow.

## Modelling

Build a classifier on the training dataset, and evaluate its Accuracy on the test set. Consider k-NN, and perhaps Naive Bayes or Decision Trees

It’s likely that the dataset is still small enough that you can build a model on the entire feature set (after the CountVectorizer, but before the SelectKBest) without crashing your computer. How well do these models predict the test data, using all of the features?

How does this compare with 1000 features, or just the top 10 features?

Try some different values for the cut-off for SelectKBest — is it possible to improve upon the Accuracy observed for the models which use the entire feature set? Is this more true for some learners than others? Does your choice between χ 2 and Mutual Information make a difference?

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Models to train on 
models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1)]
#          KNeighborsClassifier(n_neighbors=1),
#          KNeighborsClassifier(n_neighbors=5),
#          DecisionTreeClassifier(max_depth=None)]
#          svm.LinearSVC(C=C),
#          svm.SVC(kernel='rbf', gamma=0.7, C=C),
#          svm.SVC(kernel='poly', degree=3, C=C)]

# Model Titles
titles = ['GNB',
          'MNB',
          'one-r']
#          '1-nearest neighbour',
#          '5-nearest neighbour',
#          'Decision Tree']
#          'LinearSVC',
#          'SVM with a cubic kernel',
#          'SVM with an RBF kernel']

# Select Number of Features
k = 1000

# Chi-square
x2 = SelectKBest(chi2, k=k)
x2.fit(X_train_name,y_train)
X_train_x2 = x2.transform(X_train_name)
X_test_x2 = x2.transform(X_test_name)

# Mutual Information 
mi = SelectKBest(score_func=mutual_info_classif, k=k)
mi.fit(X_train_name,y_train)
X_train_mi = mi.transform(X_train_name)
X_test_mi = mi.transform(X_test_name)

# Fit the model and test the model 
Xs = [(X_train_name, X_test_name), (X_train_x2, X_test_x2), (X_train_mi, X_test_mi)]
X_names = ['complete', 'x2', 'mi']
for title, model in zip(titles, models):
    print('\n',title, '(with k=',k,'features):')
    for X_name, X in zip(X_names, Xs):
        X_train_t, X_test_t = X
        # convert the variable into a matrix and train it with the selected model 
        model.fit(X_train_t.todense(), y_train)
        acc = model.score(X_train_t.todense(), y_train)
        print(X_name, 'acc',  acc)


 GNB (with k= 1000 features):
complete acc 0.419725
x2 acc 0.5769
mi acc 0.2231

 MNB (with k= 1000 features):
complete acc 0.756375
x2 acc 0.69515
mi acc 0.70625

 one-r (with k= 1000 features):
complete acc 0.5185
x2 acc 0.5185
mi acc 0.5185


Here, we could observe that Multinomial Naive Bayes is the best model and Mutual Information is the best feature extraction method for the feature name in the training set