In [46]:
# Load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 

from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
%matplotlib inline

## Import Data 

In [4]:
# Create URL
test_csv_2 = "C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_test.csv"
train_csv_2 = "C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_train.csv"

# Load Dataset 
test_df = pd.read_csv(test_csv_2)
train_df = pd.read_csv(train_csv_2)

In [6]:
X = train_df[['n_steps', 'n_ingredients']]
y = train_df['duration_label']

# Data Modelling

### Decision Tree and One_R

In [13]:
one_r = DecisionTreeClassifier(max_depth=1)
one_r.fit(X, y)
dt = DecisionTreeClassifier(max_depth=None)
dt.fit(X, y)

DecisionTreeClassifier()

In [14]:
one_r_acc = one_r.score(X,y)
dt_acc = dt.score(X,y)
print("1-R accuracy: {}; DT accuracy: {}".format(one_r_acc, dt_acc))

1-R accuracy: 0.62165; DT accuracy: 0.644125


### KNN

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


#try to change C to 1000 or 0.001
C = 1.0  # SVM regularization parameter

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn_acc = knn.score(X,y)
print("kNN accuracy: {}".format(knn_acc))

kNN accuracy: 0.598675


### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X,y)
log_reg_acc = log_reg.score(X,y)
print("Logistic Regression accuracy: {}".format(log_reg_acc))

Logistic Regression accuracy: 0.637025


### Chi-sq

In [7]:
from sklearn.feature_selection import SelectKBest, chi2

x2 = SelectKBest(chi2, k=10)

X_train_x2 = x2.fit_transform(X_train,y)
X_test_x2 = x2.transform(X_test)

for feat_num in x2.get_support(indices=True):
    print(vectoriser.get_feature_names()[feat_num])

beef
cake
casserole
cooker
crock
crockpot
pot
roast
salad
slow


In [13]:
k=1000
x2 = SelectKBest(chi2, k=k)
x2.fit(X_train,y)
X_train_x2 = x2.transform(X_train)
X_test_x2 = x2.transform(X_test)

### Mutual Information

In [34]:
k=1000
mi = SelectKBest(score_func=mutual_info_classif, k=k)
mi.fit(X_train,y)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_test)

<10000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 28138 stored elements in Compressed Sparse Row format>

### Different Models

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

models = [GaussianNB(),
          MultinomialNB(),
          KNeighborsClassifier(n_neighbors=5),
          DecisionTreeClassifier(max_depth=None)]
#          svm.LinearSVC(C=C),
#          svm.SVC(kernel='rbf', gamma=0.7, C=C),
#          svm.SVC(kernel='poly', degree=3, C=C)]
titles = ['GNB',
          'MNB',
          '5-nearest neighbour',
          'Decision Tree']
#          'LinearSVC',
#          'SVM with a cubic kernel',
#          'SVM with an RBF kernel']

Xs = [(X_train, X_test), (X_train_x2, X_test_x2), (X_train_mi, X_test_mi)]
X_names = ['complete', 'x2', 'mi']
for title, model in zip(titles, models):
    print('\n',title, '(with k=',k,'features):')
    for X_name, X in zip(X_names, Xs):
        X_train_t, X_test_t = X
        model.fit(X_train_t.todense(), y)
        acc = model.score(X_train_t.todense(), y)
        print(X_name, 'acc',  acc)

In [26]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LogisticRegression()]
titles = ['GNB',
          'MNB',
          'Decision Tree',
          'KNN',
          'Logistic Regression']

for title, model in zip(titles, models):
    model.fit(X,y)
    acc = model.score(X,y)
    print(title, "Accuracy:",acc)

GNB Accuracy: 0.619575
MNB Accuracy: 0.523575
Decision Tree Accuracy: 0.644125
KNN Accuracy: 0.598675
Logistic Regression Accuracy: 0.637025


## Model Testing

In [69]:
import pickle
name_vec = pickle.load(open("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\train_name_countvectorizer.pkl", "rb"))
name_dict = name_vec.vocabulary_

step_vec = pickle.load(open("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\train_steps_countvectorizer.pkl", "rb"))
step_dict = steps_vec.vocabulary_

ingr_vec = pickle.load(open("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\train_ingr_countvectorizer.pkl", "rb"))
ingr_dict = ingr_vec.vocabulary_



### Select Features with CountVec and Mutual Information

In [66]:
# Extract features from 'name' with CountVectoriser
vectoriser = CountVectorizer()

name_train = train_df['name'].values
name_test = test_df['name'].values
X_train_name = name_vec.fit_transform(name_train)
X_test_name = name_vec.transform(name_test)

k=1000

In [67]:
# Feature Selection with Mutual Information for 'name'
mi = SelectKBest(score_func=mutual_info_classif, k=k)
name_train_mi = mi.fit_transform(X_train_name,y)
name_test_mi = mi.transform(X_test_name)

In [70]:
# Extract features from 'steps' with CountVectoriser
step_train = train_df['steps'].values
step_test = test_df['steps'].values
X_train_step = step_vec.fit_transform(step_train)
X_test_step = step_vec.transform(step_test)

In [71]:
# Feature Selection with Mutual Information for 'steps'
mi = SelectKBest(score_func=mutual_info_classif, k=k)
step_train_mi = mi.fit_transform(X_train_step,y)
step_test_mi = mi.transform(X_test_step)

In [72]:
# Extract features from 'ingredients' with CountVectoriser
ingr_train = train_df['ingredients'].values
ingr_test = test_df['ingredients'].values
X_train_ingr = ingr_vec.fit_transform(ingr_train)
X_test_ingr = ingr_vec.transform(ingr_test)

In [73]:
# Feature Selection with Mutual Information for 'steps'
mi = SelectKBest(score_func=mutual_info_classif, k=k)
ingr_train_mi = mi.fit_transform(X_train_ingr,y)
ingr_test_mi = mi.transform(X_test_ingr)

### Merging Features

In [74]:
new_train_df = train_df.copy()
new_train_df = new_train_df[['n_steps','n_ingredients']]

In [75]:
name_matrix = name_train_mi.todense()
name_list = name_matrix.tolist()
name_df = pd.DataFrame(name_list)
name_df = name_df.add_prefix('name_')

step_matrix = step_train_mi.todense()
step_list = step_matrix.tolist()
step_df = pd.DataFrame(step_list)
step_df = step_df.add_prefix('step_')

ingr_matrix = ingr_train_mi.todense()
ingr_list = ingr_matrix.tolist()
ingr_df = pd.DataFrame(ingr_list)
ingr_df = ingr_df.add_prefix('ingr_')

In [76]:
features_train = new_train_df.join(name_df)
features_train = features_train.join(step_df)
features_train = features_train.join(ingr_df)

In [77]:
features_train.shape

(40000, 3002)

### Model Training with Logistic Regression

In [78]:
log = LogisticRegression()
log.fit(features_train, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [79]:
log.score(features_train, y)

0.8285

### Model Testing

In [80]:
new_test_df = test_df.copy()
new_test_df = new_test_df[['n_steps','n_ingredients']]

name_matrix_test = name_test_mi.todense()
name_list_test = name_matrix_test.tolist()
name_df_test = pd.DataFrame(name_list_test)
name_df_test = name_df_test.add_prefix('name_')

step_matrix_test = step_test_mi.todense()
step_list_test = step_matrix_test.tolist()
step_df_test = pd.DataFrame(step_list_test)
step_df_test = step_df_test.add_prefix('step_')

ingr_matrix_test = ingr_test_mi.todense()
ingr_list_test = ingr_matrix_test.tolist()
ingr_df_test = pd.DataFrame(ingr_list_test)
ingr_df_test = ingr_df_test.add_prefix('ingr_')

features_test = new_test_df.join(name_df_test)
features_test = features_test.join(step_df_test)
features_test = features_test.join(ingr_df_test)

## Create csv file for Kaggle

In [81]:
ybar = log.predict(features_test)

In [82]:
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('predict.csv', index=False)