In [1]:
import scipy.stats
import scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locations of the raw recipe tables, relative to the notebook's working directory.
_DATASET_DIR = "./COMP30027_2021_Project2_datasets/"
TRAIN_LOCATION = _DATASET_DIR + "recipe_train.csv"
TEST_LOCATION = _DATASET_DIR + "recipe_test.csv"

def preprocess(csv_location: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, using its first row as the header.

    Args:
        csv_location: Path of the CSV file to read.

    Returns:
        The file's contents exactly as ``pd.read_csv`` parses them.

    NOTE(review): an earlier comment here said 9999 should be changed to
    np.nan while reading. No such conversion is performed; confirm whether
    it is still wanted before adding it, since callers receive 9999 as-is.
    """
    return pd.read_csv(csv_location, header=0)

def create_csv_output(file_name: str, result: np.ndarray):
    """Write predictions to ``<file_name>.csv`` with a 1-based ``id`` column.

    The file has two columns: ``id`` (1, 2, 3, ...) and ``duration_label``
    (the values of ``result`` in order).

    Args:
        file_name: Output path without the ``.csv`` extension. It may contain
            a directory part (e.g. ``"zero-r/zero-r"``); missing directories
            are created.
        result: One predicted label per row.
    """
    import os  # local import: the notebook's import cell does not bring in os

    target = file_name + ".csv"

    # to_csv does not create directories, so make sure the parent exists first.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    output = pd.DataFrame({"duration_label": result})
    output.index += 1  # ids start at 1 rather than pandas' default 0
    output.to_csv(target, index_label="id")

def get_training(train_loc: str):
    """Load the training table and split it into features and labels.

    Args:
        train_loc: Path of the training CSV, read via ``preprocess``.

    Returns:
        A ``(X, y)`` tuple where ``X`` holds every column except the last
        and ``y`` is the last column.
    """
    frame = preprocess(train_loc)
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]
    return (features, labels)

def convert_to_proper_output(file_name: str):
    """Re-export ``<file_name>.csv`` as ``<file_name>-new.csv`` with 1-based ids.

    The source file is read with its first row as the header; the copy gains
    a leading ``id`` column counting rows from 1.

    Args:
        file_name: Path of the existing CSV without the ``.csv`` extension.
    """
    source_path = file_name + ".csv"
    target_path = file_name + "-new.csv"

    frame = pd.read_csv(source_path, header=0)
    frame.index = frame.index + 1  # shift the default 0-based index to start at 1
    frame.to_csv(target_path, index_label="id")

In [2]:
#Loading in the datasets within the zip files based on steps in README
import pickle
import scipy.sparse  # explicit: `import scipy` alone is not guaranteed to expose scipy.sparse

BASE_LOCATION = "./COMP30027_2021_Project2_datasets/"
COUNT_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_countvec/"
DOC2VEC50_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_doc2vec50/"
DOC2VEC100_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_doc2vec100/"


def _load_pickle(path: str):
    """Unpickle one object from `path`, closing the file handle afterwards."""
    # NOTE(review): pickle can run arbitrary code while loading; only use this
    # on files from a trusted source (here: the provided project datasets).
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _load_doc2vec(path: str) -> pd.DataFrame:
    """Read a header-less, comma-separated Doc2Vec feature file into a DataFrame."""
    return pd.read_csv(path, index_col = False, delimiter = ',', header=None)


#Pickled vectorizer objects for the name / ingredients / steps text.
#NOTE(review): file names suggest these are the fitted CountVectorizers behind the
#sparse matrices below (i.e. they hold the vocabulary) -- confirm against the README.
train_name_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_name_countvectorizer.pkl")
train_ingr_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_ingr_countvectorizer.pkl")
train_steps_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_steps_countvectorizer.pkl")

#bag-of-word sparse matrices. Similar to one-hot-encoding but for all words.
#column = word occurrence, row = individual instance
train_name_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_name_vec.npz')
train_ingr_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_ingr_vec.npz')
train_steps_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_steps_vec.npz')

test_name_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_name_vec.npz')
test_ingr_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_ingr_vec.npz')
test_steps_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_steps_vec.npz')

#Doc2Vec representation with 50 features. Think of them as 50 dimensional vectors representing the words/phrases
test_name_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "test_name_doc2vec50.csv")
test_ingr_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "test_ingr_doc2vec50.csv")
test_steps_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "test_steps_doc2vec50.csv")

train_name_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "train_name_doc2vec50.csv")
train_ingr_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "train_ingr_doc2vec50.csv")
train_steps_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "train_steps_doc2vec50.csv")

#Doc2Vec representation with 100 features. Think of them as 100 dimensional vectors representing the words/phrases
test_name_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "test_name_doc2vec100.csv")
test_ingr_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "test_ingr_doc2vec100.csv")
test_steps_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "test_steps_doc2vec100.csv")

train_name_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "train_name_doc2vec100.csv")
train_ingr_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "train_ingr_doc2vec100.csv")
train_steps_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "train_steps_doc2vec100.csv")

#Disabled vocabulary dump (very long output). Newer scikit-learn releases name this
#method get_feature_names_out().
#print("---countvec---")
#print(train_name_countvec.get_feature_names())

print("---vec---")
print(type(train_name_vec))
print(train_name_vec[0, :]) #words corresponding to an individual instance
#print(train_name_vec)

print("---vec50---")
print(type(test_name_vec50))
print(test_name_vec50)
print("---vec100---")
print(type(test_name_vec100))
print(test_name_vec100)


---vec---
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 1976)	1
  (0, 2916)	1
  (0, 3871)	1
  (0, 4437)	1
  (0, 8202)	1
  (0, 9458)	1
  (0, 10597)	1
---vec50---
<class 'pandas.core.frame.DataFrame'>
            0         1         2         3         4         5         6   \
0    -0.074475  0.043355 -0.200122  0.562771  0.051261 -0.283168  0.023642   
1    -0.083797 -0.119397  0.055886  0.459259 -0.129858  0.275230 -0.274989   
2    -0.188450  0.037819 -0.098069 -0.140659 -0.056237 -0.074347  0.172098   
3     0.039180  0.458914  0.201800  0.052737 -0.095670  0.231523  0.064118   
4     0.235192 -0.201566 -0.543494 -0.418995 -0.186713  0.058639 -0.241507   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.083350 -0.252363  0.080970 -0.064834 -0.209142 -0.106377  0.105602   
9996 -0.147893  0.043239  0.147160  0.014753  0.154295 -0.234711  0.132267   
9997 -0.471655 -0.206248 -0.141278  0.144359  0.111215 -0.557390  0.170998   
9998 -0.293525 -0.26355

In [21]:
#Zero-R baseline: always predicts the most frequent label seen in training
from sklearn import dummy

X, y = get_training(TRAIN_LOCATION)
# fit() returns the estimator itself, so construction and fitting can be chained
zero_r_clf = dummy.DummyClassifier(strategy = "most_frequent").fit(X, y)

test = preprocess(TEST_LOCATION)
output = zero_r_clf.predict(test)
create_csv_output("zero-r/zero-r", output)

In [None]:
#SVC: three support-vector variants trained on the feature columns at positions 1 and 2
from sklearn import svm

(X, y) = get_training(TRAIN_LOCATION)
# Slice positionally BEFORE converting: calling to_numpy() on the whole frame would
# first build an array spanning every column, only to discard most of it.
# NOTE(review): positions 1:3 are presumably n_steps and n_ingredients -- confirm.
X = X.iloc[:, 1:3].to_numpy()
y = y.to_numpy()

C = 1.0  # SVM regularization parameter

models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C, max_iter=10000),
          svm.SVC(kernel='rbf', gamma='auto', C=C))

titles = ('SVC-with-linear-kernel',
          'LinearSVC-linear-kernel',
          'SVC-with-RBF-kernel')

X_test = preprocess(TEST_LOCATION).iloc[:, 1:3].to_numpy()

# Fit inside the loop so each model's predictions are written as soon as it is
# trained, and `models` remains a reusable tuple of fitted estimators.
for (title, clf) in zip(titles, models):
    clf.fit(X, y)
    print("predicting " + title)
    output = clf.predict(X_test)
    print("creating output csv")
    create_csv_output("SVC/" + title, output)


In [5]:
#SVC (RBF kernel) on n_steps, n_ingredients and the 50-dimensional ingredient Doc2Vec vectors
from sklearn import svm


def _with_ingr_vec50(frame: pd.DataFrame, ingr_vec50: pd.DataFrame) -> pd.DataFrame:
    """Return frame's n_steps / n_ingredients columns joined column-wise with ingr_vec50.

    Rows are matched on the index (inner join). The Doc2Vec columns are relabelled
    "<original label>_ingr"; `ingr_vec50` itself is not modified.
    """
    counts = frame.loc[:, ["n_steps", "n_ingredients"]]
    embedding = ingr_vec50.copy()
    # str() gives every column a string label -- presumably so the combined frame
    # has uniformly typed column names when handed to scikit-learn.
    embedding.columns = [str(label) + "_ingr" for label in embedding.columns]
    return pd.concat([counts, embedding], axis = 1, join = 'inner')


# NOTE(review): this cell previously loaded the *ingredient* vectors but labelled
# them "_steps". The data source is unchanged here and only the labels were
# corrected; confirm whether train_steps_vec50 / test_steps_vec50 were intended.
(X, y) = get_training(TRAIN_LOCATION)
X = _with_ingr_vec50(X, train_ingr_vec50)
y = y.to_numpy()

C = 1.0  # SVM regularization parameter

# Only the RBF variant is trained in this cell.
model = svm.SVC(kernel='rbf', gamma='auto', C=C)
model = model.fit(X, y)
title = 'SVC-with-RBF-kernel'

X_test = _with_ingr_vec50(preprocess(TEST_LOCATION), test_ingr_vec50)

output = model.predict(X_test)
create_csv_output("SVC-adv/" + title, output)

'\nfor (title, clf) in zip(titles, models):\n    print("predicting " + title)\n    output = clf.predict(X_test)\n    print("creating output csv")\n    create_csv_output("SVC-adv/" + title, output) \n'