In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import scipy.stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Both recipe CSVs live in the same dataset directory; build the paths from one prefix.
_DATASET_DIR = "./COMP30027_2021_Project2_datasets/"
TEST_LOCATION = _DATASET_DIR + "recipe_test.csv"
TRAIN_LOCATION = _DATASET_DIR + "recipe_train.csv"

def get_data(csv_location: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, using its first row as the column names.

    Values are returned exactly as pandas parses them: no sentinel value
    (such as 9999) is converted to NaN here.

    Args:
        csv_location: Path of the CSV file to read.

    Returns:
        The parsed contents of the file.
    """
    return pd.read_csv(csv_location, header = 0)

def create_csv_output(file_name: str, result: np.ndarray):
    """Write predictions to `<file_name>.csv` in the submission layout.

    The file has two columns: `id`, numbered from 1 in the order of `result`,
    and `duration_label`, holding the predicted values.

    Args:
        file_name: Output path without the ".csv" extension. It may include
            directories (e.g. "zero-r/zero-r"); missing ones are created.
        result: One predicted label per row, in row order.
    """
    csv_path = file_name + ".csv"
    # to_csv does not create intermediate directories, so make sure the target
    # folder exists instead of failing after the model has already been run.
    Path(csv_path).parent.mkdir(parents = True, exist_ok = True)

    output = pd.DataFrame({"duration_label": result})
    output.index += 1  # ids are 1-based, the default index is 0-based
    output.to_csv(csv_path, index_label = "id")

def get_training(train_loc: str):
    """Read the training CSV and separate the label column from the rest.

    Args:
        train_loc: Path of the training CSV file.

    Returns:
        A `(X, y)` tuple: `X` is every column except the last one and `y` is
        the last column of the file.
    """
    frame = get_data(train_loc)
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]
    return (features, labels)

def preprocess_training(split = 0, rs = None):
    """Build the numeric training features from the training CSV.

    Only the `n_ingredients` and `n_steps` columns are kept (in that order),
    and both are replaced by their natural logarithm.

    Args:
        split: When greater than 0, passed to `train_test_split` as
            `test_size`, and the features are standardised with a scaler
            fitted on the training part only. Otherwise no split and no
            scaling is performed.
        rs: `random_state` forwarded to `train_test_split`.

    Returns:
        `(X, y)` when `split` is not greater than 0, where `X` is the
        log-transformed DataFrame and `y` the label column; otherwise
        `(X_train, X_test, y_train, y_test)` with the two feature sets
        passed through the fitted `StandardScaler`.
    """
    (X, y) = get_training(TRAIN_LOCATION)

    # Select the two columns first and transform the selection as a whole.
    # np.log then produces a brand-new frame, instead of assigning columns on
    # a frame that was itself sliced out of the training data (the pattern
    # that triggers pandas' SettingWithCopyWarning).
    # NOTE(review): np.log(0) is -inf; this assumes both counts are strictly
    # positive -- confirm against the data.
    X = np.log(X.loc[:, ["n_ingredients", "n_steps"]])

    if not split > 0:
        return (X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=rs)

    # Fit the scaler on the training part only so the held-out part does not
    # influence the scaling statistics.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return (X_train, X_test, y_train, y_test)

# Smoke check of the split path: show the first element of the returned tuple
# (the scaled training features).
output = preprocess_training(split = 0.2)
scaled_train_features = output[0]
print(scaled_train_features)

In [None]:
#Loading in the datasets within the zip files based on steps in README
import pickle

BASE_LOCATION = "./COMP30027_2021_Project2_datasets/"
COUNT_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_countvec/"
DOC2VEC50_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_doc2vec50/"
DOC2VEC100_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_doc2vec100/"


def _load_pickle(path: str):
    """Unpickle the object stored at `path`, closing the file afterwards.

    NOTE(security): unpickling can execute arbitrary code, so only use this on
    files from a trusted source (here: the provided project datasets).
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _load_doc2vec(path: str) -> pd.DataFrame:
    """Read a header-less, comma-separated feature file into a DataFrame."""
    return pd.read_csv(path, index_col = False, delimiter = ',', header=None)


#countvecs are essentially a one-hot-encoding of the most popular words.
#NOTE(review): the file names suggest these are fitted CountVectorizer objects -- confirm.
train_name_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_name_countvectorizer.pkl")
train_ingr_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_ingr_countvectorizer.pkl")
train_steps_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_steps_countvectorizer.pkl")

#bag-of-word sparse matrices. Similar to one-hot-encoding but for all words.
#column = word occurrence, row = individual instance
train_name_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_name_vec.npz')
train_ingr_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_ingr_vec.npz')
train_steps_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_steps_vec.npz')

test_name_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_name_vec.npz')
test_ingr_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_ingr_vec.npz')
test_steps_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_steps_vec.npz')

#Doc2Vec representation with 50 features. Think of them as 50 dimensional vectors representing the words/phrases
test_name_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "test_name_doc2vec50.csv")
test_ingr_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "test_ingr_doc2vec50.csv")
test_steps_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "test_steps_doc2vec50.csv")

train_name_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "train_name_doc2vec50.csv")
train_ingr_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "train_ingr_doc2vec50.csv")
train_steps_vec50 = _load_doc2vec(DOC2VEC50_VEC_LOCATION + "train_steps_doc2vec50.csv")

#Doc2Vec representation with 100 features. Think of them as 100 dimensional vectors representing the words/phrases
test_name_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "test_name_doc2vec100.csv")
test_ingr_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "test_ingr_doc2vec100.csv")
test_steps_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "test_steps_doc2vec100.csv")

train_name_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "train_name_doc2vec100.csv")
train_ingr_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "train_ingr_doc2vec100.csv")
train_steps_vec100 = _load_doc2vec(DOC2VEC100_VEC_LOCATION + "train_steps_doc2vec100.csv")

# Disabled debug output (uncomment to list the vocabulary of the name vectoriser):
# print("---countvec---")
# print(train_name_countvec.get_feature_names())

print("---vec---")
print(type(train_name_vec))
print(train_name_vec[0, :]) #words corresponding to an individual instance
#print(train_name_vec)

print("---vec50---")
print(type(test_name_vec50))
print(test_name_vec50)
print("---vec100---")
print(type(test_name_vec100))
print(test_name_vec100)


In [None]:
#Zero-R baseline: always predict the most frequent label seen in training
from sklearn import dummy

X, y = get_training(TRAIN_LOCATION)
# fit() returns the fitted estimator, so construction and fitting can be chained.
zero_r_clf = dummy.DummyClassifier(strategy = "most_frequent").fit(X, y)

test = get_data(TEST_LOCATION)
output = zero_r_clf.predict(test)
create_csv_output("zero-r/zero-r", output)

In [None]:
#SVC: fit three support-vector classifiers on two positional feature columns
#and write one prediction file per model.
from sklearn import svm

(X, y) = get_training(TRAIN_LOCATION)
# Slice the two columns *before* converting to NumPy, so only those columns
# are converted instead of turning the whole mixed-type frame into an object
# array first.
# NOTE(review): assumes positional columns 1 and 2 are the numeric count
# columns (n_steps / n_ingredients) -- confirm against the CSV header.
X = X.iloc[:, 1:3].to_numpy()
y = y.to_numpy()

C = 1.0  # SVM regularization parameter

models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C, max_iter=10000),
          svm.SVC(kernel='rbf', gamma='auto', C=C))

# Fit eagerly into a tuple. A generator here would be exhausted after the
# prediction loop (leaving `models` empty on any later use) and would delay
# each fit until the loop reaches it.
models = tuple(clf.fit(X, y) for clf in models)

titles = ('SVC-with-linear-kernel',
          'LinearSVC-linear-kernel',
          'SVC-with-RBF-kernel')

# Same positional columns as used for training.
X_test = get_data(TEST_LOCATION)
X_test = X_test.iloc[:, 1:3].to_numpy()

for (title, clf) in zip(titles, models):
    print("predicting " + title)
    output = clf.predict(X_test)
    print("creating output csv")
    create_csv_output("SVC/" + title, output)


In [None]:
#SVC (RBF kernel) on the log-scaled counts plus the 50-dimensional Doc2Vec "steps" features
from sklearn import svm

(X, y) = preprocess_training()

temp_steps_vec50 = train_steps_vec50.copy()
temp_steps_vec50.columns = [str(label) + "_steps" for label in temp_steps_vec50.columns]

X = pd.concat([X, temp_steps_vec50], axis = 1, join = 'inner')

C = 1.0  # SVM regularization parameter

model = svm.SVC(kernel='rbf', gamma='auto', C=C)
model = model.fit(X, y)
title = 'SVC-with-RBF-kernel'

# The test features must mirror the training features exactly. With split=0,
# preprocess_training() returns the log of n_ingredients and n_steps, in that
# column order, so the same selection, order and transform are applied here.
# (Previously the raw counts were used with the two columns swapped, so the
# model was asked to predict on features it had not been trained on.)
X_test = get_data(TEST_LOCATION)
X_test = np.log(X_test.loc[:, ["n_ingredients", "n_steps"]])

temp_steps_vec50 = test_steps_vec50.copy()
temp_steps_vec50.columns = [str(label) + "_steps" for label in temp_steps_vec50.columns]

X_test = pd.concat([X_test, temp_steps_vec50], axis = 1, join = 'inner')

output = model.predict(X_test)
create_csv_output("SVC-adv/" + title, output)

# Disabled multi-model variant, kept as comments so it is not echoed as the
# cell's output the way a bare string literal would be:
# for (title, clf) in zip(titles, models):
#     print("predicting " + title)
#     output = clf.predict(X_test)
#     print("creating output csv")
#     create_csv_output("SVC-adv/" + title, output)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline

# Sweep the width of a single hidden layer for an adam-trained MLP and record
# the score on a fixed 20% hold-out split for each width.
(X, y) = preprocess_training()

# NOTE(review): these are the *ingredient* Doc2Vec vectors, yet the variable
# name and the column suffix say "steps". Behaviour is left unchanged; confirm
# which feature set was intended and rename accordingly.
temp_steps_vec50 = train_ingr_vec50.copy()
temp_steps_vec50.columns = [str(label) + "_steps" for label in temp_steps_vec50.columns]

X = pd.concat([X, temp_steps_vec50], axis = 1, join = 'inner')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

score_out = []
layer_size = list(range(25, 35))
for i, width in enumerate(layer_size):
    print(i)  # progress indicator: index of the configuration being trained

    # (width,) is an explicit one-element tuple, i.e. one hidden layer with
    # `width` units. The previous `(layer_size[i])` was just a parenthesised
    # int, not a tuple.
    pipe = make_pipeline(StandardScaler(), MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes = (width,), random_state=42))
    pipe.fit(X_train, y_train)

    score_out.append(pipe.score(X_test, y_test))

print(score_out)

# Disabled plot from an earlier alpha sweep (`alphas` is not defined in this
# cell). Kept as comments so it is not echoed as the cell's output:
# plt.scatter(score_out, alphas)
# plt.xlabel("scores for neural networks")
# plt.ylabel("alpha score: 10^y")
# plt.show()

In [None]:
# Scatter plot of the recorded hold-out score against hidden-layer width.
graph_path = Path("./graphs/adam.png")
# savefig does not create missing directories; make sure ./graphs exists so
# the figure is not lost after the (slow) sweep has finished.
graph_path.parent.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
ax.scatter(layer_size, score_out)
ax.set_title("scores using adam")
ax.set_ylabel("scores for neural networks")
ax.set_xlabel("layer_size")
fig.savefig(graph_path, format="png")
plt.show()