In [None]:
import lightgbm as lgb
import scipy.stats
import scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

TEST_LOCATION = "./COMP30027_2021_Project2_datasets/recipe_test.csv"
TRAIN_LOCATION = "./COMP30027_2021_Project2_datasets/recipe_train.csv"

def get_data(csv_location: str) -> pd.DataFrame:
    """Read a CSV file whose first row is the header.

    Parameters
    ----------
    csv_location : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed table. No value substitution is performed: cells are
        returned exactly as pandas parses them.
    """
    return pd.read_csv(csv_location, header = 0)

def create_csv_output(file_name: str, result: np.ndarray):
    """Write predictions to ``<file_name>.csv`` in submission layout.

    The file has two columns: ``id`` (row numbers, shifted so the first
    row is 1 rather than 0) and ``duration_label`` (the values of
    ``result``).

    Parameters
    ----------
    file_name : str
        Output path without the ``.csv`` extension.
    result : np.ndarray
        Predicted labels, one per row.
    """
    submission = pd.DataFrame({"duration_label": result})
    # Shift the default 0-based index so ids start at 1.
    submission.index = submission.index + 1
    submission.to_csv(f"{file_name}.csv", index_label = "id")

def get_training(train_loc: str):
    """Load the training CSV and split it into features and labels.

    Parameters
    ----------
    train_loc : str
        Path of the training CSV.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except the last and
        ``y`` is the last column cast to ``int32``.
    """
    frame = get_data(train_loc)
    features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1].astype("int32")
    return (features, labels)

def preprocess_training(split = 0, rs = None):
    """Build the log-transformed numeric training features.

    Only ``n_ingredients`` and ``n_steps`` are kept, each replaced by its
    natural logarithm.

    Parameters
    ----------
    split : float, optional
        Passed to ``train_test_split`` as ``test_size``. When it is not
        greater than 0, no hold-out split (and no scaling) is performed.
    rs : optional
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X, y)`` (unscaled) when no split is requested; otherwise
        ``(X_train, X_test, y_train, y_test)`` with both feature sets
        standardised by a ``StandardScaler`` fitted on ``X_train`` only.
    """
    features, labels = get_training(TRAIN_LOCATION)
    for column in ("n_ingredients", "n_steps"):
        features[column] = np.log(features[column])
    features = features.loc[:, ["n_ingredients", "n_steps"]]

    # Guard clause: without a split there is nothing to scale.
    if not split > 0:
        return (features, labels)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=split, random_state=rs
    )

    # Scaling statistics come from the training part only.
    fitted = StandardScaler().fit(X_train)
    return (fitted.transform(X_train), fitted.transform(X_test), y_train, y_test)

def preprocess_testing():
    """Build the log-transformed numeric features of the test set.

    Returns
    -------
    pd.DataFrame
        The ``n_ingredients`` and ``n_steps`` columns (in that order) of
        the test CSV, each replaced by its natural logarithm.
    """
    frame = get_data(TEST_LOCATION)
    for column in ("n_ingredients", "n_steps"):
        frame[column] = np.log(frame[column])
    return frame.loc[:, ["n_ingredients", "n_steps"]]

In [None]:
#Loading in the datasets within the zip files based on steps in README
import pickle

BASE_LOCATION = "./COMP30027_2021_Project2_datasets/"
COUNT_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_countvec/"
DOC2VEC50_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_doc2vec50/"
DOC2VEC100_VEC_LOCATION = BASE_LOCATION + "recipe_text_features_doc2vec100/"


def _load_pickle(path):
    """Unpickle one object from `path`, closing the file handle afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _read_doc2vec(location, file_name):
    """Read one header-less, comma-separated Doc2Vec CSV into a DataFrame."""
    return pd.read_csv(location + file_name, index_col = False, delimiter = ',', header=None)


# Pickled count-vectoriser objects for the training text fields.
# NOTE(review): presumably fitted vectorisers holding the vocabulary - confirm
# against the dataset README.
train_name_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_name_countvectorizer.pkl")
train_ingr_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_ingr_countvectorizer.pkl")
train_steps_countvec = _load_pickle(COUNT_VEC_LOCATION + "train_steps_countvectorizer.pkl")

# Bag-of-words sparse matrices: column = word occurrence, row = individual instance.
train_name_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_name_vec.npz')
train_ingr_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_ingr_vec.npz')
train_steps_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'train_steps_vec.npz')

test_name_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_name_vec.npz')
test_ingr_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_ingr_vec.npz')
test_steps_vec = scipy.sparse.load_npz(COUNT_VEC_LOCATION + 'test_steps_vec.npz')

# Doc2Vec representation with 50 features: one 50-dimensional vector per text field.
test_name_vec50 = _read_doc2vec(DOC2VEC50_VEC_LOCATION, "test_name_doc2vec50.csv")
test_ingr_vec50 = _read_doc2vec(DOC2VEC50_VEC_LOCATION, "test_ingr_doc2vec50.csv")
test_steps_vec50 = _read_doc2vec(DOC2VEC50_VEC_LOCATION, "test_steps_doc2vec50.csv")

train_name_vec50 = _read_doc2vec(DOC2VEC50_VEC_LOCATION, "train_name_doc2vec50.csv")
train_ingr_vec50 = _read_doc2vec(DOC2VEC50_VEC_LOCATION, "train_ingr_doc2vec50.csv")
train_steps_vec50 = _read_doc2vec(DOC2VEC50_VEC_LOCATION, "train_steps_doc2vec50.csv")

# Doc2Vec representation with 100 features: one 100-dimensional vector per text field.
test_name_vec100 = _read_doc2vec(DOC2VEC100_VEC_LOCATION, "test_name_doc2vec100.csv")
test_ingr_vec100 = _read_doc2vec(DOC2VEC100_VEC_LOCATION, "test_ingr_doc2vec100.csv")
test_steps_vec100 = _read_doc2vec(DOC2VEC100_VEC_LOCATION, "test_steps_doc2vec100.csv")

train_name_vec100 = _read_doc2vec(DOC2VEC100_VEC_LOCATION, "train_name_doc2vec100.csv")
train_ingr_vec100 = _read_doc2vec(DOC2VEC100_VEC_LOCATION, "train_ingr_doc2vec100.csv")
train_steps_vec100 = _read_doc2vec(DOC2VEC100_VEC_LOCATION, "train_steps_doc2vec100.csv")

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

def _with_vec50_features(base, name_vec, ingr_vec, steps_vec):
    """Append suffixed copies of the three Doc2Vec-50 frames to `base` column-wise."""
    blocks = [base]
    for frame, suffix in ((name_vec, "_name"), (ingr_vec, "_ingrs"), (steps_vec, "_steps")):
        renamed = frame.copy()
        # The suffix keeps the 0..49 labels of the three frames distinct.
        renamed.columns = [str(label) + suffix for label in renamed.columns]
        blocks.append(renamed)
    return pd.concat(blocks, axis = 1, join = 'inner')


def preproc_vec50_kbest(k_val = 50, split = 0, rs = None):
    """Combine numeric and Doc2Vec-50 features, min-max scale, keep the k best by chi2.

    Parameters
    ----------
    k_val : int, optional
        Number of features ``SelectKBest`` keeps.
    split : float, optional
        When greater than 0, a hold-out split of the training data with this
        ``test_size`` is returned instead of the real test set.
    rs : optional
        ``random_state`` forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        With ``split > 0``: ``(X_train, X_test, y_train, y_test)`` taken from
        the training data. Otherwise ``(X_train, X_test, y, None)`` where
        ``X_test`` is built from the test CSV and the test Doc2Vec-50 frames.
        In both cases the scaler and the selector are fitted on the training
        part only.
    """
    (X, y) = preprocess_training()
    X_train_50 = _with_vec50_features(X, train_name_vec50, train_ingr_vec50, train_steps_vec50)
    scaler = MinMaxScaler()

    if split > 0:
        X_train, X_test, y_train, y_test = train_test_split(X_train_50, y, test_size = split, random_state = rs)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        chi = SelectKBest(chi2, k=k_val)
        X_train = chi.fit_transform(X_train, y_train)
        X_test = chi.transform(X_test)

        return (X_train, X_test, y_train, y_test)

    # The test matrix needs the same columns, in the same order, as the
    # training matrix; otherwise the fitted scaler/selector cannot be applied.
    X_test = _with_vec50_features(preprocess_testing(), test_name_vec50, test_ingr_vec50, test_steps_vec50)

    X_train_50 = scaler.fit_transform(X_train_50)
    X_test = scaler.transform(X_test)

    k_best = SelectKBest(chi2, k=k_val).fit(X_train_50, y)
    # transform() takes only the feature matrix; labels are needed by fit() alone.
    X_train_50 = k_best.transform(X_train_50)
    X_test = k_best.transform(X_test)
    return (X_train_50, X_test, y, None)

# output = preproc_vec50_kbest()
# print(output[0])

In [None]:
from sklearn.pipeline import make_pipeline

# Numeric recipe metadata that is kept alongside the Doc2Vec-50 embeddings.
(X, y) = get_training(TRAIN_LOCATION)
X = X.loc[:, ["n_steps", "n_ingredients"]]

# Each embedding frame has the same integer column labels, so every frame gets
# its own suffix to keep the combined columns unique.
temp_train_name_vec50 = train_name_vec50.copy()
temp_train_name_vec50.columns = [str(label) + "_name" for label in temp_train_name_vec50.columns]

temp_train_ingr_vec50 = train_ingr_vec50.copy()
temp_train_ingr_vec50.columns = [str(label) + "_ingrs" for label in temp_train_ingr_vec50.columns]

temp_train_steps_vec50 = train_steps_vec50.copy()
# Rename from this frame's own columns (not the already-suffixed name columns).
temp_train_steps_vec50.columns = [str(label) + "_steps" for label in temp_train_steps_vec50.columns]

X_combined = pd.concat([X, temp_train_name_vec50, temp_train_ingr_vec50, temp_train_steps_vec50], axis=1, join='inner')

# Labels are shifted from 1, 2, 3 to 0, 1, 2 for the multiclass objective.
# A fixed random_state makes the hold-out split reproducible across runs.
X_50_train, X_50_test, y_50_train, y_50_test = train_test_split(X_combined, y - 1, test_size=0.2, random_state=42)

In [None]:
# Standardise both parts with statistics learned from the training split only.
scaler = StandardScaler().fit(X_50_train)
X_50_train, X_50_test = (scaler.transform(part) for part in (X_50_train, X_50_test))

In [None]:
# Training configuration: gradient-boosted trees, 3-way multiclass objective.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
)

# LightGBM datasets for fitting and for validation during training.
train_data = lgb.Dataset(data=X_50_train, label=y_50_train)
eval_data = lgb.Dataset(data=X_50_test, label=y_50_test)

In [None]:
import pickle

# Train for 100 boosting rounds, evaluating on the hold-out set.
# NOTE(review): `verbose_eval` was removed in newer LightGBM releases in favour
# of logging callbacks - confirm against the installed version.
gbm = lgb.train(params, train_data, valid_sets=eval_data, num_boost_round=100, verbose_eval=5)

# Persist the trained booster; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/LightGBM-GBDT-50.sav", "wb") as model_file:
    pickle.dump(gbm, model_file)

In [None]:
# Score the hold-out set, then take the highest-scoring class index per row.
preds = gbm.predict(X_50_test)
y_pred = [np.argmax(class_scores) for class_scores in preds]

In [None]:
from sklearn.metrics import accuracy_score
# Hold-out accuracy of the predicted class indices (displayed as the cell output).
accuracy_score(y_true=y_50_test, y_pred=y_pred)

In [None]:
# Read the test CSV once: `test` keeps the full frame, X_test the numeric columns.
test = get_data(TEST_LOCATION)
X_test = test.loc[:, ["n_steps", "n_ingredients"]]

# Suffix each embedding frame's columns so the combined columns stay unique.
temp_test_name_vec50 = test_name_vec50.copy()
temp_test_name_vec50.columns = [str(label) + "_name" for label in temp_test_name_vec50.columns]

temp_test_ingr_vec50 = test_ingr_vec50.copy()
temp_test_ingr_vec50.columns = [str(label) + "_ingrs" for label in temp_test_ingr_vec50.columns]

temp_test_steps_vec50 = test_steps_vec50.copy()
# Rename from this frame's own columns (not the already-suffixed name columns).
temp_test_steps_vec50.columns = [str(label) + "_steps" for label in temp_test_steps_vec50.columns]

# Same column order as the training matrix: numeric, name, ingredients, steps.
X_test = pd.concat([X_test, temp_test_name_vec50, temp_test_ingr_vec50, temp_test_steps_vec50], axis=1, join='inner')


In [None]:
# Reuse the scaler fitted on the training split. Fitting a fresh scaler on the
# test set would standardise it with different means/variances than the ones
# the model was trained on.
# The data is passed as a plain array so the scaling is applied by column
# position; the test matrix is built in the same column order as the training one.
X_test = scaler.transform(np.asarray(X_test))

In [None]:
# Per-class scores for every test row from the trained booster.
submission_preds = gbm.predict(data=X_test)

In [None]:
# Pick the highest-scoring class index for each test row.
submission_y_preds = [np.argmax(class_scores) for class_scores in submission_preds]

In [None]:
# Undo the `y - 1` shift applied before training so labels are 1, 2, 3 again,
# then write them out as float32 values.
submission_labels = (np.array(submission_y_preds) + 1).astype("float32")
create_csv_output("DecisionTree/LightGBM-GBDT-50", submission_labels)