# A Title

### Global Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import time

pd.options.display.float_format = "{:,.4f}".format

%load_ext autoreload
%autoreload 2

## Train test split

In [2]:
from sklearn.model_selection import KFold

In [3]:
def split(s1=20, s2=5, rs=42):
    X = pd.read_csv("data/recipe_train.csv", header=0)

    # change split scale here
    kf = KFold(n_splits=s1, shuffle=True, random_state=rs)
    kf.get_n_splits(X)

    # split to X to a(train), b(test)
    for a, b in kf.split(X):

        kf = KFold(n_splits=s2, shuffle=True, random_state=rs)
        kf.get_n_splits(b)
        
        # split again on b to c & d
        for c, d in kf.split(b):

            train_small = X.iloc[b[c]].copy(deep=True)
            train_small['index'] = b[c]
            test_small = X.iloc[b[d]].copy(deep=True)
            test_small['index'] = b[d]

            train_small.to_csv("data/train_small.csv", index=False)
            test_small.to_csv("data/test_small.csv", index=False)

            print(f"Created training set containing {len(train_small.index)} instances")
            print(f"Created test set containing {len(test_small.index)} instances")

            break
        break

In [4]:
split()

Created training set containing 1600 instances
Created test set containing 400 instances


## Engineering

In [5]:
def conc_vec(train_file, test_file, vec, index='index', suffix='.csv'):
    """concatenate doc2vec.csv to the dataset"""
    
    train = pd.read_csv(train_file, header=0, index_col=index)
    test = pd.read_csv(test_file, header=0, index_col=index)
    doc_vec = pd.read_csv(vec, names=[(f'vec_{i+1}') for i in range(50)])

    train = pd.merge(train, doc_vec, left_index=True, right_index=True)
    test = pd.merge(test, doc_vec, left_index=True, right_index=True)

    train.to_csv(train_file[:-4]+suffix, index=False)
    test.to_csv(test_file[:-4]+suffix, index=False)
    
    print(f"Merged {vec} to the dataset, saved to {train_file[:-4]+suffix} & {test_file[:-4]+suffix}")

In [6]:
conc_vec("data/train_small.csv", "data/test_small.csv", "data/train_steps_doc2vec50.csv")

Merged data/train_steps_doc2vec50.csv to the dataset, saved to data/train_small.csv & data/test_small.csv


## Text preprocessing

In [7]:
#### TODO: change verb range to all synonym?
#### e.g. is_verb("peel")

In [8]:
from utilities.preprocessing import extract_verb

In [9]:
extract_verb("data/test_small.csv")
extract_verb("data/train_small.csv")

Processing data/test_small.csv ...
Progress: + + + + +
Steps processed into 11303 verbs, containing 640 unique verbs
Processed data/test_small.csv

Processing data/train_small.csv ...
Progress: + + + + +
Steps processed into 45881 verbs, containing 952 unique verbs
Processed data/train_small.csv



In [10]:
from utilities.verb_vec import verb_vec

In [11]:
verb_vec("data/train_small.csv", "data/test_small.csv", pca_num=50)

Created 50 verb vectors features


## SVM

In [12]:
from sklearn.svm import SVC, LinearSVC

# train
train = pd.read_csv("data/train_small.csv", header=0)

# double check given features
features = [f for f in features if 'v_vec' in f]
print(f"Training SVM on: {features}\n")

# extract features
X_train = train.loc[:, features]
y_train = train['duration_label']

NameError: name 'features' is not defined

In [None]:
C = 1.0  # SVM regularization parameter

models = (SVC(kernel='linear', C=C),
          LinearSVC(C=C, max_iter=2000),
          SVC(kernel='rbf', gamma=0.7, C=C),
          SVC(kernel='poly', degree=3, gamma='auto', C=C))

models = (clf.fit(X_train, y_train) for clf in models)

In [None]:
# predict
test = pd.read_csv("data/test_small.csv", header=0)
X_test = test.loc[:, features]
y_test = test.loc[:, 'duration_label']

predictions = (clf.predict(X_test) for clf in models)

In [None]:
for pre in predictions:
    # test
    score = accuracy_score(y_test, pre)
    print(f"Accuracy : {(score*100):.2f}%")

    precision = precision_score(y_test, pre, average=None, zero_division=0)
    recall = recall_score(y_test, pre, average=None, zero_division=0)
    f1 = f1_score(y_test, pre, average=None, zero_division=0)

    score = pd.DataFrame({'Precision':precision, "Recall":recall, "F_score":f1}, index=[1,2,3])
    print(score)

    matrix = confusion_matrix(y_test, pre)
    matrix = pd.DataFrame(matrix, index=[1,2,3], columns=[1,2,3])
    print("\nConfusion matrix:")
    print(matrix, end='\n\n')

## 1R model

In [None]:
from models.model_1R import train_1R, predict_1R

In [None]:
features = ['n_steps', 'n_ingredients', 'n_verbs']

model = train_1R("data/train_small.csv", features)
predict_1R("data/test_small.csv", model)

## Logistic regression

In [None]:
from models.log_regression import train_log, predict_log

In [None]:
train = pd.read_csv("data/train_small.csv", header=0)
features = [col for col in train.columns if (("n_" in col) and col != 'duration_label')] + [col for col in train.columns if ("vec" in col)]

model = train_log("data/train_small.csv", features)
predict_log("data/test_small.csv", features, model)

### *#placeholder*

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [None]:
from utilities.evaluation import simple_accuracy

In [None]:
simple_accuracy("data/test_small.csv")

In [None]:
from utilities.evaluation import evaluate

In [None]:
evaluate("data/test_small.csv")

## Full Scale Tests

In [None]:
run = 0
if run:
    extract_verb("data/recipe_train.csv", output="data/recipe_train_v.csv")
    extract_verb("data/recipe_test.csv", output="data/recipe_test_v.csv")

In [None]:
conc_vec("data/recipe_train_v.csv", "data/recipe_test_v.csv", "data/train_steps_doc2vec50.csv", index=None)

In [None]:
train = pd.read_csv("data/recipe_train_v_vec.csv", header=0)
features = [col for col in train.columns if (("n_" in col) and col != 'duration_label')] + [col for col in train.columns if ("vec" in col)]

model = train_log("data/recipe_train_v_vec.csv", features)
predict_log("data/recipe_test_v_vec.csv", features, model)

In [None]:
print("\rProgress: - - - - - - -", end='')
print("\rProgress: + - - - - - -", end='')
print("\rProgress: + + - - - - -", end='')
print("\rProgress: + + + - - - -", end='')
print("\rProgress: + + + + - - -", end='')
print("\rProgress: + + + + + - -", end='')
print("\rProgress: + + + + + + -", end='')
print("\rProgress: + + + + + + +")