# Recipe Duration Classification: Feature Engineering and Baseline Models

### Global Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import time

pd.options.display.float_format = "{:,.4f}".format

%load_ext autoreload
%autoreload 2

## Train test split

In [2]:
from sklearn.model_selection import KFold

In [3]:
def split(s1=20, s2=5, rs=42, source="data/recipe_train.csv",
          train_out="data/train_small.csv", test_out="data/test_small.csv"):
    """Carve a small train/test pair out of the full training CSV.

    The rows of ``source`` are shuffled into ``s1`` folds and only the first
    held-out fold (roughly ``1/s1`` of the rows) is kept. That subset is then
    shuffled into ``s2`` folds: the first held-out fold becomes the small test
    set and the rest of the subset becomes the small training set.

    Both outputs receive an ``index`` column holding each row's position in
    ``source`` so that per-row side files can be joined back on later.

    Parameters
    ----------
    s1 : int
        Number of outer folds; controls how small the working subset is.
    s2 : int
        Number of inner folds; controls the train/test ratio of the subset.
    rs : int
        Random seed used for both shuffles.
    source : str
        CSV file (with a header row) to sample from.
    train_out, test_out : str
        Destination CSV paths for the small training and test sets.

    Returns
    -------
    None
        The two CSV files are written and their sizes are printed.
    """
    X = pd.read_csv(source, header=0)

    # Outer split: keep only the first held-out fold as the working subset.
    outer = KFold(n_splits=s1, shuffle=True, random_state=rs)
    _, subset = next(outer.split(X))

    # Inner split of that subset; the positions returned index into `subset`.
    inner = KFold(n_splits=s2, shuffle=True, random_state=rs)
    train_pos, test_pos = next(inner.split(subset))

    # Translate subset-relative positions back to row positions in X.
    train_rows = subset[train_pos]
    test_rows = subset[test_pos]

    train_small = X.iloc[train_rows].copy(deep=True)
    train_small['index'] = train_rows
    test_small = X.iloc[test_rows].copy(deep=True)
    test_small['index'] = test_rows

    train_small.to_csv(train_out, index=False)
    test_small.to_csv(test_out, index=False)

    print(f"Created training set containing {len(train_small.index)} instances")
    print(f"Created test set containing {len(test_small.index)} instances")

In [4]:
# Build data/train_small.csv and data/test_small.csv using the default fold counts (s1=20, s2=5) and seed (rs=42).
split()

Created training set containing 1600 instances
Created test set containing 400 instances


## Feature engineering

In [5]:
def conc_vec(train_file, test_file, vec, index='index', suffix='.csv', test_vec=None):
    """Append document-vector columns to a train/test pair of CSV files.

    Each row of a vector file (no header row) is matched to a dataset row by
    index: the value of the ``index`` column when one is named, otherwise the
    row's position in the file. Dataset rows without a matching vector row are
    dropped (inner merge). The vector columns are named ``vec_1`` ... ``vec_N``,
    where N is the number of columns found in the vector file.

    Parameters
    ----------
    train_file, test_file : str
        CSV files (with a header row) to extend.
    vec : str
        Header-less CSV of vectors merged into ``train_file`` and, unless
        ``test_vec`` is given, into ``test_file`` as well.
    index : str or None
        Column of the dataset files that holds the row number in the vector
        file. ``None`` aligns by row position instead.
    suffix : str
        Replaces the file extension of each input path to form its output
        path. The default ``'.csv'`` therefore overwrites ``.csv`` inputs.
    test_vec : str or None
        Optional separate vector file for ``test_file``. Use it when the test
        rows do not come from the same source as the training vectors.

    Returns
    -------
    None
        The merged files are written without their index (a named ``index``
        column is therefore not carried into the output) and a summary line is
        printed.
    """
    import os  # local import: keeps the function self-contained within the notebook

    def _load_vectors(path):
        # Read a header-less vector file and label its columns vec_1..vec_N.
        vectors = pd.read_csv(path, header=None)
        vectors.columns = [f'vec_{i+1}' for i in range(vectors.shape[1])]
        return vectors

    def _output_path(path):
        # Swap the extension for `suffix` instead of assuming a 4-character one.
        return os.path.splitext(path)[0] + suffix

    train = pd.read_csv(train_file, header=0, index_col=index)
    test = pd.read_csv(test_file, header=0, index_col=index)

    train_vectors = _load_vectors(vec)
    test_vectors = train_vectors if test_vec is None else _load_vectors(test_vec)

    train = pd.merge(train, train_vectors, left_index=True, right_index=True)
    test = pd.merge(test, test_vectors, left_index=True, right_index=True)

    train_out = _output_path(train_file)
    test_out = _output_path(test_file)
    train.to_csv(train_out, index=False)
    test.to_csv(test_out, index=False)

    merged_from = vec if test_vec is None else f"{vec} & {test_vec}"
    print(f"Merged {merged_from} to the dataset, saved to {train_out} & {test_out}")

In [6]:
# Attach the doc2vec step vectors to both small splits.
# With the default suffix ".csv" the outputs overwrite the inputs, and the saved files are written
# without the 'index' column that this call reads, so re-run split() before running this cell again.
conc_vec("data/train_small.csv", "data/test_small.csv", "data/train_steps_doc2vec50.csv")

Merged data/train_steps_doc2vec50.csv to the dataset, saved to data/train_small.csv & data/test_small.csv


## Text preprocessing

In [7]:
#### TODO: widen the verb range to cover all synonyms?
#### e.g. is_verb("peel")

In [8]:
from utilities.preprocessing import extract_verb

In [10]:
# Run verb extraction on both small splits.
# NOTE(review): no output path is given here; presumably the results are written back into the same
# CSVs (an 'n_verbs' feature is used by later cells) — confirm in utilities/preprocessing.py.
extract_verb("data/test_small.csv")
extract_verb("data/train_small.csv")

Processing data/test_small.csv ...
Progress: + + + + +
Steps processed into 11303 verbs, containing 640 unique verbs
Processed data/test_small.csv

Processing data/train_small.csv ...
Progress: + + + + +
Steps processed into 45881 verbs, containing 952 unique verbs
Processed data/train_small.csv



In [11]:
from utilities.verb_vec import verb_vec

In [12]:
# Build verb-vector features for the small train/test splits.
# NOTE(review): pca_num=50 matches the "Created 50 verb vectors" log line; presumably it is the number
# of PCA components kept (the v_vec_* columns used later) — confirm in utilities/verb_vec.py.
verb_vec("data/train_small.csv", "data/test_small.csv", pca_num=50)

Created 50 verb vectors feaatures


## 1R model

In [13]:
from models.model_1R import train_1R, predict_1R

In [14]:
# Count features to try as 1R predictors.
features = ['n_steps', 'n_ingredients', 'n_verbs']

# Train on the small training split, then predict on the small test split.
# NOTE(review): predict_1R presumably stores one '<feature>_1R_prediction' column per feature in the
# test CSV (those names appear in the evaluation output) — confirm in models/model_1R.py.
model = train_1R("data/train_small.csv", features)
predict_1R("data/test_small.csv", model)

Training 1R model on: ['n_steps', 'n_ingredients', 'n_verbs']

Predicted data/test_small.csv with ['n_steps', 'n_ingredients', 'n_verbs']



## Logistic regression

In [15]:
from models.log_regression import train_log, predict_log

In [16]:
# The training split is loaded only to discover which feature columns exist.
train = pd.read_csv("data/train_small.csv", header=0)

# Feature order: every column containing "n_" first (the label 'duration_label' also
# contains "n_" and is excluded explicitly), then every column containing "vec".
features = [
    *(name for name in train.columns if name != 'duration_label' and "n_" in name),
    *(name for name in train.columns if "vec" in name),
]

# Fit on the training split and predict on the test split with the same feature list.
model = train_log("data/train_small.csv", features)
predict_log("data/test_small.csv", features, model)

Training logistic regression on: ['n_steps', 'n_ingredients', 'n_verbs', 'vec_1', 'vec_2', 'vec_3', 'vec_4', 'vec_5', 'vec_6', 'vec_7', 'vec_8', 'vec_9', 'vec_10', 'vec_11', 'vec_12', 'vec_13', 'vec_14', 'vec_15', 'vec_16', 'vec_17', 'vec_18', 'vec_19', 'vec_20', 'vec_21', 'vec_22', 'vec_23', 'vec_24', 'vec_25', 'vec_26', 'vec_27', 'vec_28', 'vec_29', 'vec_30', 'vec_31', 'vec_32', 'vec_33', 'vec_34', 'vec_35', 'vec_36', 'vec_37', 'vec_38', 'vec_39', 'vec_40', 'vec_41', 'vec_42', 'vec_43', 'vec_44', 'vec_45', 'vec_46', 'vec_47', 'vec_48', 'vec_49', 'vec_50', 'v_vec_1', 'v_vec_2', 'v_vec_3', 'v_vec_4', 'v_vec_5', 'v_vec_6', 'v_vec_7', 'v_vec_8', 'v_vec_9', 'v_vec_10', 'v_vec_11', 'v_vec_12', 'v_vec_13', 'v_vec_14', 'v_vec_15', 'v_vec_16', 'v_vec_17', 'v_vec_18', 'v_vec_19', 'v_vec_20', 'v_vec_21', 'v_vec_22', 'v_vec_23', 'v_vec_24', 'v_vec_25', 'v_vec_26', 'v_vec_27', 'v_vec_28', 'v_vec_29', 'v_vec_30', 'v_vec_31', 'v_vec_32', 'v_vec_33', 'v_vec_34', 'v_vec_35', 'v_vec_36', 'v_vec_37', '

### *#placeholder*

## Evaluation

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [18]:
from utilities.evaluation import simple_accuracy

In [19]:
# Print the accuracy of each prediction column found in the small test split.
simple_accuracy("data/test_small.csv")

Accuracy for 'n_steps_1R_prediction': 57.75%
Accuracy for 'n_ingredients_1R_prediction': 59.75%
Accuracy for 'n_verbs_1R_prediction': 58.75%
Accuracy for 'log_prediction': 71.75%


In [20]:
from utilities.evaluation import evaluate

In [21]:
# Detailed report per prediction column: accuracy, per-class precision/recall/F-score and confusion matrix.
evaluate("data/test_small.csv")


Now analyzing performance of 'n_steps_1R_prediction'

Accuracy = 57.75%
   Precision  Recall  F_score
1     0.5829  0.5574   0.5698
2     0.5733  0.6719   0.6187
3     0.0000  0.0000   0.0000

Confusion matrix:
     1    2  3
1  102   81  0
2   63  129  0
3   10   15  0


Now analyzing performance of 'n_ingredients_1R_prediction'

Accuracy = 59.75%
   Precision  Recall  F_score
1     0.6496  0.4863   0.5563
2     0.5703  0.7812   0.6593
3     0.0000  0.0000   0.0000

Confusion matrix:
    1    2  3
1  89   94  0
2  42  150  0
3   6   19  0


Now analyzing performance of 'n_verbs_1R_prediction'

Accuracy = 58.75%
   Precision  Recall  F_score
1     0.5892  0.5956   0.5924
2     0.5860  0.6562   0.6192
3     0.0000  0.0000   0.0000

Confusion matrix:
     1    2  3
1  109   74  0
2   66  126  0
3   10   15  0


Now analyzing performance of 'log_prediction'

Accuracy = 71.75%
   Precision  Recall  F_score
1     0.7341  0.6940   0.7135
2     0.7136  0.7396   0.7263
3     0.6429  0.7200   

## Full Scale Tests

In [None]:
# Guard for the full-dataset verb extraction: leave run = 0 to skip it, set run = 1 to regenerate
# data/recipe_train_v.csv and data/recipe_test_v.csv.
# The following cells read those files, so they must already exist while the guard is off.
run = 0
if run:
    extract_verb("data/recipe_train.csv", output="data/recipe_train_v.csv")
    extract_verb("data/recipe_test.csv", output="data/recipe_test_v.csv")

In [None]:
# Merge the doc2vec step vectors into the verb-annotated full-scale files.
# index=None: the full files carry no 'index' column, so rows are aligned by position.
# suffix="_vec.csv": write data/recipe_train_v_vec.csv and data/recipe_test_v_vec.csv, the files the
# logistic-regression cell below loads. The default ".csv" suffix would overwrite the inputs and
# never create those files.
# NOTE(review): conc_vec merges this single train_steps vector file into the test file too, so test
# rows receive vectors computed for training recipes — confirm whether a test-specific vector file
# should be used instead.
conc_vec("data/recipe_train_v.csv", "data/recipe_test_v.csv", "data/train_steps_doc2vec50.csv", index=None, suffix="_vec.csv")

In [None]:
# The full training file is loaded only to discover which feature columns exist.
train = pd.read_csv("data/recipe_train_v_vec.csv", header=0)

# Feature order: every column containing "n_" first (the label 'duration_label' also
# contains "n_" and is excluded explicitly), then every column containing "vec".
features = [
    *(name for name in train.columns if name != 'duration_label' and "n_" in name),
    *(name for name in train.columns if "vec" in name),
]

# Fit on the full training file and predict on the full test file with the same feature list.
model = train_log("data/recipe_train_v_vec.csv", features)
predict_log("data/recipe_test_v_vec.csv", features, model)

In [None]:
# Demo of an in-place progress bar: each frame starts with "\r" so it overwrites the previous
# one; only the final frame ends with a newline.
for _step in range(8):
    print(
        "\rProgress: " + " ".join("+" if _slot < _step else "-" for _slot in range(7)),
        end="" if _step < 7 else "\n",
    )