# Recipe duration classification: 1R and logistic regression baselines

### Global Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

pd.options.display.float_format = "{:,.2f}".format

%load_ext autoreload
%autoreload 2

## Train/test split

In [2]:
from sklearn.model_selection import KFold

In [3]:
def split(s1=20, s2=5, rs=42):
    """Write a small train/test pair sampled from ``data/recipe_train.csv``.

    The rows are shuffled into ``s1`` folds and only the held-out part of the
    first outer split is kept. That subset is shuffled into ``s2`` folds, and
    the first inner split supplies the training rows (written to
    ``data/train_small.csv``) and the test rows (``data/test_small.csv``).
    Both files get an extra ``index`` column holding each row's position in
    the source CSV.

    Parameters
    ----------
    s1 : int
        Number of outer folds; one fold's worth of rows is kept.
    s2 : int
        Number of inner folds; one fold's worth of the kept rows becomes the
        test set, the rest the training set.
    rs : int
        ``random_state`` passed to both ``KFold`` shufflers.

    Returns
    -------
    None
        The results are the two CSV files and two printed size lines.
    """
    X = pd.read_csv("data/recipe_train.csv", header=0)

    # Outer split: only the first fold is needed, so take it directly instead
    # of looping and breaking. `subset` holds row positions into X.
    outer = KFold(n_splits=s1, shuffle=True, random_state=rs)
    _, subset = next(outer.split(X))

    # Inner split: the yielded arrays are positions into `subset`, so map them
    # back to row positions in X before selecting.
    inner = KFold(n_splits=s2, shuffle=True, random_state=rs)
    train_pos, test_pos = next(inner.split(subset))
    train_rows, test_rows = subset[train_pos], subset[test_pos]

    train_small = X.iloc[train_rows].copy(deep=True)
    train_small['index'] = train_rows
    test_small = X.iloc[test_rows].copy(deep=True)
    test_small['index'] = test_rows

    train_small.to_csv("data/train_small.csv", index=False)
    test_small.to_csv("data/test_small.csv", index=False)

    print(f"Created training set containing {len(train_small.index)} instances")
    print(f"Created test set containing {len(test_small.index)} instances")

In [4]:
# Build data/train_small.csv and data/test_small.csv with the default settings (s1=20, s2=5, rs=42).
split()

Created training set containing 1600 instances
Created test set containing 400 instances


## Feature engineering

In [5]:
def conc_vec():
    """Join the 50 doc2vec columns onto the small train and test sets.

    Reads ``data/train_small.csv`` and ``data/test_small.csv`` with their
    ``index`` column as the row index, reads ``data/train_steps_doc2vec50.csv``
    as a header-less table whose columns are named ``vec_1`` .. ``vec_50``,
    and merges index-on-index. The merged frames are written to
    ``data/train_small_vec.csv`` and ``data/test_small_vec.csv`` without the
    index, and a confirmation line is printed.
    """
    vec_columns = [f'vec_{k}' for k in range(1, 51)]

    small_sets = [
        pd.read_csv(path, header=0, index_col='index')
        for path in ("data/train_small.csv", "data/test_small.csv")
    ]
    doc_vec = pd.read_csv("data/train_steps_doc2vec50.csv", names=vec_columns)

    # Default (inner) merge on the row index: only rows present in both frames survive.
    with_vec = [
        frame.merge(doc_vec, left_index=True, right_index=True)
        for frame in small_sets
    ]

    targets = ("data/train_small_vec.csv", "data/test_small_vec.csv")
    for frame, target in zip(with_vec, targets):
        frame.to_csv(target, index=False)

    print("merged doc2vec50 to the dataset")

In [6]:
# Write data/train_small_vec.csv and data/test_small_vec.csv (the small sets plus the vec_* columns).
conc_vec()

merged doc2vec50 to the dataset


## Text preprocessing

In [7]:
#### TODO: change verb range to cover all synonyms?
#### e.g. is_verb("peel")

In [8]:
from utilities.preprocessing import extract_verb

In [9]:
# Rewrite the small CSVs, then run verb extraction over each file (test file first).
# NOTE(review): extract_verb presumably adds verb-derived columns such as
# 'n_verbs' to the CSV it is given — confirm in utilities.preprocessing.
split()
for csv_path in ("data/test_small.csv", "data/train_small.csv"):
    extract_verb(csv_path)

Created training set containing 1600 instances
Created test set containing 400 instances
Steps processed into 6776 verbs, containing 464 unique verbs
Processed data/test_small.csv

Steps processed into 27344 verbs, containing 726 unique verbs
Processed data/train_small.csv



## 1R model

In [10]:
from models.model_1R import train_1R, predict_1R

In [11]:
# Columns handed to the 1R trainer.
# NOTE(review): 'n_verbs' is not created by split(); presumably extract_verb adds it — confirm.
features = ['n_steps', 'n_ingredients', 'n_verbs']

# Train on the small training CSV, then predict on the small test CSV.
# NOTE(review): predict_1R presumably stores '<feature>_1R_prediction' columns
# in the test CSV, since the evaluation cells read them from there — confirm.
model = train_1R("data/train_small.csv", features)
predict_1R("data/test_small.csv", model)

Training 1R model on: ['n_steps', 'n_ingredients', 'n_verbs']

Predicted data/test_small.csv with ['n_steps', 'n_ingredients', 'n_verbs']



## Logistic regression

In [12]:
from models.log_regression import train_log, predict_log

In [14]:
train = pd.read_csv("data/train_small_vec.csv", header=0)

# Select features by column-name prefix instead of substring. A bare
# `"n_" in col` test also matches 'duration_label' (it contains "n_"), which
# then has to be excluded by hand; prefix matching cannot pick it up.
# The 'n_*' columns come first, then the 'vec_*' columns, each in file order.
# NOTE(review): the recorded feature list has no 'n_verbs'. conc_vec() writes
# train_small_vec.csv before extract_verb runs — confirm this is intended.
count_features = [col for col in train.columns if col.startswith("n_")]
vec_features = [col for col in train.columns if col.startswith("vec_")]
features = count_features + vec_features

model = train_log("data/train_small_vec.csv", features)
predict_log("data/test_small_vec.csv", features, model)

Training logistic regression on: ['n_steps', 'n_ingredients', 'vec_1', 'vec_2', 'vec_3', 'vec_4', 'vec_5', 'vec_6', 'vec_7', 'vec_8', 'vec_9', 'vec_10', 'vec_11', 'vec_12', 'vec_13', 'vec_14', 'vec_15', 'vec_16', 'vec_17', 'vec_18', 'vec_19', 'vec_20', 'vec_21', 'vec_22', 'vec_23', 'vec_24', 'vec_25', 'vec_26', 'vec_27', 'vec_28', 'vec_29', 'vec_30', 'vec_31', 'vec_32', 'vec_33', 'vec_34', 'vec_35', 'vec_36', 'vec_37', 'vec_38', 'vec_39', 'vec_40', 'vec_41', 'vec_42', 'vec_43', 'vec_44', 'vec_45', 'vec_46', 'vec_47', 'vec_48', 'vec_49', 'vec_50']

Predicted data/test_small_vec.csv, saved to column 'log_prediction'



### *#placeholder*

## Evaluation

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [16]:
from utilities.evaluation import simple_accuracy

In [17]:
# Print one accuracy line per prediction column found in the 1R test file.
simple_accuracy("data/test_small.csv")

Accuracy for 'n_steps_1R_prediction': 57.75%
Accuracy for 'n_ingredients_1R_prediction': 59.75%
Accuracy for 'n_verbs_1R_prediction': 61.25%


In [18]:
from utilities.evaluation import evaluate

In [19]:
# Per-class precision / recall / F-score and a confusion matrix for each 1R prediction column.
evaluate("data/test_small.csv")


Now analyzing performance of 'n_steps_1R_prediction'

   Precision  Recall  F_score
1       0.58    0.56     0.57
2       0.57    0.67     0.62
3       0.00    0.00     0.00

Confusion matrix:
     1    2  3
1  102   81  0
2   63  129  0
3   10   15  0


Now analyzing performance of 'n_ingredients_1R_prediction'

   Precision  Recall  F_score
1       0.65    0.49     0.56
2       0.57    0.78     0.66
3       0.00    0.00     0.00

Confusion matrix:
    1    2  3
1  89   94  0
2  42  150  0
3   6   19  0


Now analyzing performance of 'n_verbs_1R_prediction'

   Precision  Recall  F_score
1       0.63    0.56     0.59
2       0.60    0.74     0.66
3       0.00    0.00     0.00

Confusion matrix:
     1    2  3
1  103   80  0
2   50  142  0
3   11   14  0



In [22]:
# Same report for the logistic-regression predictions (column 'log_prediction').
evaluate("data/test_small_vec.csv")


Now analyzing performance of 'log_prediction'

Accuracy = 68.25%
   Precision  Recall  F_score
1       0.71    0.69     0.70
2       0.68    0.71     0.70
3       0.48    0.40     0.43

Confusion matrix:
     1    2   3
1  126   53   4
2   48  137   7
3    3   12  10

