# A Title

### Global Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
pd.options.display.float_format = "{:,.2f}".format

## Train/test split

In [3]:
from sklearn.model_selection import KFold

In [4]:
def split(s1=20, s2=5, random_state=42):
    """Carve a small train/test sample out of ``data/recipe_train.csv``.

    The full dataset is shuffled into ``s1`` folds and only the first
    held-out fold (roughly ``1/s1`` of the rows) is kept. That subset is
    shuffled again into ``s2`` folds: the first held-out fold becomes the
    small test set, the remaining rows of the subset the small training set.
    Both frames get an ``index`` column holding each row's position in the
    original file and are written to ``data/train_small.csv`` and
    ``data/test_small.csv`` (existing files are overwritten).

    Parameters
    ----------
    s1 : int, default 20
        Number of folds of the outer split; controls the size of the subset.
    s2 : int, default 5
        Number of folds of the inner split; controls the test share of the
        subset.
    random_state : int, RandomState instance or None, default 42
        Seed handed to both ``KFold`` shufflers so that repeated calls
        select the same rows. Pass ``None`` to draw a fresh random sample
        on every call.

    Raises
    ------
    ValueError
        Raised by ``KFold`` when a fold count exceeds the number of rows
        available to split.
    """
    X = pd.read_csv("data/recipe_train.csv", header=0)

    # Only the first fold of each split is used, so take it directly
    # instead of looping and breaking out after one iteration.
    outer = KFold(n_splits=s1, shuffle=True, random_state=random_state)
    _, subset = next(outer.split(X))

    inner = KFold(n_splits=s2, shuffle=True, random_state=random_state)
    train_pos, test_pos = next(inner.split(subset))

    # inner.split() yields positions *within* `subset`; translate them back
    # to row positions in X before selecting rows.
    train_rows = subset[train_pos]
    test_rows = subset[test_pos]

    train_small = X.iloc[train_rows].copy(deep=True)
    train_small['index'] = train_rows
    test_small = X.iloc[test_rows].copy(deep=True)
    test_small['index'] = test_rows

    train_small.to_csv("data/train_small.csv", index=False)
    test_small.to_csv("data/test_small.csv", index=False)

    print(f"created training set containing {len(train_small.index)} instances")
    print(f"created test set containing {len(test_small.index)} instances")

In [5]:
# Write data/train_small.csv and data/test_small.csv using split()'s default fold counts.
split()

created training set containing 1600 instances
created test set containing 400 instances


## Feature engineering

In [6]:
def conc_vec():
    """Re-split the data and attach the 50 doc2vec columns to both subsets.

    Calls ``split()`` to regenerate ``data/train_small.csv`` and
    ``data/test_small.csv``, loads them keyed by their ``index`` column, and
    joins each with ``data/train_steps_doc2vec50.csv`` on that key. The
    doc2vec file is read without a header row; its columns are named
    ``vec_1`` .. ``vec_50`` and its rows are keyed by position in the file.
    The results go to ``data/train_small_vec.csv`` and
    ``data/test_small_vec.csv``, written without the index.
    """
    split()

    sources = ("data/train_small.csv", "data/test_small.csv")
    targets = ("data/train_small_vec.csv", "data/test_small_vec.csv")

    subsets = [pd.read_csv(path, header=0, index_col='index') for path in sources]

    vec_names = [f'vec_{k}' for k in range(1, 51)]
    doc_vec = pd.read_csv("data/train_steps_doc2vec50.csv", names=vec_names)

    # Merge both subsets before writing anything, so a failure on either
    # merge leaves no output file half-updated.
    merged = [
        subset.merge(doc_vec, left_index=True, right_index=True)
        for subset in subsets
    ]
    for frame, path in zip(merged, targets):
        frame.to_csv(path, index=False)

    print("merged doc2vec50 to the dataset")

In [7]:
# Re-split the data and write data/train_small_vec.csv and data/test_small_vec.csv.
conc_vec()

created training set containing 1600 instances
created test set containing 400 instances
merged doc2vec50 to the dataset


## Text preprocessing

In [8]:
#### TODO: extend the verb range to include all synonyms?
#### e.g. is_verb("peel")

In [9]:
from utilities.preprocessing import preprocess

In [10]:
# Regenerate the small CSVs, then run the text preprocessing on each of them
# (test file first, then train file).
# NOTE(review): unless split() is seeded, this call selects different rows
# than the split performed inside conc_vec(), so data/*_small.csv and
# data/*_small_vec.csv would describe different samples — confirm intent.
split()
for small_csv in ("data/test_small.csv", "data/train_small.csv"):
    preprocess(small_csv)

created training set containing 1600 instances
created test set containing 400 instances


## 1R model

In [11]:
from models.model_1R import n_1R, predict_1R

In [12]:
# Train one 1R model per rule on the small training set and store its
# predictions for the small test set under the paired column name.
# `model` ends up bound to the last model trained ('ingredients').
for rule_1r, column_1r in (
    ('steps', 'steps_1R_prediction'),
    ('ingredients', 'ingredients_1R_prediction'),
):
    model = n_1R("data/train_small.csv", rule=rule_1r)
    predict_1R("data/test_small.csv", model, name=column_1r)

training on data/train_small.csv using rule: 'steps'
predicted data/test_small.csv, saved to column 'steps_1R_prediction'
training on data/train_small.csv using rule: 'ingredients'
predicted data/test_small.csv, saved to column 'ingredients_1R_prediction'


## Logistic regression - doc_vec_50

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Load the doc2vec-augmented subsets written by conc_vec().
train = pd.read_csv("data/train_small_vec.csv", header=0)
test = pd.read_csv("data/test_small_vec.csv", header=0)

# Features: every column whose name contains "vec"; target: 'duration_label'.
vec = [name for name in train.columns if "vec" in name]

X_train, y_train = train[vec], train['duration_label']
X_test, y_test = test[vec], test['duration_label']

# Fit on the training subset, then score the held-out subset.
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)

accuracy_score(y_test, prediction)

0.635

### *#placeholder*

## Evaluation

In [15]:
from utilities.evaluation import simple_accuracy

In [16]:
# Per the recorded output: prints the accuracy of each *_1R_prediction column in the test file.
simple_accuracy("data/test_small.csv")

Accuracy for 'steps_1R_prediction': 58.25%
Accuracy for 'ingredients_1R_prediction': 58.50%


In [17]:
from utilities.evaluation import evaluate

In [18]:
# Per the recorded output: prints per-class precision/recall/F-score and a
# confusion matrix for each prediction column in the test file.
evaluate("data/test_small.csv")


Now analyzing performance of 'steps_1R_prediction'

   Precision  Recall  F_score
1       0.59    0.51     0.55
2       0.58    0.71     0.63
3       0.00    0.00     0.00

Confusion matrix:
    1    2  3
1  96   93  0
2  57  137  0
3   9    8  0


Now analyzing performance of 'ingredients_1R_prediction'

   Precision  Recall  F_score
1       0.57    0.63     0.60
2       0.60    0.59     0.59
3       0.00    0.00     0.00

Confusion matrix:
     1    2  3
1  119   70  0
2   79  115  0
3    9    8  0

