# Recipe Duration Classification: Feature Engineering and Baseline Models

### Global Imports

In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Render floats in pandas output with thousands separators and 4 decimal places.
pd.options.display.float_format = "{:,.4f}".format

# Reload edited modules automatically before each cell runs, so changes to the
# project's own .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Train test split

In [2]:
from sklearn.model_selection import KFold

In [43]:
def split(s1=20, s2=5, rs=42, source="data/recipe_train.csv",
          train_out="data/train_small.csv", test_out="data/test_small.csv"):
    """Carve a small train/test sample out of a larger CSV.

    The rows of ``source`` are shuffled into ``s1`` folds and only the held-out
    part of the first fold (roughly 1/s1 of the rows) is kept as the sample.
    The sample is shuffled into ``s2`` folds; the held-out part of the first
    of these becomes the small test set, the rest the small training set.

    Both frames receive an ``index`` column holding each row's position in
    ``source`` and are written as CSV without the pandas index.

    Parameters
    ----------
    s1 : int
        Number of folds of the outer split (controls the sample size).
    s2 : int
        Number of folds of the inner split (controls the train/test ratio).
    rs : int
        ``random_state`` passed to both ``KFold`` splitters.
    source : str
        CSV file to sample from.
    train_out, test_out : str
        Destination CSV files for the small training and test sets.
    """
    X = pd.read_csv(source, header=0)

    # Only the first fold of each splitter is needed, so take it with next()
    # instead of looping and breaking out immediately.
    outer = KFold(n_splits=s1, shuffle=True, random_state=rs)
    _, sample = next(outer.split(X))

    inner = KFold(n_splits=s2, shuffle=True, random_state=rs)
    inner_train, inner_test = next(inner.split(sample))

    # The inner splitter yields positions within `sample`; map them back to
    # row positions in X.
    train_pos = sample[inner_train]
    test_pos = sample[inner_test]

    train_small = X.iloc[train_pos].copy(deep=True)
    train_small['index'] = train_pos
    test_small = X.iloc[test_pos].copy(deep=True)
    test_small['index'] = test_pos

    train_small.to_csv(train_out, index=False)
    test_small.to_csv(test_out, index=False)

    print(f"Created training set containing {len(train_small.index)} instances")
    print(f"Created test set containing {len(test_small.index)} instances")

In [44]:
# Regenerate the small train/test CSVs using split()'s default arguments.
split()

Created training set containing 1600 instances
Created test set containing 400 instances


## Engineering

In [45]:
def conc_vec(train_file, test_file, vec, index='index', suffix='.csv'):
    """Join document-vector columns onto a train CSV and a test CSV.

    ``vec`` is read as a header-less CSV whose columns are named ``vec_1`` ..
    ``vec_N``, where N is the number of columns actually present in the file.
    Each dataset is inner-joined with the vectors on its index, so rows
    without a matching vector row are dropped.

    Parameters
    ----------
    train_file, test_file : str
        CSV files (with a header row) to extend.
    vec : str
        Header-less CSV of vectors; row position is the join key.
    index : str or None
        Column of the datasets used as the join key.  ``None`` keeps the
        default positional index, i.e. joins by row position.
    suffix : str
        Replaces the extension of each input path to form the output path.
        The default ``'.csv'`` overwrites ``*.csv`` inputs in place.

    Notes
    -----
    The outputs are written with ``index=False``, so the join key is not
    stored in the resulting files.
    """
    # Infer the vector width from the file instead of assuming 50 columns.
    doc_vec = pd.read_csv(vec, header=None)
    doc_vec.columns = [f'vec_{i+1}' for i in range(doc_vec.shape[1])]

    sources = (train_file, test_file)

    # Read and merge both datasets before writing anything, so a failure on
    # either input leaves the files on disk untouched.
    merged = [
        pd.merge(pd.read_csv(path, header=0, index_col=index), doc_vec,
                 left_index=True, right_index=True)
        for path in sources
    ]

    # splitext strips whatever extension is present, not a fixed 4 characters.
    outputs = [os.path.splitext(path)[0] + suffix for path in sources]
    for frame, out_path in zip(merged, outputs):
        frame.to_csv(out_path, index=False)

    print(f"Merged {vec} to the dataset, saved to {outputs[0]} & {outputs[1]}")

In [46]:
# Join the doc2vec step vectors onto both small CSVs via their 'index' column.
# With the default suffix the results are written back over the two input files.
conc_vec("data/train_small.csv", "data/test_small.csv", "data/train_steps_doc2vec50.csv")

Merged data/train_steps_doc2vec50.csv to the dataset, saved to data/train_small.csv & data/test_small.csv


## Text preprocessing

In [47]:
#### TODO: extend the verb range to cover all synonyms?
#### e.g. is_verb("peel")

In [48]:
from utilities.preprocessing import extract_verb

In [49]:
# Run the project's verb extraction on both small CSVs (test file first).
# NOTE(review): no output path is passed here, so the files are presumably
# updated in place — confirm in utilities.preprocessing.
extract_verb("data/test_small.csv")
extract_verb("data/train_small.csv")

Processing data/test_small.csv ...
Progress: + - - - -

In [50]:
from utilities.verb_vec import verb_vec

In [51]:
# Build verb vectors for the small train and test CSVs.
# NOTE(review): pca_num=50 is presumably the number of PCA components kept, and
# this step presumably adds the v_vec_* columns used by the SVM section —
# confirm in utilities.verb_vec.
verb_vec("data/train_small.csv", "data/test_small.csv", pca_num=50)

## SVM

In [65]:
from sklearn.svm import SVC, LinearSVC

# train
train = pd.read_csv("data/train_small.csv", header=0)

# Select the verb-vector columns straight from the frame.  Filtering a
# pre-existing `features` list only works if a later cell has already been run,
# and raises NameError on a fresh kernel.
features = [col for col in train.columns if 'v_vec' in col]
print(f"Training SVM on: {features}\n")

# extract features
X_train = train.loc[:, features]
y_train = train['duration_label']

Training SVM on: ['v_vec_1', 'v_vec_2', 'v_vec_3', 'v_vec_4', 'v_vec_5', 'v_vec_6', 'v_vec_7', 'v_vec_8', 'v_vec_9', 'v_vec_10', 'v_vec_11', 'v_vec_12', 'v_vec_13', 'v_vec_14', 'v_vec_15', 'v_vec_16', 'v_vec_17', 'v_vec_18', 'v_vec_19', 'v_vec_20', 'v_vec_21', 'v_vec_22', 'v_vec_23', 'v_vec_24', 'v_vec_25', 'v_vec_26', 'v_vec_27', 'v_vec_28', 'v_vec_29', 'v_vec_30', 'v_vec_31', 'v_vec_32', 'v_vec_33', 'v_vec_34', 'v_vec_35', 'v_vec_36', 'v_vec_37', 'v_vec_38', 'v_vec_39', 'v_vec_40', 'v_vec_41', 'v_vec_42', 'v_vec_43', 'v_vec_44', 'v_vec_45', 'v_vec_46', 'v_vec_47', 'v_vec_48', 'v_vec_49', 'v_vec_50']



In [67]:
C = 1.0  # SVM regularization parameter

# Four SVM variants that share the same C.
models = [
    SVC(kernel='linear', C=C),
    LinearSVC(C=C, max_iter=2000),
    SVC(kernel='rbf', gamma=0.7, C=C),
    SVC(kernel='poly', degree=3, gamma='auto', C=C),
]

# Fit eagerly in this cell.  A generator expression here would defer training
# until something iterates it and could only be consumed once, so re-running a
# downstream cell would silently see no models.
for clf in models:
    clf.fit(X_train, y_train)

In [68]:
# predict
test = pd.read_csv("data/test_small.csv", header=0)
X_test = test.loc[:, features]
y_test = test.loc[:, 'duration_label']

# Materialize the predictions (one array per model, in model order) so the
# evaluation cell can be re-run; a generator would be exhausted after one pass.
predictions = [clf.predict(X_test) for clf in models]

In [69]:
# Class labels used for the per-class report.  Passing them explicitly to the
# metric functions keeps every result at three entries even when a class is
# absent from both y_test and a model's predictions.
LABELS = [1, 2, 3]

for pre in predictions:
    # overall accuracy
    accuracy = accuracy_score(y_test, pre)
    print(f"Accuracy : {(accuracy*100):.2f}%")

    # per-class metrics; zero_division=0 reports 0 for classes never predicted
    precision = precision_score(y_test, pre, labels=LABELS, average=None, zero_division=0)
    recall = recall_score(y_test, pre, labels=LABELS, average=None, zero_division=0)
    f1 = f1_score(y_test, pre, labels=LABELS, average=None, zero_division=0)

    report = pd.DataFrame({'Precision':precision, "Recall":recall, "F_score":f1}, index=LABELS)
    print(report)

    # rows are true labels, columns are predicted labels
    matrix = confusion_matrix(y_test, pre, labels=LABELS)
    matrix = pd.DataFrame(matrix, index=LABELS, columns=LABELS)
    print("\nConfusion matrix:")
    print(matrix, end='\n\n')

Accuracy : 69.00%
   Precision  Recall  F_score
1     0.6736  0.7104   0.6915
2     0.7037  0.6927   0.6982
3     0.7222  0.5200   0.6047

Confusion matrix:
     1    2   3
1  130   50   3
2   57  133   2
3    6    6  13

Accuracy : 70.00%
   Precision  Recall  F_score
1     0.6802  0.7322   0.7053
2     0.7219  0.7031   0.7124
3     0.6875  0.4400   0.5366

Confusion matrix:
     1    2   3
1  134   46   3
2   55  135   2
3    8    6  11

Accuracy : 56.25%
   Precision  Recall  F_score
1     0.8810  0.2022   0.3289
2     0.5251  0.9792   0.6836
3     0.0000  0.0000   0.0000

Confusion matrix:
    1    2  3
1  37  146  0
2   4  188  0
3   1   24  0

Accuracy : 54.50%
   Precision  Recall  F_score
1     0.5076  0.9180   0.6537
2     0.7353  0.2604   0.3846
3     0.0000  0.0000   0.0000

Confusion matrix:
     1   2  3
1  168  14  1
2  142  50  0
3   21   4  0



## 1R model

In [32]:
from models.model_1R import train_1R, predict_1R

In [33]:
# Columns handed to the 1R trainer.
features = ['n_steps', 'n_ingredients', 'n_verbs']

# Train on the small training CSV, then predict on the small test CSV.
# NOTE(review): train_1R / predict_1R are project helpers; predict_1R presumably
# stores its predictions in the test CSV (the evaluation section reports
# '<feature>_1R_prediction' columns) — confirm in models.model_1R.
model = train_1R("data/train_small.csv", features)
predict_1R("data/test_small.csv", model)

## Logistic regression

In [34]:
from models.log_regression import train_log, predict_log

In [35]:
train = pd.read_csv("data/train_small.csv", header=0)

# Columns whose name contains "n_" come first; 'duration_label' also contains
# that substring and is excluded explicitly.  Columns containing "vec" follow.
n_cols = [name for name in train.columns if "n_" in name and name != 'duration_label']
vec_cols = [name for name in train.columns if "vec" in name]
features = n_cols + vec_cols

model = train_log("data/train_small.csv", features)
predict_log("data/test_small.csv", features, model)

### *#placeholder*

## Evaluation

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [37]:
from utilities.evaluation import simple_accuracy

In [38]:
# Print one accuracy line per prediction column found in the small test CSV.
simple_accuracy("data/test_small.csv")

Accuracy for 'n_steps_1R_prediction': 60.85%
Accuracy for 'n_ingredients_1R_prediction': 60.60%
Accuracy for 'n_verbs_1R_prediction': 61.65%
Accuracy for 'log_prediction': 73.75%


In [39]:
from utilities.evaluation import evaluate

In [41]:
# Print accuracy, per-class precision/recall/F-score and a confusion matrix for
# each prediction column found in the small test CSV.
evaluate("data/test_small.csv")


Now analyzing performance of 'n_steps_1R_prediction'

Accuracy = 60.85%
   Precision  Recall  F_score
1     0.6084  0.5544   0.5802
2     0.6086  0.7252   0.6618
3     0.0000  0.0000   0.0000

Confusion matrix:
     1    2  3
1  494  397  0
2  274  723  0
3   44   68  0


Now analyzing performance of 'n_ingredients_1R_prediction'

Accuracy = 60.60%
   Precision  Recall  F_score
1     0.6184  0.5219   0.5660
2     0.5986  0.7492   0.6655
3     0.0000  0.0000   0.0000

Confusion matrix:
     1    2  3
1  465  426  0
2  250  747  0
3   37   75  0


Now analyzing performance of 'n_verbs_1R_prediction'

Accuracy = 61.65%
   Precision  Recall  F_score
1     0.6215  0.5511   0.5842
2     0.6132  0.7442   0.6724
3     0.0000  0.0000   0.0000

Confusion matrix:
     1    2  3
1  491  400  0
2  255  742  0
3   44   68  0


Now analyzing performance of 'log_prediction'

Accuracy = 73.75%
   Precision  Recall  F_score
1     0.7214  0.7643   0.7422
2     0.7538  0.7492   0.7515
3     0.7231  0.419

## Full Scale Tests

In [None]:
# Manual switch: set `run` to a truthy value to regenerate the *_v.csv files
# from the full train and test datasets; with 0 the extraction is skipped.
run = 0
if run:
    extract_verb("data/recipe_train.csv", output="data/recipe_train_v.csv")
    extract_verb("data/recipe_test.csv", output="data/recipe_test_v.csv")

In [None]:
# Write the merged data to data/recipe_*_v_vec.csv, which is what the following
# logistic-regression cell reads.  The default suffix would overwrite the
# *_v.csv inputs instead and leave the *_v_vec.csv files missing.
# NOTE(review): with index=None the test file is joined by row position with the
# *train* doc2vec vectors — confirm whether a test-side vector file should be
# used for data/recipe_test_v.csv.
conc_vec("data/recipe_train_v.csv", "data/recipe_test_v.csv", "data/train_steps_doc2vec50.csv", index=None, suffix="_vec.csv")

In [None]:
train = pd.read_csv("data/recipe_train_v_vec.csv", header=0)

# Columns whose name contains "n_" come first; 'duration_label' also contains
# that substring and is excluded explicitly.  Columns containing "vec" follow.
n_cols = [name for name in train.columns if "n_" in name and name != 'duration_label']
vec_cols = [name for name in train.columns if "vec" in name]
features = n_cols + vec_cols

model = train_log("data/recipe_train_v_vec.csv", features)
predict_log("data/recipe_test_v_vec.csv", features, model)

In [None]:
# Redraw a 7-slot text progress bar in place: each print starts with a carriage
# return and only the final, fully filled bar ends with a newline.
SLOTS = 7
for filled in range(SLOTS + 1):
    marks = ["+"] * filled + ["-"] * (SLOTS - filled)
    print("\rProgress: " + " ".join(marks), end='' if filled < SLOTS else '\n')