In [1]:
#Import Libraries
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, pairwise
import nltk
from nltk.tokenize import word_tokenize
import os
import spacy
import gensim
import logging
from itertools import permutations
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from scipy import spatial
import random
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

In [2]:
#Library Configs
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read train/test data
#file = open("/Users/anishakabir/ERSP/train.json", "r")
#train_data = json.loads(file.read())

#file = open("/Users/anishakabir/ERSP/test.json", "r")
#test_data = json.loads(file.read())

In [4]:
import json
import os

os.chdir('/Users/anishakabir/Downloads')

# Reading the json as a dict
with open('cleaned.json') as json_data:
    wikihow = json.load(json_data)

In [5]:
key_list = list(wikihow.keys())

In [6]:
# printing by key number
# wikihow[key_list[0]]

In [7]:
# number of articles in dataset (removed singleton and null titles)
len(key_list)

202028

In [8]:
num_instructions = 0
for x in range(len(key_list)):
    num_instructions+=len(wikihow[key_list[x]])

In [9]:
# total number of instructions
num_instructions

1374548

In [10]:
# average number instructions per article
num_instructions/len(key_list)

6.803749975250955

In [11]:
def instruction_list(wiki, end: int): # uses article number  to start and end at
    instructions = list()
    for x in range(0, end):
        for s in range(len(wiki[key_list[x]])):
            instructions.append(wiki[key_list[x]][s])
    return instructions

# take a subest of data containing first 1000 articles; used for training Doc2Vec and split for training/testing
instruction_set = instruction_list(wikihow, 100)

In [12]:
# Doc2Vec Model
# reads and preprocesses text (tokenize text into words, remove punctuation, lowercase, etc) for gensim doc2vec model
# lyst is list of each paragraph/step per article (like sample)
# corpus = collection of documents (in this case collection of instruction paragraphs)
def read_corpus(lyst, tokens_only=False):
    for i, line in enumerate(lyst):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = read_corpus(instruction_set)

In [13]:
# training the doc2vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size=84, min_count=2, epochs=40) # not sure about what the vector size should be and other parameters

model.build_vocab(train_corpus)

2020-12-06 23:44:20,859 : INFO : collecting all words and their counts
2020-12-06 23:44:20,860 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-12-06 23:44:20,969 : INFO : collected 7140 word types and 724 unique tags from a corpus of 724 examples and 63640 words
2020-12-06 23:44:20,970 : INFO : Loading a fresh vocabulary
2020-12-06 23:44:20,978 : INFO : effective_min_count=2 retains 3753 unique words (52% of original 7140, drops 3387)
2020-12-06 23:44:20,979 : INFO : effective_min_count=2 leaves 60253 word corpus (94% of original 63640, drops 3387)
2020-12-06 23:44:20,991 : INFO : deleting the raw counts dictionary of 7140 items
2020-12-06 23:44:20,992 : INFO : sample=0.001 downsamples 51 most-common words
2020-12-06 23:44:20,993 : INFO : downsampling leaves estimated 44695 word corpus (74.2% of prior 60253)
2020-12-06 23:44:21,002 : INFO : estimated required memory for 3753 words and 84 dimensions: 4641780 bytes
2020-12-06 23:44:21,003 : INFO : res

In [14]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) # train model on train_corpus

2020-12-06 23:44:21,775 : INFO : training model with 3 workers on 3753 vocabulary and 84 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-12-06 23:44:21,779 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,779 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,780 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:21,781 : INFO : EPOCH - 1 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:21,784 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,785 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,785 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:21,785 : INFO : EPOCH - 2 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:21,789 : INFO : worker thread finished; awaiting 

2020-12-06 23:44:21,851 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,852 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,853 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:21,854 : INFO : EPOCH - 13 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:21,858 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,859 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,860 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:21,861 : INFO : EPOCH - 14 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:21,867 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,868 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,868 : INFO

2020-12-06 23:44:21,935 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,936 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,937 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:21,937 : INFO : EPOCH - 25 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:21,942 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,943 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,944 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:21,944 : INFO : EPOCH - 26 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:21,950 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:21,951 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:21,951 : INFO

2020-12-06 23:44:22,025 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:22,026 : INFO : EPOCH - 37 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:22,031 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:22,032 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:22,033 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:22,033 : INFO : EPOCH - 38 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s
2020-12-06 23:44:22,038 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-06 23:44:22,039 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-06 23:44:22,039 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-06 23:44:22,040 : INFO : EPOCH - 39 : training on 0 raw words (0 effective words) took 0.0s, 0 effective words/s


In [15]:
def vectorize(paragraph):
    return model.infer_vector(gensim.utils.simple_preprocess(paragraph))

In [16]:
# Add Rank of Each Step in List (starting with zero)
def addRank(lyst):
    ranked = list()
    for index, element in enumerate(lyst):
        ranked.append((element, index))
    return ranked

In [17]:
def inOrder(lyst):
    ordered = list()
    for element in lyst:
        element = list(element)
        element.append(element[0][1] < element[1][1])
        ordered.append(tuple(element))
    return ordered

In [18]:
def vectorizeList(lyst):
    vector_list = list()
    for k in lyst:
        vector_list.append(vectorize(k))
    return vector_list

In [19]:
# Permute list of steps to make all possible pairs of steps
def makePairs(lyst):
    perms = list(permutations(addRank(lyst), 2))
    return inOrder(perms)

In [20]:
def makePairsList(wiki, start: int, end: int):
    pairslist = list()
    for x in range(start, end):
        paraAndVector = list()
        for k in wikihow[key_list[x]]:
            element = list()
            element.append(k)
            element.append(vectorize(k))
            paraAndVector.append(tuple(element))
        pairslist+=makePairs(paraAndVector)
        paraAndVector.clear()
    return pairslist

In [21]:
#paraAndVector = list()
#for k in wikihow[key_list[0]]:
#    element = list()
#    element.append(k)
#    element.append(vectorize(k))
#    paraAndVector.append(tuple(element))

In [22]:
#len(paraAndVector)

In [23]:
#perms = list(permutations(addRank(paraAndVector), 2))
#perms

In [24]:
#makePairsList(wikihow, 0, 1)

In [25]:
# create training and test sets (1000 and 200 articles each)
train_pairs = makePairsList(wikihow, 0, 80)
test_pairs = makePairsList(wikihow, 80, 100)

In [26]:
random.shuffle(train_pairs)
random.shuffle(test_pairs)

In [27]:
def splitXY(lyst):
    x_data = list()
    y_data = list()
    for element in lyst:
        x_data.append(element[:2])
        y_data.append(element[2])
    return (x_data, y_data)
x, y_train = splitXY(train_pairs)
z, y_test = splitXY(test_pairs)

In [28]:
def vectorizePair(lyst):
    data = list()
    for element in lyst:
        data.append(list(element[0][0][1]) + list(element[1][0][1]))
    return np.array(data)
x_train = vectorizePair(x)
x_test = vectorizePair(z)

In [29]:
from sklearn.model_selection import GridSearchCV
param_grid = [
 {'gamma':['scale'],'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [1, 2, 3, 4], 'C':[1, 5, 20, 50, 70, 100]}
]

In [30]:
from sklearn.svm import SVC
test_svm = SVC()

In [31]:
grid_search = GridSearchCV(test_svm, param_grid, cv=5, scoring='accuracy', return_train_score=True)

In [32]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 5, 20, 50, 70, 100], 'degree': [1, 2, 3, 4],
                          'gamma': ['scale'],
                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [33]:
grid_search.best_params_

{'C': 5, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}

In [34]:
grid_search.best_estimator_

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=4, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [98]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=5, coef0=1, C=5))
])

In [99]:
poly_kernel_svm_clf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 SVC(C=5, cache_size=200, class_weight=None, coef0=1,
                     decision_function_shape='ovr', degree=5,
                     gamma='auto_deprecated', kernel='poly', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [100]:
from sklearn.model_selection import cross_val_predict
y_SVM_pred = cross_val_predict(poly_kernel_svm_clf, x_train, y_train, cv=5)

# confusion matrix for testing
from sklearn.metrics import confusion_matrix 
confusion_matrix(y_train, y_SVM_pred)

array([[2608,  459],
       [ 451, 2616]])

In [101]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_train, y_SVM_pred) 

0.8507317073170731

In [102]:
recall_score(y_train, y_SVM_pred)

0.8529507662210629

In [103]:
y_SVM_test = poly_kernel_svm_clf.predict(x_test)

In [104]:
confusion_matrix(y_test, y_SVM_test)

array([[287, 241],
       [241, 287]])

In [105]:
precision_score(y_test, y_SVM_test)

0.5435606060606061

In [106]:
recall_score(y_test, y_SVM_test)

0.5435606060606061