In [8]:
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, pairwise
import nltk
from nltk.tokenize import word_tokenize
import os
import spacy
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from itertools import permutations
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from scipy import spatial
import random
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load spaCy's small English pipeline.
# NOTE(review): neither `tokenizer` nor `nlp` is referenced in the cells visible here — confirm they are still needed.
nlp = spacy.load('en_core_web_sm')

AttributeError: module 'tensorflow_core.keras.activations' has no attribute 'swish'

In [274]:
# Read the WikiHow CSV and preview its first rows.
# NOTE(review): the path is hard-coded to a local home directory — consider a configurable data directory.
original = pd.read_csv('~/Downloads/wikihowSep.csv')
original.head()

KeyboardInterrupt: 

In [242]:
# drop rows where at least 1 element is missing
df = original.dropna()
# Use only a subset of the data. The held-out slice has to be taken BEFORE
# the training frame is truncated: slicing rows 1000-1200 out of a frame that
# has already been cut down to 1000 rows always yields an empty DataFrame.
test_df = df[1000:1200]
df = df[:1000]

In [243]:
#remove singleton titles
# duplicated(keep=False) flags every row whose title occurs more than once,
# so one vectorized mask replaces re-scanning and re-filtering the whole
# frame once per row. Row order and index are unchanged.
df = df[df.duplicated(subset='title', keep=False)]
#print(df['title'].value_counts())

In [244]:
#remove extra spaces, newlines, and incorrect ending punctuation
def cleanText(string) -> str:
    """Normalize whitespace and trim semicolons from both ends of a string.

    Every run of whitespace (spaces, tabs, newlines) collapses into a single
    space and leading/trailing whitespace disappears; any ``;`` characters at
    either end of the collapsed text are then removed.
    """
    words = string.split()
    collapsed = " ".join(words)
    return collapsed.strip(";")

In [245]:
def process_instructions(dataframe) -> dict:
    """Group cleaned step texts by article title.

    Args:
        dataframe: DataFrame with string columns ``title``, ``headline`` and
            ``text``; each row is treated as one step of an article.

    Returns:
        dict mapping each title to the list of its steps in row order. Every
        step is the cleaned headline and the cleaned text joined by a space.
    """
    # dictionary of title (article) to text (list of steps)
    wikihow = dict()

    # Iterate over the frame that was passed in. Reading the notebook-level
    # `df` here made every call return the same articles regardless of the
    # argument.
    for _, row in dataframe.iterrows():
        title = row['title']
        text = cleanText(row['headline']) + " " + cleanText(row['text'])
        if title and text:
            wikihow.setdefault(title, []).append(text)

    return wikihow

In [246]:
# pairs of (instruction, rank)
def get_instruction_rank(database: dict) -> list:
    """Pair every step with its relative position inside its article.

    Args:
        database: dict mapping an article to its list of steps.

    Returns:
        list of ``(step, rank)`` tuples, where ``rank`` is the 1-based
        position of the step divided by the number of steps in its article
        (so the last step of each article has rank 1.0).
    """
    return [
        (step, position / len(steps))
        for steps in database.values()
        for position, step in enumerate(steps, start=1)
    ]

In [247]:
# convert the two instructions of each pair to doc2vec vectors
def convert_vector(instruction_rank: list, num_examples: int) -> list:
    """Embed both instructions of each pair with the notebook-level ``model``.

    Args:
        instruction_rank: items indexed as
            ``((text1, rank1), (text2, rank2), flag)``.
        num_examples: only the first ``num_examples`` items are converted.

    Returns:
        list of ``(vector1, vector2, flag)`` tuples, one per converted item.
        The ranks are read but not included in the result.
    """
    embedded = []

    for item in instruction_rank[:num_examples]:
        first, second, flag = item[0], item[1], item[2]
        first_text, _first_rank = first[0], first[1]
        second_text, _second_rank = second[0], second[1]
        first_vec = model.infer_vector(gensim.utils.simple_preprocess(first_text))
        second_vec = model.infer_vector(gensim.utils.simple_preprocess(second_text))
        embedded.append((first_vec, second_vec, flag))

    return embedded

In [248]:
# title -> list of cleaned steps for the training subset
wikihow = process_instructions(dataframe=df)
# NOTE(review): built from the same `df` as `wikihow`, not from `test_df` — confirm this is intended.
processed_test = process_instructions(dataframe=df)
# number of rows in df (one row per step, not per article); not displayed because it is not the cell's last expression
len(df)
# steps of one article, reused as a running example in later cells
sample = wikihow['How to Sell Fine Art Online']

In [249]:
def addRank(lyst):
    """Attach each element's zero-based position: ``[(element, index), ...]``."""
    return [(item, position) for position, item in enumerate(lyst)]

In [250]:
def inOrder(lyst):
    """Append an "is in order" flag to every pair of ranked elements.

    Each item holds two ``(element, rank)`` entries; the result repeats the
    item as a tuple with one extra trailing boolean that is True when the
    first entry's rank is strictly lower than the second's.
    """
    flagged = []
    for pair in lyst:
        members = list(pair)
        flagged.append((*members, members[0][1] < members[1][1]))
    return flagged

In [251]:
def makePairs(lyst):
    """Build every ordered pair of distinct ranked steps, flagged by order.

    The steps are ranked with ``addRank``, all 2-permutations of the ranked
    steps are generated, and ``inOrder`` appends the in-order flag to each.
    """
    ranked_steps = addRank(lyst)
    ordered_pairs = list(permutations(ranked_steps, 2))
    return inOrder(ordered_pairs)

In [263]:
def makePairsList(wiki):
    """Concatenate the ordered step pairs of every article in ``wiki``.

    Args:
        wiki: dict mapping an article title to its list of steps.

    Returns:
        One flat list holding the ``makePairs`` output of each article, in
        the dict's iteration order.
    """
    pairslist = list()
    # Read the steps from the argument itself. Indexing the notebook-level
    # `wikihow` ignored the contents of `wiki` and raised KeyError for any
    # title that exists only in `wiki`.
    for steps in wiki.values():
        pairslist.extend(makePairs(steps))
    return pairslist
# all ordered step pairs for the articles in `wikihow`
pairs = makePairsList(wikihow)
# NOTE(review): `processed_test` is built from the same `df` as `wikihow`, so these pairs repeat the training pairs — confirm.
test_pairs = makePairsList(processed_test)

In [226]:
makePairs(sample)

[(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
   0),
  ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show

In [253]:
# reads and preprocesses text (tokenize text into words, remove punctuation, lowercase, etc) for gensim doc2vec model
# lyst is list of each paragraph/step per article (like sample)
# corpus = collection of documents (in this case collection of instruction paragraphs)
def read_corpus(lyst, tokens_only=False):
    """Lazily tokenize each text in ``lyst`` for the doc2vec model.

    Args:
        lyst: iterable of text strings, one document each.
        tokens_only: when True, yield bare token lists; otherwise yield
            ``TaggedDocument`` objects tagged with the document's position.

    Yields:
        The ``gensim.utils.simple_preprocess`` tokens of each text, wrapped
        in a ``TaggedDocument`` unless ``tokens_only`` is set.
    """
    for position, text in enumerate(lyst):
        words = gensim.utils.simple_preprocess(text)
        # Training data carries a tag; plain token lists are for inference.
        yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [position])

# Tagged training corpus: one document per entry of df['text'], tagged with its position.
# NOTE(review): only the 'text' column is used here, while process_instructions also prepends 'headline' to each step — confirm.
train_corpus = list(read_corpus(df['text']))
#TODO: replace sample with list of all paragraphs in training set
# always set tokens_only=True for test corpus:
#test_corpus = list(read_corpus(TEST_FILE, tokens_only=True)) #TODO: uncomment and replace TEST_FILE with test set 

In [254]:
# training the doc2vec model
# NOTE(review): no seed is passed, so inferred vectors may differ between runs — confirm whether reproducibility matters here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=84, min_count=2, epochs=40) # not sure about what the vector size should be and other parameters

# build the vocabulary from the tagged training corpus before training
model.build_vocab(train_corpus)

2020-11-24 17:22:26,712 : INFO : collecting all words and their counts
2020-11-24 17:22:26,724 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-11-24 17:22:26,788 : INFO : collected 8324 word types and 993 unique tags from a corpus of 993 examples and 83402 words
2020-11-24 17:22:26,789 : INFO : Loading a fresh vocabulary
2020-11-24 17:22:26,821 : INFO : effective_min_count=2 retains 4471 unique words (53% of original 8324, drops 3853)
2020-11-24 17:22:26,822 : INFO : effective_min_count=2 leaves 79549 word corpus (95% of original 83402, drops 3853)
2020-11-24 17:22:26,910 : INFO : deleting the raw counts dictionary of 8324 items
2020-11-24 17:22:26,913 : INFO : sample=0.001 downsamples 46 most-common words
2020-11-24 17:22:26,946 : INFO : downsampling leaves estimated 59172 word corpus (74.4% of prior 79549)
2020-11-24 17:22:26,982 : INFO : estimated required memory for 4471 words and 84 dimensions: 5573660 bytes
2020-11-24 17:22:26,987 : INFO : res

In [255]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) # train model on train_corpus

2020-11-24 17:22:29,143 : INFO : training model with 3 workers on 4471 vocabulary and 84 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-11-24 17:22:29,391 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-24 17:22:29,405 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-24 17:22:29,416 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-24 17:22:29,425 : INFO : EPOCH - 1 : training on 83402 raw words (60091 effective words) took 0.3s, 236045 effective words/s
2020-11-24 17:22:29,621 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-24 17:22:29,648 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-24 17:22:29,675 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-24 17:22:29,676 : INFO : EPOCH - 2 : training on 83402 raw words (60093 effective words) took 0.2s, 254928 effective words/s
2020-11-24 17:22:29,910 : INFO : worker 

2020-11-24 17:22:34,732 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-24 17:22:34,751 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-24 17:22:34,775 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-24 17:22:34,777 : INFO : EPOCH - 21 : training on 83402 raw words (60076 effective words) took 0.3s, 205706 effective words/s
2020-11-24 17:22:35,069 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-24 17:22:35,099 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-24 17:22:35,102 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-24 17:22:35,103 : INFO : EPOCH - 22 : training on 83402 raw words (60146 effective words) took 0.3s, 189180 effective words/s
2020-11-24 17:22:35,340 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-24 17:22:35,348 : INFO : worker thread finished; awaiting finish of 1 more threads
2020

In [257]:
tokens = gensim.utils.simple_preprocess(sample[0]) #tokens for first step in 'How to Sell Fine Art Online'
#tokens

In [258]:
# can use the trained model to infer a vector for any piece of text 
# by passing a list of words to the model.infer_vector function
vector = model.infer_vector(tokens)
#print(vector)

In [259]:
x = model.infer_vector(["photos"])
y = model.infer_vector(["photography"])
result = 1 - spatial.distance.cosine(x, y)
print(result)

0.9140791296958923


In [260]:
vector_rank = convert_vector(pairs, num_examples=100)
#vector_rank[0]
# TODO: assess model

In [265]:
# Shuffle the training pairs in place.
# NOTE(review): no random seed is set in the visible cells, so the order changes on every run.
random.shuffle(pairs)
def splitXY(lyst):
    """Separate labelled items into inputs and labels.

    Returns:
        ``(x_data, y_data)`` where ``x_data`` holds the first two entries of
        every item and ``y_data`` holds the third entry of every item.
    """
    items = list(lyst)
    inputs = [item[:2] for item in items]
    labels = [item[2] for item in items]
    return (inputs, labels)
# x / z: lists of ((text, rank), (text, rank)) pairs; y_train / y_test: the in-order flags
x, y_train = splitXY(pairs)
z, y_test = splitXY(test_pairs)

def vectorize(paragraph):
    """Infer a doc2vec vector for one paragraph with the notebook-level ``model``."""
    words = gensim.utils.simple_preprocess(paragraph)
    return model.infer_vector(words)

def vectorizePair(lyst):
    """Turn each pair of ranked texts into one concatenated feature row.

    For every item, the text of the first entry and the text of the second
    entry are vectorized with ``vectorize`` and joined end to end.

    Returns:
        numpy array with one row per item.
    """
    rows = [
        list(vectorize(pair[0][0])) + list(vectorize(pair[1][0]))
        for pair in lyst
    ]
    return np.array(rows)
# feature matrices: each row is the two inferred vectors of a pair, concatenated
x_train = vectorizePair(x)
x_test = vectorizePair(z)
#print(vectorizePair(x[:1]))

In [266]:
# Random forest over the concatenated pair vectors; the labels are the in-order flags.
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(x_train, y_train) 
# NOTE(review): x_test derives from `processed_test`, which is built from the same `df` as the training data, so this is not a held-out evaluation — confirm.
y_pred_rf = rnd_clf.predict(x_test)

In [267]:
y_pred_rf

array([False, False, False, ..., False,  True, False])

In [268]:
confusion_matrix(y_test, y_pred_rf)

array([[3378, 1394],
       [1294, 3478]])

In [270]:
len(x_test)

9544

In [271]:
len(y_test)

9544

In [272]:
len(x_train)

9544