In [4]:
import json
import logging
import os
import pickle
from itertools import permutations

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from scipy import spatial
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
nlp = spacy.load('en_core_web_sm')

In [5]:
# Path to the WikiHow "sep" CSV. Defaults to the original location; set the
# WIKIHOW_CSV environment variable to load the file from somewhere else.
WIKIHOW_CSV = os.environ.get('WIKIHOW_CSV', '~/Downloads/wikihowSep.csv')
original = pd.read_csv(WIKIHOW_CSV)
original.head()

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [6]:
# drop rows where at least 1 element is missing
# (cleanText later calls .split() on 'headline' and 'text', which requires real strings)
df = original.dropna()

In [7]:
#use only subset of data
# Keeps only the first 1000 rows (by position) of the cleaned frame.
# NOTE(review): a row-count cut-off can split an article and leave its last steps out — confirm this is acceptable.
df = df[:1000]


In [8]:
#remove singleton titles
# Keep only rows whose title occurs more than once: an article with a single
# step cannot produce any ordered pair of steps. duplicated(keep=False) flags
# every occurrence of a repeated title, so one boolean mask does the filtering
# without rescanning the frame for each row.
df = df[df['title'].duplicated(keep=False)]
#print(df['title'].value_counts())

In [34]:
def cleanText(string) -> str:
    """Normalize whitespace and trim semicolons.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, leading/trailing whitespace is dropped, and any ';'
    characters at either end of the result are stripped.
    """
    words = string.split()
    single_spaced = " ".join(words)
    return single_spaced.strip(";")

In [35]:
def process_instructions(dataframe) -> dict:
    """Group cleaned step texts by article title.

    Args:
        dataframe: frame with 'title', 'headline' and 'text' columns; one row
            per step.

    Returns:
        dict mapping each title to the list of its steps, in row order. Each
        step is the cleaned headline and the cleaned text joined by a space.
    """
    # dictionary of title (article) to text (list of steps)
    wikihow = dict()

    # Iterate the frame that was passed in, not the notebook-level `df`, so the
    # function works on any subset (e.g. a held-out test split).
    for idx, row in dataframe.iterrows():
        title = row['title']
        text = cleanText(row['headline']) + " " + cleanText(row['text'])
        if title and text:
            wikihow.setdefault(title, []).append(text)

    return wikihow

In [36]:
def get_instruction_rank(database: dict) -> list:
    """Return (instruction, rank) pairs for every step of every article.

    The rank is the step's 1-based position divided by the number of steps in
    its article, so the last step of each article has rank 1.0. Articles are
    visited in the dictionary's iteration order.
    """
    return [
        (step, position / len(steps))
        for steps in database.values()
        for position, step in enumerate(steps, start=1)
    ]

In [61]:
def read_corpus(lyst, tokens_only=False):
    """Lazily preprocess paragraphs for the gensim doc2vec model.

    lyst is an iterable of paragraphs/steps (the corpus = collection of
    documents, here instruction paragraphs). Each paragraph is run through
    gensim.utils.simple_preprocess (tokenize text into words, remove
    punctuation, lowercase, etc).

    Yields the bare token list when tokens_only is True; otherwise a
    TaggedDocument whose single tag is the paragraph's position in lyst.
    """
    for tag, paragraph in enumerate(lyst):
        words = gensim.utils.simple_preprocess(paragraph)
        if not tokens_only:
            # training data needs a tag per document
            yield gensim.models.doc2vec.TaggedDocument(words, [tag])
        else:
            yield words

# Build the doc2vec training corpus: one TaggedDocument per row of the 'text' column.
# NOTE(review): only 'text' is used here, while process_instructions/convert_vector work on
# headline + text — confirm whether headlines should be part of the training corpus.
train_corpus = list(read_corpus(df['text'])) #TODO: replace sample with list of all paragraphs in training set
# always set tokens_only=True for test corpus:
#test_corpus = list(read_corpus(TEST_FILE, tokens_only=True)) #TODO: uncomment and replace TEST_FILE with test set 

In [62]:
# Average number of tokens per training document (element 0 of each
# TaggedDocument is its token list).
numitems = len(train_corpus)
totallen = sum(len(doc[0]) for doc in train_corpus)
mean = totallen/numitems
print(mean)


83.98992950654582


In [63]:
# training the doc2vec model
# vector_size=84 appears to follow the mean tokens-per-paragraph printed above (~83.99) — TODO confirm
# that this is the intended heuristic. min_count=2 makes the vocabulary skip words seen only once;
# epochs=40 sets model.epochs, which the later train() call reads.
model = gensim.models.doc2vec.Doc2Vec(vector_size=84, min_count=2, epochs=40) # not sure about what the vector size should be and other parameters

model.build_vocab(train_corpus)

2020-11-17 15:37:24,648 : INFO : collecting all words and their counts
2020-11-17 15:37:24,648 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-11-17 15:37:24,686 : INFO : collected 8324 word types and 993 unique tags from a corpus of 993 examples and 83402 words
2020-11-17 15:37:24,686 : INFO : Loading a fresh vocabulary
2020-11-17 15:37:24,701 : INFO : effective_min_count=2 retains 4471 unique words (53% of original 8324, drops 3853)
2020-11-17 15:37:24,701 : INFO : effective_min_count=2 leaves 79549 word corpus (95% of original 83402, drops 3853)
2020-11-17 15:37:24,732 : INFO : deleting the raw counts dictionary of 8324 items
2020-11-17 15:37:24,732 : INFO : sample=0.001 downsamples 46 most-common words
2020-11-17 15:37:24,732 : INFO : downsampling leaves estimated 59172 word corpus (74.4% of prior 79549)
2020-11-17 15:37:24,764 : INFO : estimated required memory for 4471 words and 84 dimensions: 5573660 bytes
2020-11-17 15:37:24,772 : INFO : res

In [25]:
# Train on the tagged corpus for model.epochs passes.
# NOTE(review): this cell's execution count (In [25]) is lower than that of the cell that creates
# `model` (In [63]), so in the saved session the model was rebuilt after it was last trained —
# re-run this cell after build_vocab before inferring any vectors.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) # train model on train_corpus

2020-11-17 15:26:34,182 : INFO : training model with 3 workers on 4471 vocabulary and 84 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-11-17 15:26:34,314 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:26:34,314 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:26:34,330 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:26:34,330 : INFO : EPOCH - 1 : training on 83402 raw words (60060 effective words) took 0.1s, 475220 effective words/s
2020-11-17 15:26:34,416 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:26:34,432 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:26:34,432 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:26:34,432 : INFO : EPOCH - 2 : training on 83402 raw words (60045 effective words) took 0.1s, 575023 effective words/s
2020-11-17 15:26:34,530 : INFO : worker 

2020-11-17 15:26:36,472 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:26:36,488 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:26:36,488 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:26:36,488 : INFO : EPOCH - 21 : training on 83402 raw words (60195 effective words) took 0.1s, 587421 effective words/s
2020-11-17 15:26:36,588 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:26:36,604 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:26:36,604 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:26:36,604 : INFO : EPOCH - 22 : training on 83402 raw words (60132 effective words) took 0.1s, 554223 effective words/s
2020-11-17 15:26:36,704 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:26:36,704 : INFO : worker thread finished; awaiting finish of 1 more threads
2020

In [51]:
# `sample` is built here so that this cell does not depend on a cell further
# down the notebook having been executed first (it would otherwise raise
# NameError on a fresh Restart & Run All).
sample = process_instructions(dataframe=df)['How to Sell Fine Art Online']
tokens = gensim.utils.simple_preprocess(sample[0]) #tokens for first step in 'How to Sell Fine Art Online'
tokens

['sell',
 'yourself',
 'first',
 'before',
 'doing',
 'anything',
 'else',
 'stop',
 'and',
 'sum',
 'up',
 'yourself',
 'as',
 'an',
 'artist',
 'now',
 'think',
 'about',
 'how',
 'to',
 'translate',
 'that',
 'to',
 'an',
 'online',
 'profile',
 'be',
 'it',
 'the',
 'few',
 'words',
 'twitter',
 'allows',
 'you',
 'or',
 'an',
 'entire',
 'page',
 'of',
 'indulgence',
 'that',
 'your',
 'own',
 'website',
 'would',
 'allow',
 'you',
 'bring',
 'out',
 'the',
 'most',
 'salient',
 'features',
 'of',
 'your',
 'creativity',
 'your',
 'experience',
 'your',
 'passion',
 'and',
 'your',
 'reasons',
 'for',
 'painting',
 'make',
 'it',
 'clear',
 'to',
 'readers',
 'why',
 'you',
 'are',
 'an',
 'artist',
 'who',
 'loves',
 'art',
 'produces',
 'high',
 'quality',
 'art',
 'and',
 'is',
 'true',
 'champion',
 'of',
 'art',
 'if',
 'you',
 're',
 'not',
 'great',
 'with',
 'words',
 'find',
 'friend',
 'who',
 'can',
 'help',
 'you',
 'with',
 'this',
 'really',
 'important',
 'aspect',


In [52]:
# can use the trained model to infer a vector for any piece of text 
# by passing a list of words to the model.infer_vector function
# NOTE(review): every component visible in the saved output below is smaller than 6e-3 in
# magnitude, which may indicate that `model` had not been trained when this cell ran —
# confirm after a clean Restart & Run All.
vector = model.infer_vector(tokens)
print(vector)

[-3.1433357e-03 -4.7073881e-03  4.6208026e-03 -7.2790870e-05
 -2.9136429e-03  2.0583298e-04  5.6959023e-03 -2.5576109e-03
  3.5939838e-03 -2.4427287e-03 -2.1749663e-03 -5.8558472e-03
 -5.0701238e-03 -3.1627819e-03  2.8593242e-03 -4.8847045e-03
 -3.9044921e-03 -1.4178110e-03 -1.9879292e-03 -2.1801342e-03
  6.0771385e-05  4.6532359e-03  4.3920637e-03  5.3771087e-03
 -3.8209963e-03  3.1151304e-03  7.3473441e-04 -3.2867158e-03
 -1.0628357e-03 -5.3967834e-03  3.4190123e-03 -4.7679655e-03
  1.7299142e-03  4.0994951e-04 -3.9944854e-03  3.6315215e-03
  9.4086502e-04  5.3501963e-03  2.6071265e-03  1.3269857e-03
 -4.3390216e-03  3.3936820e-03 -3.3623541e-03  1.5060682e-03
  4.8528854e-03  2.1306989e-03  2.9644335e-03  3.3837031e-03
 -3.8263828e-03  1.5618664e-03 -1.4599862e-03 -5.8795069e-03
  1.0732017e-03  3.9344672e-03 -2.2407770e-03 -3.4176833e-03
  4.0157018e-03  1.4611103e-03  4.6137106e-03  3.1327084e-03
 -3.8796896e-03  1.1392038e-03  4.2996235e-04 -2.2394524e-03
 -2.3123645e-03 -2.21495

In [67]:
# TODO: assess model
# Quick sanity check: infer a vector for two related words and compare them.
# spatial.distance.cosine returns 1 - cosine similarity, so 0 means same
# direction, 1 means orthogonal and 2 means opposite; related words should
# score well below 1.
# `spatial` (scipy) is imported in the notebook's top import cell.
# NOTE(review): infer_vector may not be deterministic between runs, and the
# result is only meaningful once `model` has been trained — confirm the
# training cell ran after the model was last rebuilt.
x = model.infer_vector(['photos'])
y = model.infer_vector(['photography'])
result = spatial.distance.cosine(x, y)
print(result)

1.0956535264849663


In [83]:
# convert pairs of text instructions to pairs of doc2vec vectors
def convert_vector(instruction_rank: list, num_examples: int = None) -> list:
    """Infer doc2vec vectors for pairs of instructions.

    Args:
        instruction_rank: list of ((instruction1, rank1), (instruction2, rank2), tf)
            tuples, e.g. the output of makePairsList.
        num_examples: only the first num_examples pairs are converted; the
            default None converts every pair.

    Returns:
        list of (vector1, vector2, tf) tuples, one per converted pair, where
        each vector is inferred by the notebook-level doc2vec `model` and tf
        is passed through unchanged.
    """
    vector_rank = list()

    for step in instruction_rank[:num_examples]:
        # the ranks are already encoded in tf, so only the texts are needed here
        instruction1, instruction2, tf = step[0][0], step[1][0], step[2]
        doc1 = model.infer_vector(gensim.utils.simple_preprocess(instruction1))
        doc2 = model.infer_vector(gensim.utils.simple_preprocess(instruction2))
        vector_rank.append((doc1, doc2, tf))

    return vector_rank

In [69]:
# Group the cleaned steps by article: {title: [headline + " " + text, ...]} in row order.
wikihow = process_instructions(dataframe=df)

In [70]:
# Steps of a single article, kept as a small example for manual inspection.
sample = wikihow['How to Sell Fine Art Online']

In [71]:
def addRank(lyst):
    """Pair every element of lyst with its 0-based position: [(element, index), ...]."""
    return [(item, position) for position, item in enumerate(lyst)]

In [72]:
def inOrder(lyst):
    """Append an "is in order" flag to every pair of ranked items.

    Each element of lyst holds (item, rank) tuples in its first two slots; the
    returned tuple is the element's members followed by True when the first
    rank is strictly smaller than the second, else False.
    """
    def _with_flag(pair):
        members = list(pair)
        return tuple(members + [members[0][1] < members[1][1]])

    return [_with_flag(pair) for pair in lyst]

In [73]:
def makePairs(lyst):
    """Build every ordered pair of distinct steps in lyst, flagged by order.

    Steps are first ranked by position (addRank), then all 2-permutations of
    the ranked steps are generated and each one is tagged by inOrder.
    """
    ranked_steps = addRank(lyst)
    ordered_pairs = list(permutations(ranked_steps, 2))
    return inOrder(ordered_pairs)

In [74]:
def makePairsList(wiki):
    """Concatenate the flagged step pairs of every article in wiki.

    Args:
        wiki: dict mapping an article title to its list of steps.

    Returns:
        One flat list with the makePairs output of each article, in the
        dictionary's iteration order.
    """
    pairslist = list()
    # Read the steps from the argument itself rather than from the
    # notebook-level `wikihow`, so any title -> steps mapping can be passed in.
    for steps in wiki.values():
        pairslist.extend(makePairs(steps))
    return pairslist
# flagged step pairs for every article in the dataset
pairs = makePairsList(wikihow)

In [75]:
# number of rows (one per step) left after filtering — this is not the number of articles;
# the article count would be the number of distinct titles
len(df)

993

In [76]:
# (instruction, relative position) pairs for every step of every article.
# NOTE(review): no later cell visible here reads this variable — confirm it is still needed.
instruction_rank = get_instruction_rank(database=wikihow)

In [86]:
# work with the first 1000 step pairs to start off
# TODO: make sure all steps are included per article (a fixed cut-off can stop mid-article)
vector_rank = convert_vector(pairs, num_examples=1000)

[(array([-3.1433357e-03, -4.7073881e-03,  4.6208026e-03, -7.2790870e-05,
         -2.9136429e-03,  2.0583298e-04,  5.6959023e-03, -2.5576109e-03,
          3.5939838e-03, -2.4427287e-03, -2.1749663e-03, -5.8558472e-03,
         -5.0701238e-03, -3.1627819e-03,  2.8593242e-03, -4.8847045e-03,
         -3.9044921e-03, -1.4178110e-03, -1.9879292e-03, -2.1801342e-03,
          6.0771385e-05,  4.6532359e-03,  4.3920637e-03,  5.3771087e-03,
         -3.8209963e-03,  3.1151304e-03,  7.3473441e-04, -3.2867158e-03,
         -1.0628357e-03, -5.3967834e-03,  3.4190123e-03, -4.7679655e-03,
          1.7299142e-03,  4.0994951e-04, -3.9944854e-03,  3.6315215e-03,
          9.4086502e-04,  5.3501963e-03,  2.6071265e-03,  1.3269857e-03,
         -4.3390216e-03,  3.3936820e-03, -3.3623541e-03,  1.5060682e-03,
          4.8528854e-03,  2.1306989e-03,  2.9644335e-03,  3.3837031e-03,
         -3.8263828e-03,  1.5618664e-03, -1.4599862e-03, -5.8795069e-03,
          1.0732017e-03,  3.9344672e-03, -2.2407770

In [25]:
#print(f"Word 'artist' appeared {model.wv.get_vecattr('artist', 'count')} times in the training corpus.")