In [57]:
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, pairwise
import nltk
from nltk.tokenize import word_tokenize
import os
import spacy
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from itertools import permutations
nlp = spacy.load('en_core_web_sm')

In [58]:
# Load the raw WikiHow CSV. Judging by the preview below, each row holds one
# step of an article: its headline, its body text and the article title.
# NOTE(review): the path points into the user's Downloads folder; consider a
# configurable data directory so the notebook runs on other machines.
original = pd.read_csv('~/Downloads/wikihowSep.csv')
original.head()

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [59]:
# Drop every row that has a missing value in any column.
# The result is bound to a new name, so `original` keeps the raw data.
df = original.dropna()

In [60]:
# Keep only the first 1000 rows (positional slice) so the experiment stays small.
# NOTE(review): 1000 is a magic number; a named constant would make it tunable.
df = df[:1000]

In [61]:
# Remove singleton titles: keep only the rows whose title occurs more than once.
# duplicated(keep=False) marks every member of a repeated title in one pass,
# which replaces the earlier loop that rescanned the whole column and
# re-filtered the frame once per row (quadratic in the number of rows).
# Titles are assumed non-null here because dropna() ran in an earlier cell.
df = df[df['title'].duplicated(keep=False)]
#print(df['title'].value_counts())

In [62]:
#remove extra spaces, newlines, and incorrect ending punctuation
def cleanText(string) -> str:
    """Normalise whitespace in ``string`` and trim semicolons from its ends.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is removed; after that, any ``;``
    characters at either end of the result are stripped.
    """
    words = string.split()
    collapsed = " ".join(words)
    return collapsed.strip(";")

In [63]:
def process_instructions(dataframe) -> dict:
    """Group the instruction steps of ``dataframe`` by article title.

    Args:
        dataframe: frame with 'title', 'headline' and 'text' columns,
            one row per instruction step.

    Returns:
        dict mapping each title to the list of its steps in row order, where
        a step is the cleaned headline, a space, and the cleaned text.
    """
    # dictionary of title (article) to text (list of steps)
    wikihow = dict()

    # Iterate over the frame that was passed in. The earlier version ignored
    # the argument and silently read the notebook-level `df` instead.
    for _, row in dataframe.iterrows():
        title = row['title']
        text = cleanText(row['headline']) + " " + cleanText(row['text'])
        if title and text:
            # setdefault creates the list on first sight of a title
            wikihow.setdefault(title, []).append(text)

    return wikihow

In [64]:
# pairs of (instruction, rank)
def get_instruction_rank(database: dict) -> list:
    """Pair every step with its relative position inside its article.

    For an article with ``n`` steps, the step at 1-based position ``p`` gets
    rank ``p / n``, so the last step of every article has rank 1.0. Articles
    are visited in the dictionary's iteration order.
    """
    return [
        (step, position / len(steps))
        for steps in database.values()
        for position, step in enumerate(steps, start=1)
    ]

In [65]:
# convert pairs of text instructions to pairs of vectors with the gensim Doc2Vec model
def convert_vector(instruction_rank: list, num_examples: int) -> list:
    """Embed the first ``num_examples`` labelled step pairs.

    Relies on the notebook-level ``model`` (the Doc2Vec model created in a
    later cell), which must exist by the time this function is called.

    Args:
        instruction_rank: items shaped ``((text1, rank1), (text2, rank2), label)``,
            i.e. the structure produced by ``makePairs``.
        num_examples: maximum number of items taken from the front of the list.

    Returns:
        list of ``(vector1, vector2, label)`` tuples, where each vector is what
        ``model.infer_vector`` returns for the preprocessed tokens of the text.
    """
    vector_rank = list()

    for step in instruction_rank[:num_examples]:
        # Only the two texts and the label are needed; the ranks are not read
        # because the label already encodes their order.
        instruction1, instruction2, tf = step[0][0], step[1][0], step[2]
        doc1 = model.infer_vector(gensim.utils.simple_preprocess(instruction1))
        doc2 = model.infer_vector(gensim.utils.simple_preprocess(instruction2))
        vector_rank.append((doc1, doc2, tf))

    return vector_rank

In [75]:
# Map each article title to the list of its cleaned steps.
wikihow = process_instructions(dataframe=df)

In [77]:
# Steps of a single article, used as a small worked example in later cells.
sample = wikihow['How to Sell Fine Art Online']

In [78]:
def addRank(lyst):
    """Return ``(element, position)`` tuples for ``lyst``, positions starting at 0."""
    return [(element, index) for index, element in enumerate(lyst)]

In [79]:
def inOrder(lyst):
    """Label each pair with whether its first member precedes its second.

    Every element of ``lyst`` is a pair of ``(item, rank)`` entries. The
    result holds, for each pair, a tuple of the same members followed by one
    boolean: ``first_rank < second_rank``.
    """
    labelled = []
    for pair in lyst:
        members = tuple(pair)
        is_forward = members[0][1] < members[1][1]
        labelled.append(members + (is_forward,))
    return labelled

In [80]:
def makePairs(lyst):
    """Build every ordered pair of distinct steps in ``lyst``, labelled by order.

    Steps are first ranked by position (``addRank``), all 2-permutations of
    the ranked steps are generated, and ``inOrder`` appends the boolean that
    says whether the first step of the pair comes before the second.
    """
    ranked_steps = addRank(lyst)
    ordered_pairs = list(permutations(ranked_steps, 2))
    return inOrder(ordered_pairs)

In [81]:
def makePairsList(wiki):
    """Concatenate the labelled step pairs of every article in ``wiki``.

    Args:
        wiki: dict mapping an article title to its list of steps.

    Returns:
        One flat list with the ``makePairs`` output of each article, in the
        dictionary's iteration order.
    """
    pairslist = list()
    # Read the steps from the argument. The earlier version indexed the
    # notebook-level `wikihow`, so passing any other dict gave wrong results
    # or a KeyError.
    for steps in wiki.values():
        pairslist.extend(makePairs(steps))
    return pairslist
# Labelled ordered step pairs for every article in `wikihow`.
pairs = makePairsList(wikihow)

In [82]:
# Show every labelled ordered pair for the sample article.
# NOTE(review): the full output is long; consider slicing it before sharing.
makePairs(sample)

[(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
   0),
  ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show

In [83]:
# number of rows (one per instruction step) left in df after filtering;
# this is not the number of articles, which would be len(wikihow)
len(df)

993

In [84]:
# (step text, relative position within its article) for every step.
# NOTE(review): the convert_vector call further down is fed `pairs`, not this
# list; confirm whether this variable is still needed.
instruction_rank = get_instruction_rank(database=wikihow)

In [85]:
# Inspect the first labelled pair: ((text1, rank1), (text2, rank2), in_order).
pairs[0]

(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
  0),
 ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show yo

In [86]:
# reads and preprocesses text (tokenize text into words, remove punctuation, lowercase, etc) for gensim doc2vec model
# lyst is list of each paragraph/step per article (like sample)
# corpus = collection of documents (in this case collection of instruction paragraphs)
def read_corpus(lyst, tokens_only=False):
    """Lazily preprocess each text in ``lyst`` for the Doc2Vec model.

    Each text is tokenised with ``gensim.utils.simple_preprocess``. With
    ``tokens_only=True`` the bare token list is yielded (test data);
    otherwise the tokens are wrapped in a ``TaggedDocument`` whose single tag
    is the text's position in ``lyst`` (training data).
    """
    for position, paragraph in enumerate(lyst):
        words = gensim.utils.simple_preprocess(paragraph)
        if not tokens_only:
            # Training documents carry their position as the tag
            yield gensim.models.doc2vec.TaggedDocument(words, [position])
        else:
            yield words

# Build the tagged training corpus; each document's tag is its position in df['text'].
# NOTE(review): only the 'text' column is used here, while process_instructions
# joins headline + text for the step pairs; confirm this difference is intended.
train_corpus = list(read_corpus(df['text']))
#TODO: replace sample with list of all paragraphs in training set
# always set tokens_only=True for test corpus:
#test_corpus = list(read_corpus(TEST_FILE, tokens_only=True)) #TODO: uncomment and replace TEST_FILE with test set 

In [87]:
# Configure the doc2vec model and build its vocabulary from the training corpus;
# the training passes themselves run in a later cell.
# NOTE(review): no seed is passed, so the learned vectors presumably differ
# between runs; confirm whether reproducibility matters here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=84, min_count=2, epochs=40) # not sure about what the vector size should be and other parameters

model.build_vocab(train_corpus)

2020-11-17 16:36:45,707 : INFO : collecting all words and their counts
2020-11-17 16:36:45,710 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-11-17 16:36:45,763 : INFO : collected 8324 word types and 993 unique tags from a corpus of 993 examples and 83402 words
2020-11-17 16:36:45,764 : INFO : Loading a fresh vocabulary
2020-11-17 16:36:45,794 : INFO : effective_min_count=2 retains 4471 unique words (53% of original 8324, drops 3853)
2020-11-17 16:36:45,796 : INFO : effective_min_count=2 leaves 79549 word corpus (95% of original 83402, drops 3853)
2020-11-17 16:36:45,838 : INFO : deleting the raw counts dictionary of 8324 items
2020-11-17 16:36:45,843 : INFO : sample=0.001 downsamples 46 most-common words
2020-11-17 16:36:45,846 : INFO : downsampling leaves estimated 59172 word corpus (74.4% of prior 79549)
2020-11-17 16:36:45,874 : INFO : estimated required memory for 4471 words and 84 dimensions: 5573660 bytes
2020-11-17 16:36:45,875 : INFO : res

In [40]:
#print(f"Word 'artist' appeared {model.wv.get_vecattr('artist', 'count')} times in the training corpus.")

In [88]:
# Example count and epoch count are taken from the model itself (set up by the cell above).
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) # train model on train_corpus

2020-11-17 16:36:47,873 : INFO : training model with 3 workers on 4471 vocabulary and 84 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-11-17 16:36:48,238 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 16:36:48,287 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 16:36:48,290 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 16:36:48,300 : INFO : EPOCH - 1 : training on 83402 raw words (60233 effective words) took 0.4s, 143612 effective words/s
2020-11-17 16:36:48,651 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 16:36:48,713 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 16:36:48,724 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 16:36:48,727 : INFO : EPOCH - 2 : training on 83402 raw words (60231 effective words) took 0.4s, 147500 effective words/s
2020-11-17 16:36:49,086 : INFO : worker 

2020-11-17 16:36:54,960 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 16:36:54,996 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 16:36:55,004 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 16:36:55,009 : INFO : EPOCH - 21 : training on 83402 raw words (60093 effective words) took 0.3s, 238438 effective words/s
2020-11-17 16:36:55,202 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 16:36:55,235 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 16:36:55,242 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 16:36:55,244 : INFO : EPOCH - 22 : training on 83402 raw words (60149 effective words) took 0.2s, 265425 effective words/s
2020-11-17 16:36:55,452 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 16:36:55,479 : INFO : worker thread finished; awaiting finish of 1 more threads
2020

In [89]:
# Same preprocessing as read_corpus, applied to one step so the tokens can be inspected.
tokens = gensim.utils.simple_preprocess(sample[0]) #tokens for first step in 'How to Sell Fine Art Online'
tokens

['sell',
 'yourself',
 'first',
 'before',
 'doing',
 'anything',
 'else',
 'stop',
 'and',
 'sum',
 'up',
 'yourself',
 'as',
 'an',
 'artist',
 'now',
 'think',
 'about',
 'how',
 'to',
 'translate',
 'that',
 'to',
 'an',
 'online',
 'profile',
 'be',
 'it',
 'the',
 'few',
 'words',
 'twitter',
 'allows',
 'you',
 'or',
 'an',
 'entire',
 'page',
 'of',
 'indulgence',
 'that',
 'your',
 'own',
 'website',
 'would',
 'allow',
 'you',
 'bring',
 'out',
 'the',
 'most',
 'salient',
 'features',
 'of',
 'your',
 'creativity',
 'your',
 'experience',
 'your',
 'passion',
 'and',
 'your',
 'reasons',
 'for',
 'painting',
 'make',
 'it',
 'clear',
 'to',
 'readers',
 'why',
 'you',
 'are',
 'an',
 'artist',
 'who',
 'loves',
 'art',
 'produces',
 'high',
 'quality',
 'art',
 'and',
 'is',
 'true',
 'champion',
 'of',
 'art',
 'if',
 'you',
 're',
 'not',
 'great',
 'with',
 'words',
 'find',
 'friend',
 'who',
 'can',
 'help',
 'you',
 'with',
 'this',
 'really',
 'important',
 'aspect',


In [90]:
# The trained model can infer a vector for any piece of text:
# pass the text's list of tokens to model.infer_vector.
vector = model.infer_vector(tokens)
print(vector)

[ 0.27085555  0.11990507  0.02063408 -0.21096985  1.0203168   0.52866864
 -0.14502697  0.1832451  -0.6858422  -0.6690007   0.18705042 -0.8470008
  0.3876491  -0.6896757   0.9221799  -0.4202704   0.13080098 -0.307722
 -0.7279447  -0.60563594 -1.0584785  -0.36325434  0.24150987  0.5087966
  0.1837657  -0.08486474 -0.71944666 -0.5004343  -0.486395   -1.7493131
  0.82491934  0.7183984  -0.80040956  0.09640671 -0.7734688  -0.9012635
  0.0877865   0.09786319 -0.5175517   1.4016423   0.76108503 -0.98979944
  0.46812856 -0.0139284   0.02723021 -0.43255332  0.67788935 -0.53201693
 -1.454294    0.78953356  1.3414856  -0.5116618   0.84442466 -0.00792488
 -0.5265121  -0.6662904  -0.7367144   0.09307131 -0.23984458 -0.03726003
  0.47100154 -0.38545695 -0.34598646 -0.5094018   0.47421443  0.34712806
  0.30094692  0.77383596 -0.5099121  -0.8804944  -0.83237547  0.32196715
 -0.04865706 -0.5498563   0.04667982 -0.16780469 -0.28642428 -1.1808517
  0.7468006  -0.49582145  0.1186481  -1.2218549  -0.441510

In [91]:
# Inferred vector for the one-token document ["photos"].
x = model.infer_vector(["photos"])

In [92]:
# Inferred vector for the one-token document ["photography"].
y = model.infer_vector(["photography"])

In [93]:
# Cosine similarity of the two inferred vectors (1 minus the cosine distance).
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from scipy import spatial
result = 1 - spatial.distance.cosine(x, y)
print(result)

0.9033425450325012


In [94]:
# Embed the first 100 labelled pairs as (vector1, vector2, in_order) tuples.
vector_rank = convert_vector(pairs, num_examples=100)
# TODO: assess model

In [95]:
# Inspect the first embedded pair.
vector_rank[0]

(array([ 0.30812594,  0.19699627, -0.32317978, -0.10177067,  1.0547007 ,
         0.6501607 , -0.18902086,  0.17270344, -0.75753796, -0.7282611 ,
         0.18172285, -0.83313715,  0.43329537, -0.70596766,  0.6443145 ,
        -0.33592606,  0.28127527, -0.25528628, -0.67728376, -0.8785262 ,
        -1.0470879 , -0.396686  ,  0.40348977,  0.65443856,  0.27954477,
        -0.15498677, -0.81001544, -0.61224246, -0.4871311 , -1.8628862 ,
         0.7741014 ,  0.79592395, -0.7173785 ,  0.07841493, -0.9440135 ,
        -0.82249725,  0.12693682,  0.10812056, -0.45881158,  1.2015448 ,
         0.58746725, -1.0713807 ,  0.5480807 , -0.00870604,  0.02906814,
        -0.39805743,  0.64748186, -0.5186005 , -1.2739851 ,  0.6364793 ,
         1.312402  , -0.5899753 ,  0.7264054 ,  0.00308374, -0.42177278,
        -0.7308415 , -0.65236205,  0.11402167, -0.22693917, -0.23360004,
         0.45113   , -0.22918476, -0.32454726, -0.5115528 ,  0.31530008,
         0.42392015,  0.2509518 ,  0.72385925, -0.3