In [49]:
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, pairwise
import nltk
from nltk.tokenize import word_tokenize
import os
import spacy
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from itertools import permutations
nlp = spacy.load('en_core_web_sm')

In [6]:
original = pd.read_csv('../wikihowSep.csv')
original.head()

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [7]:
# drop rows where at least 1 element is missing
df = original.dropna()

In [8]:
#use only subset of data
df = df[:1000]

In [9]:
#remove singleton titles
for title in df['title']:
    if (df.title == title).sum() <= 1:
        df = df[df.title != title]
#print(df['title'].value_counts())

In [10]:
#remove extra spaces, newlines, and incorrect ending punctuation
def cleanText(string) -> str:
    lyst = string.split()
    return " ".join(lyst).strip(";")

In [11]:
def process_instructions(dataframe) -> dict:
    # dictionary of title (article) to text (list of steps)
    wikihow = dict()

    for idx, row in df.iterrows():
        title = row['title']
        text = cleanText(row['headline']) + " " + cleanText(row['text'])
        if title and text:
            if title in wikihow:
                wikihow[title].append(text)
            else:
                wikihow[title] = [text]
    
    return wikihow

In [12]:
# pairs of (instruction, rank)
def get_instruction_rank(database: dict) -> list:
    instruction_rank = list()
    
    for article, instructions in database.items():
        for idx, step in enumerate(instructions, start=1):
            instruction_rank.append((step, idx / len(instructions)))
    
    return instruction_rank

In [13]:
# convert text instructions to a vector with spacy
def convert_vector(instruction_rank: list, num_examples: int) -> list:
    vector_rank = list()
    
    for step in instruction_rank[:num_examples]:
        instruction1, rank1,instruction2,rank2,tf = step[0][0], step[0][1],step[1][0],step[1][1],step[2]
        doc1 = nlp(instruction1)
        doc2 = nlp(instruction2)
        vector_rank.append(([doc1.vector, rank1],[doc2.vector,rank2],tf))
    
    return vector_rank

In [14]:
wikihow = process_instructions(dataframe=df)

In [15]:
sample = wikihow['How to Sell Fine Art Online']
print(sample)

["Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.", 'Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show your artwor

In [16]:
def addRank(lyst):
    ranked = list()
    for index, element in enumerate(lyst):
        ranked.append((element, index))
    return ranked

In [17]:
def inOrder(lyst):
    ordered = list()
    for element in lyst:
        element = list(element)
        element.append(element[0][1] < element[1][1])
        ordered.append(tuple(element))
    return ordered

In [18]:
def makePairs(lyst):
    perms = list(permutations(addRank(lyst), 2))
    return inOrder(perms)

In [19]:
def makePairsList(wiki):
    pairslist = list()
    for k in wiki.keys():
        pairslist += makePairs(wikihow[k])
    return pairslist
pairs = makePairsList(wikihow)

In [20]:
makePairs(sample)

[(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
   0),
  ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show

In [99]:
wikihow.keys()

dict_keys(['How to Sell Fine Art Online', 'How to Be Well Read', 'How to Pick a Stage Name1', 'How to Get More Burlesque Gigs', 'How to Get a Record Deal With Phantom City Studio', 'How to Buy on Ticketmaster', 'How to Become a Famous Artist', 'How to Reduce Entertainment Expenses', 'How to Find the Nearest Casino1', 'How to Pick a Stage Name2', 'How to Keep Hobby Costs Down', 'How to Conduct a Workshop', 'How to Create Printed Circuit Boards', 'How to Find the Nearest Casino2', 'How to Find the Nearest Casino3', 'How to Find the Nearest Casino4', 'How to Go Adventuring as a Teen', 'How to Cope with a Bedridden Parent', 'How to Be Healthy', 'How to Check Your Pulse', 'How to Pick a Stage Name3', 'How to Find the Nearest Casino5', 'How to Pick a Stage Name4', 'How to Ask God for Something', 'How to Deprogram a Religious Cult Member1', 'How to Explore & Understand Value or Quality in Life', 'How to Contrast Evolution and Intelligent Design From a Creationist Perspective', 'How to Create 

In [18]:
# number of articles
len(df)

993

In [19]:
instruction_rank = get_instruction_rank(database=wikihow)

In [20]:
pairs[0]

(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
  0),
 ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show yo

In [21]:
# work with 100 instructions to start off; make sure all steps are included per article
vector_rank = convert_vector(pairs, num_examples=100)

In [22]:
print(vector_rank)

[([array([ 6.21317625e-02,  3.13543648e-01,  1.74683422e-01, -1.08683109e+00,
        2.51413912e-01, -6.09057903e-01, -3.15163374e-01,  6.49498284e-01,
       -7.59598434e-01,  7.70982444e-01, -1.35766596e-01, -1.88416585e-01,
        2.40615189e-01,  7.73873329e-01,  3.28081660e-02,  4.95044231e-01,
       -9.67470929e-02, -7.72449493e-01,  2.56166846e-01,  6.20652914e-01,
        1.50883794e-01,  1.77884884e-02,  4.63024974e-01, -4.34872180e-01,
        6.05799377e-01,  8.51967275e-01, -6.08667314e-01, -7.58104503e-01,
       -1.20187199e+00, -1.95022821e-01,  1.81690782e-01, -5.88542938e-01,
        2.31737569e-01, -3.38517755e-01, -7.53019035e-01, -7.53962040e-01,
       -4.85269666e-01, -8.75474012e-04, -2.18475044e-01, -2.85865456e-01,
        1.27605903e+00,  2.85630733e-01,  5.35635889e-01, -7.21778154e-01,
        9.47504103e-01, -5.68873405e-01, -2.27390558e-01,  7.83406317e-01,
        3.34879696e-01, -2.42223531e-01, -7.25738347e-01, -1.34587914e-01,
       -8.25055122e-01

In [125]:
# reads and preprocesses text (tokenize text into words, remove punctuation, lowercase, etc) for gensim doc2vec model
# lyst is list of each paragraph/step per article (like sample)
# corpus = collection of documents (in this case collection of instruction paragraphs)
def read_corpus(lyst, tokens_only=False):
    for i, line in enumerate(lyst):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list()
for key in wikihow.keys():
    train_corpus += read_corpus(wikihow[key])
#TODO: replace sample with list of all paragraphs in training set
# always set tokens_only=True for test corpus:
#test_corpus = list(read_corpus(TEST_FILE, tokens_only=True)) #TODO: uncomment and replace TEST_FILE with test set 

In [145]:
# training the doc2vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=2, epochs=40) # not sure about what the vector size should be and other parameters

model.build_vocab(train_corpus)

2020-11-17 15:39:29,427 : INFO : collecting all words and their counts
2020-11-17 15:39:29,436 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-11-17 15:39:29,470 : INFO : collected 8533 word types and 36 unique tags from a corpus of 993 examples and 90210 words
2020-11-17 15:39:29,472 : INFO : Loading a fresh vocabulary
2020-11-17 15:39:29,483 : INFO : effective_min_count=2 retains 4661 unique words (54% of original 8533, drops 3872)
2020-11-17 15:39:29,484 : INFO : effective_min_count=2 leaves 86338 word corpus (95% of original 90210, drops 3872)
2020-11-17 15:39:29,523 : INFO : deleting the raw counts dictionary of 8533 items
2020-11-17 15:39:29,524 : INFO : sample=0.001 downsamples 46 most-common words
2020-11-17 15:39:29,524 : INFO : downsampling leaves estimated 64695 word corpus (74.9% of prior 86338)
2020-11-17 15:39:29,620 : INFO : estimated required memory for 4661 words and 20 dimensions: 3079140 bytes
2020-11-17 15:39:29,622 : INFO : rese

In [146]:
#print(f"Word 'artist' appeared {model.wv.get_vecattr('artist', 'count')} times in the training corpus.")

In [147]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) # train model on train_corpus

2020-11-17 15:39:31,555 : INFO : training model with 3 workers on 4661 vocabulary and 20 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-11-17 15:39:31,944 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:39:31,978 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:39:32,017 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:39:32,019 : INFO : EPOCH - 1 : training on 90210 raw words (65794 effective words) took 0.4s, 148581 effective words/s
2020-11-17 15:39:32,274 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:39:32,313 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:39:32,333 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:39:32,334 : INFO : EPOCH - 2 : training on 90210 raw words (65689 effective words) took 0.3s, 220341 effective words/s
2020-11-17 15:39:32,642 : INFO : worker 

2020-11-17 15:39:38,540 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:39:38,601 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:39:38,637 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:39:38,638 : INFO : EPOCH - 21 : training on 90210 raw words (65541 effective words) took 0.3s, 197925 effective words/s
2020-11-17 15:39:38,892 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:39:38,904 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 15:39:38,925 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 15:39:38,926 : INFO : EPOCH - 22 : training on 90210 raw words (65730 effective words) took 0.3s, 238357 effective words/s
2020-11-17 15:39:39,151 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 15:39:39,159 : INFO : worker thread finished; awaiting finish of 1 more threads
2020

In [148]:
tokens = gensim.utils.simple_preprocess(sample[0]) #tokens for first step in 'How to Sell Fine Art Online'
tokens

['sell',
 'yourself',
 'first',
 'before',
 'doing',
 'anything',
 'else',
 'stop',
 'and',
 'sum',
 'up',
 'yourself',
 'as',
 'an',
 'artist',
 'now',
 'think',
 'about',
 'how',
 'to',
 'translate',
 'that',
 'to',
 'an',
 'online',
 'profile',
 'be',
 'it',
 'the',
 'few',
 'words',
 'twitter',
 'allows',
 'you',
 'or',
 'an',
 'entire',
 'page',
 'of',
 'indulgence',
 'that',
 'your',
 'own',
 'website',
 'would',
 'allow',
 'you',
 'bring',
 'out',
 'the',
 'most',
 'salient',
 'features',
 'of',
 'your',
 'creativity',
 'your',
 'experience',
 'your',
 'passion',
 'and',
 'your',
 'reasons',
 'for',
 'painting',
 'make',
 'it',
 'clear',
 'to',
 'readers',
 'why',
 'you',
 'are',
 'an',
 'artist',
 'who',
 'loves',
 'art',
 'produces',
 'high',
 'quality',
 'art',
 'and',
 'is',
 'true',
 'champion',
 'of',
 'art',
 'if',
 'you',
 're',
 'not',
 'great',
 'with',
 'words',
 'find',
 'friend',
 'who',
 'can',
 'help',
 'you',
 'with',
 'this',
 'really',
 'important',
 'aspect',


In [149]:
# can use the trained model to infer a vector for any piece of text 
# by passing a list of words to the model.infer_vector function
vector = model.infer_vector(tokens)
print(vector)

[-0.0556741  -1.9647647  -0.07015645  1.714857    1.0003476  -2.577794
 -1.6791416   0.99701554 -2.927988    0.55396247  0.84617954  0.72833776
  2.376785    0.64434993  1.7381406  -1.0681496  -0.5911054  -1.8767139
 -2.603698   -1.7826048 ]


In [150]:
x = model.infer_vector(["photos"])

In [151]:
y = model.infer_vector(["photography"])

In [152]:
from scipy import spatial
result = 1 - spatial.distance.cosine(x, y)
print(result)

0.7660799026489258


In [22]:
# TODO: assess model