In [1]:
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import nltk
from nltk.tokenize import word_tokenize
import os
import spacy
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from itertools import permutations
nlp = spacy.load('en_core_web_sm')

In [2]:
# Load the raw wikiHow CSV; the path is relative to the notebook's working directory.
original = pd.read_csv('../wikihowSep.csv')
# Preview the first rows to check the columns later cells rely on (headline, text, title).
original.head()

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [3]:
# drop rows where at least 1 element is missing
# (cleanText calls .split() on 'headline' and 'text', which would fail on a missing value)
df = original.dropna()

In [4]:
# use only a subset of the data: the first 1000 rows that survived dropna()
df = df[:1000]

In [5]:
# remove singleton titles: keep only rows whose title appears on at least one other row.
# duplicated(keep=False) flags every member of a repeated group, so a single vectorized
# mask replaces re-filtering the whole frame once per row.
df = df[df.duplicated(subset='title', keep=False)]

In [6]:
#remove extra spaces, newlines, and incorrect ending punctuation
def cleanText(string) -> str:
    """Collapse every run of whitespace to one space, then strip ';' from both ends."""
    words = string.split()
    single_spaced = " ".join(words)
    return single_spaced.strip(";")

In [7]:
def process_instructions(dataframe) -> dict:
    """Group cleaned step texts by article title.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide 'title', 'headline' and 'text' columns; 'headline' and
        'text' must hold strings because they are passed to cleanText.

    Returns
    -------
    dict
        Maps each title to the list of its steps in row order. A step is the
        cleaned headline and the cleaned text joined by a single space.
    """
    # dictionary of title (article) to text (list of steps)
    wikihow = dict()

    # Iterate over the argument, not the notebook-level `df`, so the function
    # processes whichever frame the caller passes in.
    for _, row in dataframe.iterrows():
        title = row['title']
        text = cleanText(row['headline']) + " " + cleanText(row['text'])
        if title and text:
            wikihow.setdefault(title, []).append(text)

    return wikihow

In [8]:
# pairs of (instruction, rank)
def get_instruction_rank(database: dict) -> list:
    """Flatten a {title: [steps]} mapping into (step, relative position) tuples.

    The relative position is the 1-based index of the step divided by the
    number of steps in its article, so the last step of an article gets 1.0.
    """
    return [
        (step, position / len(steps))
        for steps in database.values()
        for position, step in enumerate(steps, start=1)
    ]

In [9]:
# convert text instructions to a vector with spacy
def convert_vector(instruction_rank: list, num_examples: int) -> list:
    """Embed both instructions of each labelled pair with the spaCy pipeline `nlp`.

    Parameters
    ----------
    instruction_rank : list
        Despite the name, each entry is a labelled pair of the form
        ``((instruction1, rank1), (instruction2, rank2), flag)``.
    num_examples : int
        Only the first `num_examples` entries are converted.

    Returns
    -------
    list
        One ``([vector1, rank1], [vector2, rank2], flag)`` tuple per converted
        entry, where each vector is the `.vector` of the parsed instruction.
        Entries that share an instruction text share the same vector object.
    """
    vector_rank = list()
    # The same instruction text occurs in many pairs; parse each distinct text
    # once and reuse its vector instead of running the pipeline again.
    vector_by_text = dict()

    def embed(text):
        if text not in vector_by_text:
            vector_by_text[text] = nlp(text).vector
        return vector_by_text[text]

    for step in instruction_rank[:num_examples]:
        first, second, tf = step[0], step[1], step[2]
        vector_rank.append(([embed(first[0]), first[1]], [embed(second[0]), second[1]], tf))

    return vector_rank

In [10]:
# Build the {title: [step text, ...]} mapping from the filtered rows.
wikihow = process_instructions(dataframe=df)

In [11]:
# Steps of one article, reused as a small running example in later cells.
sample = wikihow['How to Sell Fine Art Online']
print(sample)

["Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.", 'Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show your artwor

In [12]:
def addRank(lyst):
    """Pair every element of `lyst` with its 0-based position: [(element, index), ...]."""
    return [(item, position) for position, item in enumerate(lyst)]

In [13]:
def inOrder(lyst):
    """Append an ordering flag to every pair in `lyst`.

    Each entry holds two (item, rank) tuples; the appended flag is True when
    the first rank is strictly lower than the second.
    """
    labelled = []
    for pair in lyst:
        members = tuple(pair)
        comes_first = members[0][1] < members[1][1]
        labelled.append(members + (comes_first,))
    return labelled

In [14]:
def makePairs(lyst):
    """Return every ordered pair of ranked steps from `lyst`, labelled by inOrder."""
    ranked_steps = addRank(lyst)
    ordered_pairs = list(permutations(ranked_steps, 2))
    return inOrder(ordered_pairs)

In [15]:
def makePairsList(wiki):
    """Collect the labelled step pairs of every article in `wiki`.

    `wiki` maps an article title to its ordered list of step texts. The result
    is the concatenation, in dictionary order, of makePairs(steps) per article.
    """
    pairslist = list()
    # Read the steps from the argument rather than the notebook-level `wikihow`,
    # so the function also works for dictionaries other than the global one.
    for steps in wiki.values():
        pairslist += makePairs(steps)
    return pairslist
# All ordered step pairs across every article; each pair carries a flag that is True when its first step comes earlier.
pairs = makePairsList(wikihow)

In [16]:
# Inspect the labelled pairs generated for the example article.
makePairs(sample)

[(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
   0),
  ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show

In [17]:
# Display the whole title -> steps mapping (large output).
wikihow

{'How to Sell Fine Art Online': ["Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
  'Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saat

In [18]:
# number of rows (one per step) left after filtering -- not the number of articles,
# which is the number of distinct titles, i.e. len(wikihow)
len(df)

993

In [19]:
# (step text, relative position) tuples; position = 1-based step index / number of steps in the article.
# NOTE(review): no later cell visible here uses this variable; convert_vector is called with `pairs` instead.
instruction_rank = get_instruction_rank(database=wikihow)

In [20]:
# Inspect the first labelled pair.
pairs[0]

(("Sell yourself first. Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.",
  0),
 ('Join online artist communities. Depending on what scale you intend to sell your art pieces, you may want to get an account on an online art community or store, like Deviant Art. With 15% -20 % brokerage, you can also find many online art galleries like Art Brokerage, Diva Art Group, or Saatchi Art that will show yo

In [21]:
# work with the first 100 step *pairs* (not 100 instructions) to start off: convert_vector slices pairs[:100]
# TODO: make sure all steps are included per article -- a fixed cut-off can stop part-way through an article's pairs
vector_rank = convert_vector(pairs, num_examples=100)

In [22]:
# Dump the embedded pairs for inspection (very long output).
print(vector_rank)

[([array([ 6.21317625e-02,  3.13543648e-01,  1.74683422e-01, -1.08683109e+00,
        2.51413912e-01, -6.09057903e-01, -3.15163374e-01,  6.49498284e-01,
       -7.59598434e-01,  7.70982444e-01, -1.35766596e-01, -1.88416585e-01,
        2.40615189e-01,  7.73873329e-01,  3.28081660e-02,  4.95044231e-01,
       -9.67470929e-02, -7.72449493e-01,  2.56166846e-01,  6.20652914e-01,
        1.50883794e-01,  1.77884884e-02,  4.63024974e-01, -4.34872180e-01,
        6.05799377e-01,  8.51967275e-01, -6.08667314e-01, -7.58104503e-01,
       -1.20187199e+00, -1.95022821e-01,  1.81690782e-01, -5.88542938e-01,
        2.31737569e-01, -3.38517755e-01, -7.53019035e-01, -7.53962040e-01,
       -4.85269666e-01, -8.75474012e-04, -2.18475044e-01, -2.85865456e-01,
        1.27605903e+00,  2.85630733e-01,  5.35635889e-01, -7.21778154e-01,
        9.47504103e-01, -5.68873405e-01, -2.27390558e-01,  7.83406317e-01,
        3.34879696e-01, -2.42223531e-01, -7.25738347e-01, -1.34587914e-01,
       -8.25055122e-01

In [23]:
# reads and preprocesses text (tokenize text into words, remove punctuation, lowercase, etc) for gensim doc2vec model
# lyst is list of each paragraph/step per article (like sample)
# corpus = collection of documents (in this case collection of instruction paragraphs)
def read_corpus(lyst, tokens_only=False):
    """Lazily tokenize each paragraph of `lyst` for gensim's Doc2Vec.

    Yields the bare token list when `tokens_only` is true; otherwise yields a
    TaggedDocument whose single tag is the paragraph's position in `lyst`.
    """
    for position, paragraph in enumerate(lyst):
        words = gensim.utils.simple_preprocess(paragraph)
        if not tokens_only:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(words, [position])
        else:
            yield words

# Only the example article's paragraphs are used for now, so the training corpus is tiny.
train_corpus = list(read_corpus(sample)) #TODO: replace sample with list of all paragraphs in training set
# always set tokens_only=True for test corpus:
#test_corpus = list(read_corpus(TEST_FILE, tokens_only=True)) #TODO: uncomment and replace TEST_FILE with test set 

In [24]:
# training the doc2vec model
# vector_size=50 -> 50-dimensional document vectors; min_count=2 -> words seen fewer than 2 times are dropped; epochs=40 -> training passes
# NOTE(review): no seed is passed, so the trained vectors may differ between runs -- confirm whether reproducibility is needed here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40) # not sure about what the vector size should be and other parameters

# Scan the corpus once to build the vocabulary before training.
model.build_vocab(train_corpus)

2020-11-17 11:05:47,524 : INFO : collecting all words and their counts
2020-11-17 11:05:47,524 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-11-17 11:05:47,525 : INFO : collected 564 word types and 9 unique tags from a corpus of 9 examples and 1529 words
2020-11-17 11:05:47,525 : INFO : Loading a fresh vocabulary
2020-11-17 11:05:47,526 : INFO : effective_min_count=2 retains 185 unique words (32% of original 564, drops 379)
2020-11-17 11:05:47,526 : INFO : effective_min_count=2 leaves 1150 word corpus (75% of original 1529, drops 379)
2020-11-17 11:05:47,527 : INFO : deleting the raw counts dictionary of 564 items
2020-11-17 11:05:47,528 : INFO : sample=0.001 downsamples 82 most-common words
2020-11-17 11:05:47,528 : INFO : downsampling leaves estimated 597 word corpus (52.0% of prior 1150)
2020-11-17 11:05:47,529 : INFO : estimated required memory for 185 words and 50 dimensions: 168300 bytes
2020-11-17 11:05:47,529 : INFO : resetting layer weigh

In [25]:
#print(f"Word 'artist' appeared {model.wv.get_vecattr('artist', 'count')} times in the training corpus.")

In [26]:
# Reuse the corpus size and epoch count already stored on the model so they stay consistent with the cells above.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) # train model on train_corpus

2020-11-17 11:05:47,574 : INFO : training model with 3 workers on 185 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-11-17 11:05:47,576 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 11:05:47,577 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 11:05:47,578 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 11:05:47,578 : INFO : EPOCH - 1 : training on 1529 raw words (607 effective words) took 0.0s, 254933 effective words/s
2020-11-17 11:05:47,579 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 11:05:47,580 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 11:05:47,581 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 11:05:47,582 : INFO : EPOCH - 2 : training on 1529 raw words (601 effective words) took 0.0s, 235137 effective words/s
2020-11-17 11:05:47,583 : INFO : worker thread 

2020-11-17 11:05:47,657 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 11:05:47,658 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 11:05:47,660 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 11:05:47,660 : INFO : EPOCH - 21 : training on 1529 raw words (625 effective words) took 0.0s, 176747 effective words/s
2020-11-17 11:05:47,662 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 11:05:47,664 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17 11:05:47,665 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-17 11:05:47,665 : INFO : EPOCH - 22 : training on 1529 raw words (603 effective words) took 0.0s, 194259 effective words/s
2020-11-17 11:05:47,667 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-17 11:05:47,669 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-17

In [27]:
# Same preprocessing call as read_corpus, so the tokens match the form the model was trained on.
tokens = gensim.utils.simple_preprocess(sample[0]) #tokens for first step in 'How to Sell Fine Art Online'
tokens

['sell',
 'yourself',
 'first',
 'before',
 'doing',
 'anything',
 'else',
 'stop',
 'and',
 'sum',
 'up',
 'yourself',
 'as',
 'an',
 'artist',
 'now',
 'think',
 'about',
 'how',
 'to',
 'translate',
 'that',
 'to',
 'an',
 'online',
 'profile',
 'be',
 'it',
 'the',
 'few',
 'words',
 'twitter',
 'allows',
 'you',
 'or',
 'an',
 'entire',
 'page',
 'of',
 'indulgence',
 'that',
 'your',
 'own',
 'website',
 'would',
 'allow',
 'you',
 'bring',
 'out',
 'the',
 'most',
 'salient',
 'features',
 'of',
 'your',
 'creativity',
 'your',
 'experience',
 'your',
 'passion',
 'and',
 'your',
 'reasons',
 'for',
 'painting',
 'make',
 'it',
 'clear',
 'to',
 'readers',
 'why',
 'you',
 'are',
 'an',
 'artist',
 'who',
 'loves',
 'art',
 'produces',
 'high',
 'quality',
 'art',
 'and',
 'is',
 'true',
 'champion',
 'of',
 'art',
 'if',
 'you',
 're',
 'not',
 'great',
 'with',
 'words',
 'find',
 'friend',
 'who',
 'can',
 'help',
 'you',
 'with',
 'this',
 'really',
 'important',
 'aspect',


In [28]:
# can use the trained model to infer a vector for any piece of text 
# by passing a list of words to the model.infer_vector function
# NOTE(review): inference is presumably stochastic, so repeated calls may give slightly different vectors -- confirm before comparing across runs.
vector = model.infer_vector(tokens)
print(vector)

[-0.5857965  -0.06106334 -0.26508826 -0.18579628 -0.33508375 -0.76775855
  0.29144016 -0.00404861  0.20746256 -0.47810903 -0.08840881  0.5348068
 -0.18775548  0.23072203  0.01696482 -0.01909244  0.34202895  0.08504286
 -0.38305855 -0.04310068 -0.152386   -0.744613    0.5785257   0.11892451
 -0.6605066  -0.39677238 -0.00140902 -0.38689682  0.5335552   0.19275896
 -0.1083059   0.29847503 -0.03626053 -0.57498103 -0.16093323 -0.41904616
 -0.31134427  0.2355971   0.02838269 -0.53255486  0.2284742   0.0094249
 -0.6767387   0.00805737  0.46503597  0.5148004  -0.06537412  0.52797973
  0.49387485  0.3561448 ]


In [29]:
# TODO: assess model