In [3]:
#Import Libraries
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, pairwise
import nltk
from nltk.tokenize import word_tokenize
import os
import spacy
import gensim
import logging
from itertools import permutations
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from scipy import spatial
import random
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

In [3]:
#Library Configs
# Emit log records at INFO and above, formatted as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Tokenizer for the pretrained 'bert-base-uncased' checkpoint.
# NOTE(review): not referenced in the cells visible here — presumably used further down; confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# spaCy pipeline 'en_core_web_sm'.
# NOTE(review): not referenced in the cells visible here either — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [6]:
#Import Dataset
# Load the raw WikiHow CSV; the preview below shows the columns
# overview, headline, text, sectionLabel and title.
# NOTE(review): the path is hard-coded to a local Downloads folder — consider a configurable
# data directory so the notebook runs on other machines.
original = pd.read_csv('~/Downloads/wikihowSep.csv')
# Show the first rows as a quick sanity check of the load.
original.head()

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [6]:
#Remove Null Records
# dropna() returns a new frame without rows that contain a missing value;
# `original` itself is left unchanged, so this cell can be re-run safely.
df = original.dropna()

In [7]:
def cleanText(string) -> str:
    """Normalise whitespace and trim semicolons from a piece of text.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is dropped. Any ';' characters at
    either end of the collapsed text are then removed; semicolons inside the
    text are kept.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned text (possibly empty).
    """
    collapsed = " ".join(string.split())
    return collapsed.strip(";")

# dictionary of title (article) to text (list of steps)
def process_instructions(dataframe) -> dict:
    """Group cleaned step texts by article title.

    Each row contributes one step, built as the cleaned 'headline' followed by
    a space and the cleaned 'text'. Steps are collected per 'title' in the
    order the rows appear in `dataframe`.

    Args:
        dataframe: Frame with string columns 'title', 'headline' and 'text'.

    Returns:
        A dict mapping each title to the list of its step texts.
    """
    wikihow = dict()

    # Iterate over the frame that was passed in. The previous version read the
    # notebook-level `df` here and silently ignored its argument.
    for _, row in dataframe.iterrows():
        title = row['title']
        text = cleanText(row['headline']) + " " + cleanText(row['text'])
        # `text` always contains the joining space, so in practice only an
        # empty title causes a row to be skipped.
        if title and text:
            wikihow.setdefault(title, []).append(text)

    return wikihow

In [11]:
# Clean Data
processed_data = process_instructions(df)

# Remove Singleton Titles
# A title with a single step cannot produce any step pair, so keep only
# titles that have at least two steps.
cleaned_data = {}
for title, steps in processed_data.items():
    if len(steps) <= 1:
        continue
    cleaned_data[title] = steps

# Compare len(processed_data) with len(cleaned_data) to see how many
# singleton titles were dropped.

In [None]:
# Save cleaned data
# The context manager flushes and closes the handle as soon as the dump is
# done; previously the file was left open and the write could stay buffered.
with open("cleaned.json", "w") as file:
    json.dump(cleaned_data, file, indent = 4)

In [4]:
# Read cleaned data
# Load the title -> list-of-steps mapping written by the save cell; the
# context manager closes the handle instead of leaking it.
with open("cleaned.json", "r") as file:
    content = json.load(file)

In [6]:
# Randomize cleaned data
TRAIN_SIZE = 10000  # number of titles in the training split
TEST_SIZE = 2000    # number of titles in the test split
SPLIT_SEED = 42     # fixed seed so the split is identical after a kernel restart

keys = list(content.keys())
# Fail early with a clear message instead of an IndexError part-way through.
if len(keys) < TRAIN_SIZE + TEST_SIZE:
    raise ValueError(
        "Need at least %d titles for the train/test split, found %d"
        % (TRAIN_SIZE + TEST_SIZE, len(keys))
    )
# A dedicated Random instance keeps the shuffle reproducible without
# re-seeding the global `random` state used elsewhere.
random.Random(SPLIT_SEED).shuffle(keys)

# Select train/test data
# The first TRAIN_SIZE shuffled titles form the training split and the next
# TEST_SIZE form the test split, so the two never overlap.
train_data = {key: content[key] for key in keys[:TRAIN_SIZE]}
test_data = {key: content[key] for key in keys[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]}

In [4]:
# Save train/test data
# Each file is written inside its own context manager so it is flushed and
# closed immediately; previously both handles were left open, and the
# test.json handle was never released.
with open("train.json", "w") as file:
    json.dump(train_data, file, indent = 4)

with open("test.json", "w") as file:
    json.dump(test_data, file, indent = 4)

NameError: name 'train_data' is not defined

In [5]:
# Add Rank of Each Step in List (starting with zero)
def addRank(lyst):
    """Pair every element with its zero-based position.

    Args:
        lyst: Iterable of steps.

    Returns:
        A list of (element, index) tuples in the original order.
    """
    return [(step, position) for position, step in enumerate(lyst)]

# Add whether steps are in order (first < second)
def inOrder(lyst):
    """Append an in-order flag to every pair of ranked steps.

    Each input element holds two (step, rank) items. The flag is True when the
    rank of the first item is strictly lower than the rank of the second.

    Args:
        lyst: Iterable of pairs of (step, rank) items.

    Returns:
        A list of tuples: the original items followed by the boolean flag.
    """
    return [
        (*items, items[0][1] < items[1][1])
        for items in map(tuple, lyst)
    ]

In [6]:
# Permute list of steps to make all possible pairs of steps
def makePairs(lyst):
    """Build every ordered pair of distinct steps, labelled with its order.

    The steps are ranked by position, all 2-permutations of the ranked steps
    are generated (so each pair appears in both directions), and each pair is
    tagged with whether its first step comes before its second.

    Args:
        lyst: List of steps belonging to one article.

    Returns:
        A list of ((step, rank), (step, rank), in_order) tuples.
    """
    ranked_steps = addRank(lyst)
    ordered_pairs = list(permutations(ranked_steps, 2))
    return inOrder(ordered_pairs)

# Makes all permutations from a given dictionary
def makePairsList(wiki):
    """Collect the labelled step pairs of every article into one flat list.

    Args:
        wiki: Dict mapping an article title to its list of steps.

    Returns:
        The pairs produced by makePairs for each article, concatenated in the
        dict's iteration order.
    """
    return [pair for steps in wiki.values() for pair in makePairs(steps)]

In [None]:
# Make Pairs from Dataset
# Expand each split into labelled step pairs of the form
# ((step, rank), (step, rank), in_order).
# NOTE(review): needs train_data / test_data in the kernel; the NameError shown in the
# save cell suggests cells were run out of order — confirm with Restart & Run All.
train_pairs = makePairsList(train_data)
test_pairs = makePairsList(test_data)

In [None]:
# Display the first training pair to sanity-check the tuple layout.
train_pairs[0]