## Loading Data

In [35]:
import warnings
warnings.filterwarnings("ignore")

In [36]:
import os
import markdown

data = dict()
# Load all the files present in the data folder
for root, dirs, files in os.walk('../data'):
    for file in files:
        with open(os.path.join(root, file), 'rb') as f:
            # print(f.read())
            text = markdown.markdown(f.read())
            data[file] = text
            # break

In [37]:
len(data)

74

## PKE Models

In [38]:
import pke
from pke.lang import stopwords
import string

In [39]:
def keyword(text, method='Yake'):
    if method=='Yake':
        extractor = pke.unsupervised.YAKE()
        stoplist = stopwords.get('english')
        extractor.load_document(input=text,
                                language='en',
                                stoplist=stoplist,
                                normalization=None)
        extractor.candidate_selection(n=3)
        window = 2
        use_stems = False
        extractor.candidate_weighting(window=window,
                                    use_stems=use_stems)
        threshold = 0.8
        keyphrases = extractor.get_n_best(n=10, threshold=threshold)
    elif method=='TextRank':
        pos = {'NOUN', 'PROPN', 'ADJ'}
        extractor = pke.unsupervised.TextRank()
        extractor.load_document(input=text,
                                language='en',
                                normalization=None)
        extractor.candidate_weighting(window=2,
                                    pos=pos,
                                    top_percent=0.33)
        keyphrases = extractor.get_n_best(n=10)
    elif method=='SingleRank':
        pos = {'NOUN', 'PROPN', 'ADJ'}
        extractor = pke.unsupervised.SingleRank()
        extractor.load_document(input=text,
                                language='en',
                                normalization=None)
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(window=10,
                                    pos=pos)
        keyphrases = extractor.get_n_best(n=10)
    elif method=='TopicRank':
        extractor = pke.unsupervised.TopicRank()
        stoplist = list(string.punctuation)
        stoplist += pke.lang.stopwords.get('en')
        extractor.load_document(input=text,
                                stoplist=stoplist)
        pos = {'NOUN', 'PROPN', 'ADJ'}
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(threshold=0.74, method='average')
        keyphrases = extractor.get_n_best(n=10)
    elif method=='PositionRank':
        pos = {'NOUN', 'PROPN', 'ADJ'}
        grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
        extractor = pke.unsupervised.PositionRank()
        extractor.load_document(input=text,
                                language='en',
                                normalization=None)
        extractor.candidate_selection(grammar=grammar,
                                    maximum_word_number=3)
        extractor.candidate_weighting(window=10,
                                    pos=pos)
        keyphrases = extractor.get_n_best(n=10)
    elif method=='MultipartiteRank':
        extractor = pke.unsupervised.MultipartiteRank()
        stoplist = list(string.punctuation)
        stoplist += pke.lang.stopwords.get('en')
        extractor.load_document(input=text,
                                stoplist=stoplist)
        pos = {'NOUN', 'PROPN', 'ADJ'}
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1,
                                    threshold=0.74,
                                    method='average')
        keyphrases = extractor.get_n_best(n=10)
    else:
        pass
    return keyphrases

In [40]:
import pandas as pd
df = pd.DataFrame(data.items(), columns=['File', 'Text'])
df

Unnamed: 0,File,Text
0,Business Proposal.md,"<p>b""\r\n### Company Name: Instagram\r\n### Ca..."
1,Marketing Plan.md,<p>b'\r\n### Company Name: Instagram\r\n### Ca...
2,Progress Report.md,<p>b'# Instagram Progress Report\r\n\r\n| Proj...
3,About Instagram.md,"<p>b""# About Instagram\r\n\r\n<strong>Overview..."
4,Board of Directors.md,"<p>b""# Board of Directors\r\n\r\nThe Board of ..."
...,...,...
69,employee_payslip_Robert Martinez.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...
70,employee_payslip_Sarah Wilson.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...
71,employee_payslip_Sophia Garcia.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...
72,employee_payslip_William Anderson.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...


In [41]:
keyword_methods = ['Yake', 'TextRank', 'SingleRank', 'TopicRank', 'PositionRank', 'MultipartiteRank']
keywords = []
for method in keyword_methods:
    df[method] = df['Text'].apply(lambda x: keyword(x, method=method))
    keywords.append(df[method].values)



In [42]:
df

Unnamed: 0,File,Text,Yake,TextRank,SingleRank,TopicRank,PositionRank,MultipartiteRank
0,Business Proposal.md,"<p>b""\r\n### Company Name: Instagram\r\n### Ca...","[(social media marketing, 0.002417962165297132...","[(visual content, 0.05635809323251077), (# # #...",[(comprehensive social media marketing service...,"[(strong, 0.10865335318122646), (visual conten...","[(# company name, 0.07905466811226274), (socia...","[(strong, 0.08479113790373548), (brand presenc..."
1,Marketing Plan.md,<p>b'\r\n### Company Name: Instagram\r\n### Ca...,"[(social media marketing, 1.007278240849013e-0...",[(marketing objectives:</strong>\r\n- increase...,"[(current social media marketing trends, 0.088...","[(strong, 0.1533084555366908), (relevant influ...","[(# company name, 0.08739246617167683), (# doc...","[(strong, 0.1237602253731655), (relevant influ..."
2,Progress Report.md,<p>b'# Instagram Progress Report\r\n\r\n| Proj...,"[(direct messaging update, 8.708934970056568e-...","[(| user feedback, 0.19982836181623714), (| us...",[(video duration limit | content moderation po...,"[(developers, 0.22126828410600863), (designers...","[(progress |, 0.2822538647359515), (| integrat...","[(developers, 0.22126828410600868), (designers..."
3,About Instagram.md,"<p>b""# About Instagram\r\n\r\n<strong>Overview...","[(instagram, 0.02607871663367337), (strong, 0....","[(visual content, 0.03562114105346172), (socia...","[(advocacy campaigns\r\n\r\n < strong, 0.05634...","[(users, 0.05817594536900966), (instagram, 0.0...","[(instagram community today, 0.055395335208253...","[(users, 0.05473342842746924), (instagram, 0.0..."
4,Board of Directors.md,"<p>b""# Board of Directors\r\n\r\nThe Board of ...","[(instagram, 0.01280533000974738), (board, 0.0...","[(brand awareness.\r\n\r\n # # contributions, ...","[(# board composition\r\n\r\nthe board, 0.0950...","[(instagram, 0.0979706959405189), (board membe...","[(# board, 0.1028890006977222), (directors\r\n...","[(instagram, 0.08740494042510544), (board memb..."
...,...,...,...,...,...,...,...,...
69,employee_payslip_Robert Martinez.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[(employee payslip, 0.002276984396537819), (ro...","[(# # employee, 0.1877157650996132), (# # net,...","[(# employee information:\r\n- < strong, 0.310...","[(strong, 0.21073999199624255), (july, 0.17844...","[(b'\r\n # employee, 0.2025736705979232), (emp...","[(strong, 0.2021678993609563), (july, 0.179034..."
70,employee_payslip_Sarah Wilson.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[(employee payslip, 0.00227248528019164), (sar...","[(human resources\r\n\r\n # # pay, 0.226215407...","[(# employee information:\r\n- < strong, 0.303...","[(strong, 0.2216125140828499), (june, 0.172587...","[(b'\r\n # employee, 0.21112254798918972), (em...","[(strong, 0.22590488477386228), (june, 0.16043..."
71,employee_payslip_Sophia Garcia.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[(employee payslip, 0.002276984396537819), (so...","[(# # employee, 0.1893321277085305), (# # net,...","[(# employee information:\r\n- < strong, 0.316...","[(strong, 0.22487419242433876), (november, 0.1...","[(b'\r\n # employee, 0.21066626531180033), (so...","[(strong, 0.21571603823755106), (november, 0.1..."
72,employee_payslip_William Anderson.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[(employee payslip, 0.0022734633413415174), (c...","[(# # employee, 0.19218371052598202), (# # net...",[(customer support\r\n\r\n # # pay period:\r\n...,"[(strong, 0.21439941069804597), (september, 0....","[(b'\r\n # employee, 0.2145245199455731), (wil...","[(strong, 0.21620878655182965), (september, 0...."


In [43]:
# df['Yake'][0]
for i in range(len(df)):
    df['Yake'][i] = [x[0] for x in df['Yake'][i]]
    df['TextRank'][i] = [x[0] for x in df['TextRank'][i]]
    df['SingleRank'][i] = [x[0] for x in df['SingleRank'][i]]
    df['TopicRank'][i] = [x[0] for x in df['TopicRank'][i]]
    df['PositionRank'][i] = [x[0] for x in df['PositionRank'][i]]
    df['MultipartiteRank'][i] = [x[0] for x in df['MultipartiteRank'][i]]

In [44]:
df

Unnamed: 0,File,Text,Yake,TextRank,SingleRank,TopicRank,PositionRank,MultipartiteRank
0,Business Proposal.md,"<p>b""\r\n### Company Name: Instagram\r\n### Ca...","[social media marketing, media marketing servi...","[visual content, # # # company, influencer col...",[comprehensive social media marketing services...,"[strong, visual content, brand presence, influ...","[# company name, social media strategy, social...","[strong, brand presence, engagement, visual co..."
1,Marketing Plan.md,<p>b'\r\n### Company Name: Instagram\r\n### Ca...,"[social media marketing, media marketing trend...",[marketing objectives:</strong>\r\n- increase ...,"[current social media marketing trends, social...","[strong, relevant influencers, brand advocacy,...","[# company name, # document title, content mar...","[strong, relevant influencers, marketing plan,..."
2,Progress Report.md,<p>b'# Instagram Progress Report\r\n\r\n| Proj...,"[direct messaging update, explore algorithm en...","[| user feedback, | user adoption, | content m...",[video duration limit | content moderation pol...,"[developers, designers, hold, encryption, secu...","[progress |, | integration issues, moderation ...","[developers, designers, hold, encryption, secu..."
3,About Instagram.md,"<p>b""# About Instagram\r\n\r\n<strong>Overview...","[instagram, strong, users, platform, community...","[visual content, social media platform, user e...","[advocacy campaigns\r\n\r\n < strong, instagra...","[users, instagram, platform, features, strong,...","[instagram community today, social media platf...","[users, instagram, platform, features, strong,..."
4,Board of Directors.md,"<p>b""# Board of Directors\r\n\r\nThe Board of ...","[instagram, board, chief marketing officer, hi...","[brand awareness.\r\n\r\n # # contributions, c...","[# board composition\r\n\r\nthe board, popular...","[instagram, board members, directors, strong, ...","[# board, directors\r\n\r\nthe board, board me...","[instagram, board members, directors, company,..."
...,...,...,...,...,...,...,...,...
69,employee_payslip_Robert Martinez.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[employee payslip, robert martinez, strong, em...","[# # employee, # # net, # # pay, # employee, #...","[# employee information:\r\n- < strong, # pay ...","[strong, july, pay period, earnings, deduction...","[b'\r\n # employee, employee name:</strong, ro...","[strong, july, pay period, earnings, robert ma..."
70,employee_payslip_Sarah Wilson.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[employee payslip, sarah wilson, strong, emplo...","[human resources\r\n\r\n # # pay, # # employee...","[# employee information:\r\n- < strong, # pay ...","[strong, june, pay period, earnings, employee,...","[b'\r\n # employee, employee id:</strong, empl...","[strong, june, pay period, employee, earnings,..."
71,employee_payslip_Sophia Garcia.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[employee payslip, sophia garcia, strong, empl...","[# # employee, # # net, # # pay, # employee, #...","[# employee information:\r\n- < strong, # pay ...","[strong, november, pay period, earnings, deduc...","[b'\r\n # employee, sophia garcia\r\n- <, 5,45...","[strong, november, pay period, earnings, deduc..."
72,employee_payslip_William Anderson.md,<p>b'\r\n# Employee Payslip\r\n\r\n## Employee...,"[employee payslip, customer support, william a...","[# # employee, # # net, # # pay, # #, # employ...",[customer support\r\n\r\n # # pay period:\r\n-...,"[strong, september, pay period, earnings, dedu...","[b'\r\n # employee, william anderson\r\n- <, #...","[strong, september, pay period, employee, earn..."


In [45]:
df.drop('Text', axis=1, inplace=True)

In [46]:
df.to_csv('keywords.csv', index=False)

## Evaluation