# The algorithm

## Importing packages

In [1]:
import PyPDF2
import re
import requests
import pandas as pd
from itertools import combinations 
from itertools import permutations
from itertools import chain
from nltk.corpus import stopwords

## Document identifier

In [2]:
# numeric id of the document being processed
id_num = 1
# identifier string built from the id; used as the stem of the PDF file name
id_pdf = "id-" + str(id_num)

## Step 1: PDF downloader 

In [3]:
def download_pdf(url, file_name, headers, timeout=30):
    """Download the resource at ``url`` and write it to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Path of the local file the response body is written to (binary mode).
    headers : dict
        HTTP headers sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    bool
        True when the server answered with status 200 and the file was
        written, False otherwise.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged when the request itself fails (connection
        error, timeout, ...).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # best-effort behaviour: report the status code instead of raising
        print(response.status_code)
        return False
    with open(file_name, "wb") as f:
        f.write(response.content)
    return True

## Pulling the PDF

In [4]:
if __name__ == "__main__":
    # HTTP headers sent with the download request
    headers = {"User-Agent": "Chrome/51.0.2704.103"}
    # URL of the PDF; the adjacent literals are joined into a single string
    url = (
        "https://quantinar.s3.eu-west-3.amazonaws.com/"
        "courselet_components%2F214%2F1649337741164476519520220205+"
        "SFE+Slides+Chapter+2.1+The+Basics+of+Option+Management.pdf"
    )
    # local PDF file name, derived from the document identifier
    file_name = "{}.pdf".format(id_pdf)
    # download the PDF to file_name
    download_pdf(url, file_name, headers)

## Step 2: String creator

In [5]:
# Open the downloaded file inside a context manager so the handle is closed
# even if parsing fails. Text extraction stays inside the block in case the
# reader fetches page content from the open file on demand.
# NOTE(review): PdfFileReader / numPages / getPage / extractText look like the
# legacy PyPDF2 names; newer releases renamed them - confirm the installed
# PyPDF2 version before upgrading.
with open(file_name, 'rb') as imported_pdf:
    # convert PDF to a readable object
    transformed_pdf = PyPDF2.PdfFileReader(imported_pdf)
    # get number of pages
    totalpages = transformed_pdf.numPages
    # extract the text of every page: one string per page, in page order
    pdf_output = [transformed_pdf.getPage(i).extractText() for i in range(totalpages)]

## Step 3: Cleaning

In [7]:
# Patterns are raw strings (no invalid-escape warnings) and compiled once.
# URLs: everything from "http" up to the next whitespace of any kind, so a URL
# followed by a line break does not swallow the following word.
_URL_PATTERN = re.compile(r"http\S*")
# Punctuation, brackets, quotes, arithmetic symbols and digits.
_SYMBOL_PATTERN = re.compile(r"""[()\[\]{},.!=:'"#<>%&?*/$+\-\d]""")
# A single word character standing alone between whitespace or the string edges.
# Lookarounds do not consume the neighbouring whitespace, so consecutive
# one-character tokens ("a b c") are all removed.
_SINGLE_CHAR_PATTERN = re.compile(r"(?<!\S)\w(?!\S)")
# Any run of whitespace, including a lone newline or tab.
_WHITESPACE_PATTERN = re.compile(r"\s+")


def _clean_page_text(text):
    """Return ``text`` without URLs, symbols, digits and one-character tokens.

    All whitespace runs are collapsed to a single space and the result is
    lower-cased. Leading/trailing spaces are not stripped.
    """
    text = _URL_PATTERN.sub(" ", text)
    text = _SYMBOL_PATTERN.sub(" ", text)
    text = _SINGLE_CHAR_PATTERN.sub(" ", text)
    # collapse last, so the gaps left by the removals above are normalised too
    text = _WHITESPACE_PATTERN.sub(" ", text)
    return text.lower()


# clean every page; pdf_output stays a list with one string per page
pdf_output = [_clean_page_text(page_text) for page_text in pdf_output]

## Step 4: Word list

In [8]:
def _ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens as a list of ``n``-item lists.

    A token list shorter than ``n`` yields an empty list.
    """
    return [tokens[start:start + n] for start in range(len(tokens) - n + 1)]


# Split each page into tokens. split() without an argument splits on any
# whitespace and drops empty strings, so no "" tokens reach the n-grams.
page_tokens = [page_text.split() for page_text in pdf_output]
# converting to a dataframe: one row per page, one column per token position
# (shorter pages are padded with None)
word_list = pd.DataFrame(page_tokens)
# one-word section: all tokens of all pages, in page order.
# The n-grams are built from the per-page token lists rather than from the
# dataframe: len(word_list) is the number of pages (rows), not the number of
# token positions (columns), and the dataframe padding would leak None values.
one_word_list = [token for tokens in page_tokens for token in tokens]
# two-word section: consecutive pairs within a page (never across pages)
two_word_list = [pair for tokens in page_tokens for pair in _ngrams(tokens, 2)]
# every ordering of each pair except the original one
two_word_permutation_list = [list(permutations(pair))[1:] for pair in two_word_list]
two_word_permutation_set = set(chain.from_iterable(two_word_permutation_list))
# three-word section: consecutive triples within a page
three_word_list = [triple for tokens in page_tokens for triple in _ngrams(tokens, 3)]
# every ordering of each triple except the original one
three_word_permutation_list = [list(permutations(triple))[1:] for triple in three_word_list]
three_word_permutation_set = set(chain.from_iterable(three_word_permutation_list))

## Step 5: Three-word occurrence matrix

In [9]:
# tabulate the triples and keep only rows without missing entries
three_word_frame = pd.DataFrame(three_word_list).dropna()
# join every complete triple into one space-separated string
words = [
    first + " " + second + " " + third
    for first, second, third in three_word_frame.itertuples(index=False, name=None)
]
# creating the (initially empty) dictionary that will hold the counts
dictionary_three_word = {}

In [10]:
# counting word occurrences: start at 0 for unseen entries, then add one
for word in words:
    dictionary_three_word[word] = dictionary_three_word.get(word, 0) + 1

In [11]:
# creating the occurrence matrix; the columns are named in the constructor so
# that "word" and "occurance" exist even when nothing was counted
# (the column name "occurance" is kept as-is because other cells may rely on it)
dictionary_three_words = dictionary_three_word.items()
dictionary_three_list = list(dictionary_three_words)
occurrence_three_matrix = pd.DataFrame(dictionary_three_list, columns=["word", "occurance"])

# clean of NaNs and of the literal string "None"
_keep_three = occurrence_three_matrix["word"].notna() & (occurrence_three_matrix["word"] != "None")

# filter, sort by count (descending) and re-index 0..n-1 in one chain, which
# avoids assigning a column on a filtered slice; the index keeps the name "index"
occurrence_three_matrix = (
    occurrence_three_matrix.loc[_keep_three]
    .sort_values("occurance", ascending=False)
    .reset_index(drop=True)
    .rename_axis("index")
)

## Step 6: Two-word occurrence matrix

In [12]:
# tabulate the pairs and keep only rows without missing entries
two_word_frame = pd.DataFrame(two_word_list).dropna()
# join every complete pair into one space-separated string
words = [
    first + " " + second
    for first, second in two_word_frame.itertuples(index=False, name=None)
]
# creating the (initially empty) dictionary that will hold the counts
dictionary_two_word = {}

In [13]:
# counting word occurrences: start at 0 for unseen entries, then add one
for word in words:
    dictionary_two_word[word] = dictionary_two_word.get(word, 0) + 1

In [14]:
# creating the occurrence matrix; the columns are named in the constructor so
# that "word" and "occurance" exist even when nothing was counted
# (the column name "occurance" is kept as-is because other cells may rely on it)
dictionary_two_words = dictionary_two_word.items()
# stored under its own name so the three-word list is no longer overwritten
dictionary_two_list = list(dictionary_two_words)
occurrence_two_matrix = pd.DataFrame(dictionary_two_list, columns=["word", "occurance"])

# clean of NaNs and of the literal string "None"
_keep_two = occurrence_two_matrix["word"].notna() & (occurrence_two_matrix["word"] != "None")

# filter, sort by count (descending) and re-index 0..n-1 in one chain, which
# avoids assigning a column on a filtered slice; the index keeps the name "index"
occurrence_two_matrix = (
    occurrence_two_matrix.loc[_keep_two]
    .sort_values("occurance", ascending=False)
    .reset_index(drop=True)
    .rename_axis("index")
)

## Step 7: One-word occurrence matrix

In [15]:
# keep every single word except empty strings and lone spaces
# (a new list is built, one_word_list itself is left untouched)
words = [token for token in one_word_list if token not in ("", " ")]
# creating the (initially empty) dictionary that will hold the counts
dictionary_one_word = {}

In [17]:
# counting word occurrences: start at 0 for unseen entries, then add one
for word in words:
    dictionary_one_word[word] = dictionary_one_word.get(word, 0) + 1

In [18]:
# creating the occurrence matrix; dictionary_one_word is left as a dict
# (it is not rebound to its items view) so the counting cell can be re-run,
# and the naming mirrors the two- and three-word steps
dictionary_one_words = dictionary_one_word.items()
dictionary_one_list = list(dictionary_one_words)
# the columns are named in the constructor so that "word" and "occurance"
# exist even when nothing was counted
# (the column name "occurance" is kept as-is because other cells may rely on it)
occurrence_one_matrix = pd.DataFrame(dictionary_one_list, columns=["word", "occurance"])

# clean of NaNs and of the literal string "None"
_keep_one = occurrence_one_matrix["word"].notna() & (occurrence_one_matrix["word"] != "None")

# filter, sort by count (descending) and re-index 0..n-1 in one chain, which
# avoids assigning a column on a filtered slice; the index keeps the name "index"
occurrence_one_matrix = (
    occurrence_one_matrix.loc[_keep_one]
    .sort_values("occurance", ascending=False)
    .reset_index(drop=True)
    .rename_axis("index")
)

# Building the copula

In [20]:
# Tuple of URL strings pointing to PDF files under the "courselet_components"
# prefix of the quantinar S3 bucket. The URLs are stored URL-encoded
# (e.g. "%2F" for "/", "+" for spaces) exactly as listed.
# NOTE(review): presumably these are the documents the pipeline above should be
# run over; nothing in the visible cells downloads or processes them - confirm.
courselist = (
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F175%2F163757167020201210_Liu_crypto_p2p_lending.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F272%2F1654160257Lesson1-1.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F273%2F1654160288Lesson1-2.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F274%2F1654160327Lesson1-3.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F275%2F1654160374Lesson1-4.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F276%2F1654160475Lesson1-5.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F277%2F1654160518Lesson1-6.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F279%2F1654251498Lesson2-1.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F278%2F1654160549Lesson1-7.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F280%2F1654251511Lesson2-2.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F281%2F1654251525Lesson2-3.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F103%2F20210303+IA+METIS+Reinforcement+Learning.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F107%2F1636712642CATE_meets_ML_Presentation.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F120%2F163458263420190429+Hae+Ni+LDA+DTM.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F120%2F163646337920210921+Hae+Ni+LDA.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F121%2F163646358220210708+Hae+Ni+LDA+extensions.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F108%2F163595835420210530+METIS+WANG+Kalman+Filter.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F110%2F1632126441nodalida2021_summaryQuality_slides.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F112%2F163664661320211013+Ren+LI+Hae+Expectile+FRM.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F114%2F1635233254Shapley.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F118%2F1636625638FRM%40EM.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F119%2F163231165920210324+Wan+Hae+Li+k-expectile+clustering.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F130%2F1633104997PAC.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F134%2F163368764620210923+Mer+Hae+GAN+Generative+Adversarial+Networks.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F139%2F163402565120211012+Kho+Hae+Trespassing+random+forests+with+a+pointed+stick+for+self+defence.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F217%2F1644582711Berlin_short_course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F235%2F1649426301Variable+importance+measures+for+RF+.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F179%2F16376558632021122+SBA+JW+Hae+EPF++Quantinar.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F186%2F1645194357Presentation_Quantinar_with_videos.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F306%2FBarHan2021_talk.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F181%2F163826827620211130+LI+Hae+Case+based+Bancruptcy+prediction.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F224%2F164728525020220305+LI+Electricity+Market+Coupling.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F196%2F1642599075163458186420200403+METIS+Kho+Hae+Spectral+Clustering+course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F197%2F1642599236164171878820211207+Hae+Zin+Hierarchical+Clustering+course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F198%2F1642599289163231165920210324+Wan+Hae+Li+k-expectile+clustering+course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F200%2F1642686124163527012720210526+SAE+NAG+HAE+SIZ+Understanding+jumps+in+high+frequency+digital+asset+markets_course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F201%2F16426862241636625638FRM%40EM_course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F202%2F1642686343163774999520210912+Hae+Li+Tao+Dynamic+Crypto+Networks_course.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F189%2F163958119120211130+Hae+Wan+Kot+ComputerMuseum.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F183%2F1643806658KDE+ill-posed+problems.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F167%2F163699399420211115+Liu+Word+Embeddings+2.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F157%2F1642778303introduction_data_science.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F180%2F16377653596.+model+assessment+-+part+4+-+appendix.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F128%2F163707553520210331_METIS_Hel_GANs_for_Time_Series.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F127%2F163458156320190528+Cea+Hae+Scagnostics.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F117%2F163458186420200403+METIS+Kho+Hae+Spectral+Clustering.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F106%2F163458175020200914+Hae+DS2+Data+Science+%26+Digital+Society.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F190%2F1640038524Instruction+for+Creating+Quantlets.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F144%2F1636624210NNCSR_Slides.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F184%2F163888970320211207+Zin+Reu+Hae+USC+Quantinar+40+min+PDF.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F133%2F163707963820210525_Hae_Xia_Crypto_Indices-2.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F129%2F163458163120200915+Kim+Hae+Tri+VCRIX.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F123%2F163664570620210922+Mat+Pac+Hae+guide+hedging+CC.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F122%2F163283000220210923+Cul+Hae+Pet+Xia+Cryptocurrency+as+an+asset+class.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F116%2F163774999520210912+Hae+Li+Tao+Dynamic+Crypto+Networks.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F113%2F1632580703FRM+for+Cryptos.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F101%2F163170747120210914+Reu+DSF+Digital+Surrogate+Finance+Doc.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F97%2F163458105020210808+METIS+Win+Pricing+Kernel+Risk+Premium.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F96%2F163299370720210908_CRC21_Hae_Rodeo_or_Ascot.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F96%2F1635155323202109_RoA.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F79%2F164604019720210502+Hae+Har+Reu+Understanding+CCs.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F241%2F1650632942Biographical+Background+Information.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F210%2F164390414020220130+METIS+Gua+Hae+Model+Selection+Criteria.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F194%2F164171878820211207+Hae+Zin+Hierarchical+Clustering.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F170%2F163709451820211117+Hae+Qia+Network+Centrality.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F142%2F163627928020211107+Hae+Iva+Mat+Delaunay+Triangulation_A_Shape.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F135%2F1649084960Chapter+1.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F136%2F1649094328Chapter+2.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F137%2F1633950248Chapter+3.pdf',
'https://quantinar.s3.eu-west-3.amazonaws.com/courselet_components%2F138%2F1649094430Chapter+4.pdf'
)

In [None]:
# bare expression: the notebook renders the whole courselist tuple as cell output
courselist