# Extract the links, the titles and the paragraphs of the articles from the main article and sort them by similarity to the main article

In [2]:
# Import the necessary libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
from openai import OpenAI
import openai
import os
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer

## Web_scrapping

In [3]:
# Get the main article
requete = requests.get('https://en.wikipedia.org/wiki/machine_learning')
page = BeautifulSoup(requete.text, 'html.parser')
wiki_main = {}
wiki_main['link'] = 'https://en.wikipedia.org/wiki/machine_learning'
wiki_main['title'] = page.find('h1').text
wiki_main['paragraph'] = page.find('p').text
wiki_main

{'link': 'https://en.wikipedia.org/wiki/machine_learning',
 'title': 'Machine learning',
 'paragraph': 'Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]\n'}

In [4]:
# Extract the links of the articles
links = page.find_all('a')
http_links = [f"{link.get('href')}" for link in links if link.get('href') and link.get('href').startswith('/wiki')]  
wiki_list = []
wiki_dict_sans_doublon = []

In [5]:
# Create a list of dictionaries containing the links, titles, and paragraphs of the articles
for link in http_links:
    wiki_dict = {}
    requete = requests.get("https://en.wikipedia.org" + link)
    page = BeautifulSoup(requete.text, 'html.parser')
    h_1 = page.find('h1')
    p_1 = page.find('p')
    if p_1 is not None and p_1.text not in wiki_dict_sans_doublon: # We filter the duplicates
        wiki_dict_sans_doublon.append(p_1.text)
        wiki_dict["link"] = link
        wiki_dict["title"] = h_1.text
        wiki_dict["paragraph"] = p_1.text
        wiki_list.append(wiki_dict)
print(wiki_list[:5])

[{'link': '/wiki/Main_Page', 'title': 'Main Page', 'paragraph': 'Ernest Roberts (21\xa0February 1868\xa0– 2\xa0December 1913) was a Labor member of the South Australian House of Assembly, and then the Australian House of Representatives. Roberts emigrated to Australia from the UK and worked in Port Pirie, South Australia, where he was a member of its town council. In 1896, aged 28, he became the youngest person elected to the House of Assembly and quickly gained a reputation for his oratory. He served in South Africa twice during the Second Boer War, rising to the rank of captain. During his second period of service his term in the South Australian parliament expired. After returning home, he was the editor of a political newspaper before being elected to the House of Assembly again in 1905. He was elected to the federal House of Representatives in a by-election in 1908 and was appointed as an honorary minister in 1911. After a fiery parliamentary debate on 2\xa0December 1913, Roberts 

## Words Embedding

In [6]:
# Create an OpenAI client
client = OpenAI()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [7]:
# Create an embedding of the paragraph and title for the wiki_main article
response = client.embeddings.create(
    input=wiki_main['title'] + wiki_main['paragraph'],
    model="text-embedding-ada-002"
)
wiki_main["embeddings"] = response.data[0].embedding

In [9]:
def jaccard_similarity(A, B):
    #Find intersection of two sets
    nominator = A.intersection(B)

    #Find union of two sets
    denominator = A.union(B)

    #Take the ratio of sizes
    similarity = len(nominator)/len(denominator)
    
    return similarity

In [12]:
# Create an embedding of the paragraph and title for each article in wiki_list
for wiki in wiki_list:
    response = client.embeddings.create(
        input=wiki['title'] + wiki['paragraph'],
        model="text-embedding-ada-002"
    )
    wiki["embeddings"] = response.data[0].embedding
    # Calculate the similarity between the main article and each article in wiki_list using the dot product of their embeddings
    wiki['similarity_embedding_dot_product'] = np.dot(wiki_main['embeddings'], wiki['embeddings'])
    # Calculate the similarity between the main article and each article in wiki_list using the cosine similarity of their embeddings
    wiki['similarity_embedding_cosine_similarity'] = np.dot(wiki_main['embeddings'], wiki['embeddings']) / (np.linalg.norm(wiki_main['embeddings']) * np.linalg.norm(wiki['embeddings']))
    # Calculate the similarity between the main article and each article in wiki_list using the jaccard similarity of their embeddings
    wiki['similarity_embedding_jaccard_similarity'] = jaccard_similarity(set(wiki_main['embeddings']), set(wiki['embeddings']))

In [13]:
# Sort the wiki_list by similarity using the dot product of their embeddings
wiki_list.sort(key=lambda x: x['similarity_embedding_dot_product'], reverse=True)

# Print the top 5 articles by similarity using the dot product of their embeddings
for wiki in wiki_list[:5]:
    print(wiki['similarity_embedding_dot_product'])
    print("https://en.wikipedia.org" + wiki['link'])
    print(wiki['title'])
    print(wiki['paragraph'])

1.0000000882947506
https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

0.9274254193423537
https://en.wikipedia.org/wiki/Category:Machine_learning
Category:Machine learning
Machine learning is a branch of statistics and computer science which studies algorithms and architectures that learn from observed facts.

0.911577885978255
https://en.wikipedia.org/wiki/Automated_machine_learning
Automated machine lear

In [14]:
# Sort the wiki_list by similarity using the cosine similarity of their embeddings
wiki_list.sort(key=lambda x: x['similarity_embedding_cosine_similarity'], reverse=True)

# Print the top 5 articles by similarity using the cosine similarity of their embeddings
for wiki in wiki_list[:5]:
    print(wiki['similarity_embedding_cosine_similarity'])
    print("https://en.wikipedia.org" + wiki['link'])
    print(wiki['title'])
    print(wiki['paragraph'])

0.9999999999999998
https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

0.9274253257534554
https://en.wikipedia.org/wiki/Category:Machine_learning
Category:Machine learning
Machine learning is a branch of statistics and computer science which studies algorithms and architectures that learn from observed facts.

0.9115778982802039
https://en.wikipedia.org/wiki/Automated_machine_learning
Automated machine lea

In [15]:
# Sort the wiki_list by similarity using the jaccard similarity of their embeddings
wiki_list.sort(key=lambda x: x['similarity_embedding_jaccard_similarity'], reverse=True)

# Print the top 5 articles by similarity using the jaccard similarity of their embeddings
for wiki in wiki_list[:5]:
    print(wiki['similarity_embedding_jaccard_similarity'])
    print("https://en.wikipedia.org" + wiki['link'])
    print(wiki['title'])
    print(wiki['paragraph'])

1.0
https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

0.0007079646017699115
https://en.wikipedia.org/wiki/Cheminformatics
Cheminformatics
Cheminformatics (also known as chemoinformatics) refers to the use of physical chemistry theory with computer and information science techniques—so called "in silico" techniques—in application to a range of descriptive and prescriptive problems in the field of chemistr

## Text Vectorization

In [16]:
# Load the stopwords
nltk.download('stopwords')
stop_en = stopwords.words('english')
stop_words_ext = list(stop_en)
vectorizer = CountVectorizer(stop_words=stop_words_ext, token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z_-]+\b")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Create the corpus by concatenating the title and the paragraph of each article
corpus = []
for wiki in wiki_list:
    corpus.append(wiki['title'])
    corpus.append(wiki['paragraph'])
print(corpus[:5])

['Machine learning', 'Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]\n', 'Cheminformatics', 'Cheminformatics (also known as chemoinformatics) refers to the use of physical chemistry theory with computer and information science techniques—so called "in silico" techniques—in application to a range of descriptive and prescriptive problems in the field of chemistry, including in its applications to biology and related molecular fields. Such in silico techniques are use

In [18]:
# Fit the vectorizer to the corpus
vectorizer.fit_transform(corpus)

<1152x5344 sparse matrix of type '<class 'numpy.int64'>'
	with 19883 stored elements in Compressed Sparse Row format>

In [19]:
# Create the vector for the main article
wiki_main['vector'] = vectorizer.transform([wiki_main['title'] + wiki_main['paragraph']]).toarray()[0]

In [21]:
# Calculate the similarity between the main article and each article in wiki_list using the dot product of their vectors
for wiki in wiki_list:
    wiki['vector'] = vectorizer.transform([wiki['title'] + wiki['paragraph']]).toarray()[0]
    # Calculate the similarity between the main article and each article in wiki_list using the dot product of their vectors
    wiki['similarity_vector_dot_product'] = np.dot(wiki_main['vector'], wiki['vector'])
    # Calculate the similarity between the main article and each article in wiki_list using the cosine similarity of their vectors
    wiki['similarity_vector_cosine_similarity'] = np.dot(wiki_main['vector'], wiki['vector']) / (np.linalg.norm(wiki_main['vector']) * np.linalg.norm(wiki['vector']))
    # Calculate the similarity between the main article and each article in wiki_list using the jaccard similarity of their vectors
    wiki['similarity_vector_jaccard_similarity'] = jaccard_similarity(set(wiki_main['vector']), set(wiki['vector']))

In [22]:
# Sort the wiki_list by similarity to the main article using the dot product of their vectors
wiki_list.sort(key=lambda x: x['similarity_vector_dot_product'], reverse=True)

# Print the top 5 articles by similarity to the main article using the dot product of their vectors
for wiki in wiki_list[:5]:
    print(wiki['similarity_vector_dot_product'])
    print("https://en.wikipedia.org" + wiki['link'])
    print(wiki['title'])
    print(wiki['paragraph'])

68
https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

31
https://en.wikipedia.org/wiki/Online_machine_learning
Online machine learning
In computer science, online machine learning is a method of machine learning in which data becomes available in a sequential order and is used to update the best predictor for future data at each step, as opposed to batch learning techniques which generate the best predict

In [23]:
# Sort the wiki_list by similarity to the main article using the cosine similarity of their vectors
wiki_list.sort(key=lambda x: x['similarity_vector_cosine_similarity'], reverse=True)

# Print the top 5 articles by similarity to the main article using the cosine similarity of their vectors
for wiki in wiki_list[:5]:
    print(wiki['similarity_vector_cosine_similarity'])
    print("https://en.wikipedia.org" + wiki['link'])
    print(wiki['title'])
    print(wiki['paragraph'])

1.0
https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

0.3279680246763151
https://en.wikipedia.org/wiki/Computational_learning_theory
Computational learning theory
In computer science, computational learning theory (or just learning theory) is a subfield of artificial intelligence devoted to studying the design and analysis of machine learning algorithms.[1]

0.3218393429334682
https://en.wikipedia.org/wi

In [24]:
# Sort the wiki_list by similarity to the main article using the jaccard similarity of their vectors
wiki_list.sort(key=lambda x: x['similarity_vector_jaccard_similarity'], reverse=True)

# Print the top 5 articles by similarity to the main article using the jaccard similarity of their vectors
for wiki in wiki_list[:5]:
    print(wiki['similarity_vector_jaccard_similarity'])
    print("https://en.wikipedia.org" + wiki['link'])
    print(wiki['title'])
    print(wiki['paragraph'])

1.0
https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

1.0
https://en.wikipedia.org/wiki/Neural_Designer
Neural Designer
Neural Designer is a software tool for machine learning based on neural networks, a main area of artificial intelligence research, and contains a graphical user interface which simplifies data entry and interpretation of results.

1.0
https://en.wikipedia.org/wiki/Machine_Learning_(jour