# Extract the links, titles and first paragraphs of the articles linked from a main Wikipedia article, and rank them by similarity to that article

In [132]:
# Import the necessary libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
from openai import OpenAI
import openai
import os
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer

## Web scraping

In [133]:
# Get the main article: download the page once and keep its link, title and first <p> text
MAIN_ARTICLE_URL = 'https://en.wikipedia.org/wiki/machine_learning'
# A timeout keeps the cell from hanging forever if the server never answers
requete = requests.get(MAIN_ARTICLE_URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
requete.raise_for_status()
page = BeautifulSoup(requete.text, 'html.parser')
wiki_main = {
    'link': MAIN_ARTICLE_URL,
    'title': page.find('h1').text,
    'paragraph': page.find('p').text,
}
wiki_main

{'link': 'https://en.wikipedia.org/wiki/machine_learning',
 'title': 'Machine learning',
 'paragraph': 'Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]\n'}

In [134]:
# Extract the links of the articles
links = page.find_all('a')
# Keep only internal "/wiki/..." paths. dict.fromkeys drops repeated hrefs while
# preserving first-seen order, so the same page is not requested several times.
hrefs = (anchor.get('href') for anchor in links)
http_links = list(dict.fromkeys(href for href in hrefs if href and href.startswith('/wiki/')))
# Filled by the scraping loop: one dict per article, and the paragraphs already seen
wiki_list = []
wiki_dict_sans_doublon = []

In [135]:
# Create a list of dictionaries containing the links, titles, and paragraphs of the articles
# A set gives O(1) duplicate checks; the list is still kept up to date alongside it.
seen_paragraphs = set(wiki_dict_sans_doublon)
with requests.Session() as session:  # reuse one connection for all the requests
    for link in http_links:
        try:
            requete = session.get("https://en.wikipedia.org" + link, timeout=30)
        except requests.RequestException as error:
            # One unreachable page should not abort the whole crawl
            print(f"Skipping {link}: {error}")
            continue
        if not requete.ok:
            print(f"Skipping {link}: HTTP {requete.status_code}")
            continue
        # Use a separate name so the main article's `page` is not overwritten
        article_page = BeautifulSoup(requete.text, 'html.parser')
        h_1 = article_page.find('h1')
        p_1 = article_page.find('p')
        if h_1 is None or p_1 is None:
            # Without both a title and a paragraph there is nothing to compare
            continue
        if p_1.text in seen_paragraphs:  # We filter the duplicates
            continue
        seen_paragraphs.add(p_1.text)
        wiki_dict_sans_doublon.append(p_1.text)
        wiki_list.append({"link": link, "title": h_1.text, "paragraph": p_1.text})
print(wiki_list[:5])

[{'link': '/wiki/Main_Page', 'title': 'Main Page', 'paragraph': 'Florence Petty (1\xa0December 1870\xa0– 18\xa0November 1948) was a Scottish social worker, cookery writer and broadcaster. During the 1900s she undertook social work in the deprived area of Somers Town in North London, demonstrating for working-class women how to cook inexpensive and nutritious foods. Much of the instruction was done in their homes. She published cookery-related works aimed at those also involved in social work, and a cookery book and pamphlet aimed at the public. From 1914 until the mid-1940s she toured Britain giving lecture-demonstrations of cost-efficient and nutritious ways to cook, including dealing with food shortages during the First World War. In the late 1920s and early 1930s, she was a BBC broadcaster on food and budgeting. Petty worked until she was in her seventies. She is considered to be a pioneer of social work innovations. Her approach to teaching the use of cheap nutritious food was a pr

## Word embeddings

In [136]:
# Create an OpenAI client
# Read the key from the environment (never hard-code it) and hand it to the client
# explicitly; assigning openai.api_key after the client exists would not reach it.
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai.api_key)

In [137]:
# Create an embedding of the paragraph and title for the wiki_main article
# Join title and paragraph with a newline so the last word of the title is not
# glued to the first word of the paragraph.
response = client.embeddings.create(
    input=wiki_main['title'] + "\n" + wiki_main['paragraph'],
    model="text-embedding-ada-002"
)
wiki_main["embeddings"] = response.data[0].embedding

In [139]:
# Create an embedding of the paragraph and title for each article in wiki_list
# Send the texts in batches instead of one HTTP request per article.
EMBEDDING_BATCH_SIZE = 100
for start in range(0, len(wiki_list), EMBEDDING_BATCH_SIZE):
    batch = wiki_list[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        # Newline separator: keeps the title's last word apart from the paragraph's first
        input=[wiki['title'] + "\n" + wiki['paragraph'] for wiki in batch],
        model="text-embedding-ada-002"
    )
    # Each returned item carries the index of the input it was computed from
    for item in response.data:
        batch[item.index]["embeddings"] = item.embedding

In [141]:
# Score every article against the main one with the dot product of their embeddings.
# The value is stored under 'distance_embedding'.
main_embedding = wiki_main['embeddings']
for article in wiki_list:
    article['distance_embedding'] = np.dot(main_embedding, article['embeddings'])

In [161]:
# Order wiki_list in place from the largest to the smallest embedding score
def _embedding_score(article):
    """Return the 'distance_embedding' value stored on an article dict."""
    return article['distance_embedding']

wiki_list.sort(key=_embedding_score, reverse=True)

In [162]:
# Show the full URL, title and paragraph of the first five articles in wiki_list
for article in wiki_list[:5]:
    print(
        "https://en.wikipedia.org" + article['link'],
        article['title'],
        article['paragraph'],
        sep="\n",
    )

https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

https://en.wikipedia.org/wiki/Category:Machine_learning
Category:Machine learning
Machine learning is a branch of statistics and computer science which studies algorithms and architectures that learn from observed facts.

https://en.wikipedia.org/wiki/Automated_machine_learning
Automated machine learning
Automated machine learning (AutoML) is the process 

## Text Vectorization

In [144]:
# Fetch the NLTK stop-word corpus and build the bag-of-words vectorizer
nltk.download('stopwords')
stop_en = stopwords.words('english')
# Work on a copy so the list can be extended without touching stop_en
stop_words_ext = [*stop_en]
# Tokens: one ASCII letter followed by at least one letter, underscore or hyphen
vectorizer = CountVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z_-]+\b",
    stop_words=stop_words_ext,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [145]:
# Build the corpus: for every article, its title and then its paragraph,
# each added as a separate document
corpus = [
    text
    for article in wiki_list
    for text in (article['title'], article['paragraph'])
]
print(corpus[:5])

['Machine learning', 'Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]\n', 'Category:Machine learning', 'Machine learning is a branch of statistics and computer science which studies algorithms and architectures that learn from observed facts.\n', 'Automated machine learning']


In [146]:
# Learn the vocabulary from the corpus; the resulting count matrix is displayed as the cell output
corpus_term_counts = vectorizer.fit_transform(corpus)
corpus_term_counts

<1150x5329 sparse matrix of type '<class 'numpy.int64'>'
	with 19858 stored elements in Compressed Sparse Row format>

In [147]:
# Create the vector for the main article
# Separate title and paragraph with a newline: plain concatenation fuses the last
# title word and the first paragraph word into one token (e.g. "learningMachine")
# that the fitted vocabulary does not contain.
main_text = wiki_main['title'] + "\n" + wiki_main['paragraph']
wiki_main['vector'] = vectorizer.transform([main_text]).toarray()[0]

In [148]:
# Calculate the distance between the main article and each article in wiki_list using the dot product of their vectors
# Title and paragraph are joined with a newline so the words on either side of the
# boundary stay separate tokens.
texts = [wiki['title'] + "\n" + wiki['paragraph'] for wiki in wiki_list]
if texts:
    # Vectorize all articles in one call instead of one transform per article
    vectors = vectorizer.transform(texts).toarray()
    for wiki, vector in zip(wiki_list, vectors):
        wiki['vector'] = vector
        wiki['distance_vector'] = np.dot(wiki_main['vector'], vector)

In [159]:
# Reorder wiki_list (same list object) from the largest to the smallest 'distance_vector'
wiki_list[:] = sorted(
    wiki_list,
    key=lambda article: article['distance_vector'],
    reverse=True,
)

In [160]:
# Show the full URL, title and paragraph of the first five articles in wiki_list
for article in wiki_list[:5]:
    summary_lines = [
        "https://en.wikipedia.org" + article['link'],
        article['title'],
        article['paragraph'],
    ]
    print("\n".join(summary_lines))

https://en.wikipedia.org/wiki/Machine_learning
Machine learning
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can effectively generalize and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to large language models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5]

https://en.wikipedia.org/wiki/Online_machine_learning
Online machine learning
In computer science, online machine learning is a method of machine learning in which data becomes available in a sequential order and is used to update the best predictor for future data at each step, as opposed to batch learning techniques which generate the best predictor by 