In [2]:
from bs4 import BeautifulSoup as bs
import requests

In [3]:
# Search-results page of vitrinelinguistique.oqlf.gouv.qc.ca for the query
# "lésion", restricted by the two tx_solr[filter] parameters ("bdl" and "gdt").
url="https://vitrinelinguistique.oqlf.gouv.qc.ca/resultats-de-recherche?tx_solr%5Bfilter%5D%5B0%5D=type_stringM%3Abdl&tx_solr%5Bfilter%5D%5B1%5D=type_stringM%3Agdt&tx_solr%5Bq%5D=l%C3%A9sion"
# requests waits indefinitely unless a timeout is given, so bound the wait.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status rather than parsing an error page below.
response.raise_for_status()

In [4]:
from urllib.parse import unquote

# Percent-decode the search URL so the query term shows up literally ("lésion").
# NOTE: despite its name, `url_encoded` holds the DECODED form of `url`.
url_encoded = unquote(url)
url_encoded

'https://vitrinelinguistique.oqlf.gouv.qc.ca/resultats-de-recherche?tx_solr[filter][0]=type_stringM:bdl&tx_solr[filter][1]=type_stringM:gdt&tx_solr[q]=lésion'

In [27]:
# This does the same thing as the previous function
# requests.utils.unquote(url)

'https://vitrinelinguistique.oqlf.gouv.qc.ca/resultats-de-recherche?tx_solr[q]=lésion&tx_solr[filter][0]=type_stringM:bdl&tx_solr[filter][1]=type_stringM:gdt'

In [4]:
html = response.content

In [5]:
soup = bs(html, "lxml")

In [6]:
soup.title

<title>Résultats de recherche</title>

In [7]:
soup.title.getText()

'Résultats de recherche'

## Getting the number of definitions found

In [11]:
number_str = soup.find("div", class_ = "search-results__count").get_text()

In [12]:
number_str

'\n1715 résultat(s)\n'

In [18]:
import re

# The counter text looks like '\n1715 résultat(s)\n'; take its first run of digits.
digit_runs = re.findall(r'\d+', number_str)
numOfDomains = int(digit_runs[0])

numOfDomains

1715

## Getting the list of domains

In [9]:
domaines_def = soup.find_all("p", class_="result__domaines")

In [22]:
# Stripped text of every domain paragraph, in page order.
domains_name = [paragraph.get_text(strip=True) for paragraph in domaines_def]

In [23]:
import pandas as pd

# Wrap the labels in a Series so value_counts() can tally each distinct label.
# Combined labels such as "assurance|travail" are counted as their own value here.
domains_series = pd.Series(domains_name)
domains_series.value_counts()

médecine                            15
droit                                3
assurance                            2
assurance|sécurité                   2
botanique                            1
appellation de personne|médecine     1
assurance|travail                    1
dtype: int64

## Applying this method to all Terms

In [1]:
import json

# Load the full list of candidate terms. The encoding is passed explicitly
# because open() otherwise uses a platform-dependent default.
with open('allTerms.json', encoding='utf-8') as f:
    allTerms = json.load(f)

In [9]:
allTerms

['qg',
 'williams ka',
 'meta manipulative treatment',
 'actif',
 'alves nogueira d',
 'colonne',
 'manipulation de la colonne vertbrale',
 'ieee',
 'marche',
 'facet joint syndrome',
 'composante physique',
 'shearer hm',
 'suspicion',
 'prevalence',
 'stimulation',
 'exact',
 'liste',
 'ribeiro',
 'traitements anticonvulsivants',
 'patient des effets indsirables',
 'autorit de sant',
 'ulcre perfor',
 'fonctionnelles',
 'prreprise',
 'multimodale',
 'examen clinique du rachis',
 'tomography',
 'liste est non exhaustive',
 'orthopaedic',
 'lyon',
 'sanitarias',
 'protocole de la cochrane',
 'imagerie rachidienne',
 'enceintes',
 'curative',
 'interquartile',
 'Web',
 'mcanismes',
 'critre principal radiofrquence pulse',
 'ngatives',
 'corticodes par voie pidurale',
 'incluses',
 'ha',
 'poinsignon',
 'kennedy n',
 'activit normale',
 'opiodes forts',
 'r0320434htm',
 'almeida',
 'institut national de veille',
 'vise myorelaxante',
 'mouvements actifs spcifiques',
 'irritation des raci

In [10]:
len(allTerms)

7625

In [5]:
def scrap(url, term, timeout=30):
    """Return the domain labels listed on the search-results page for ``term``.

    Args:
        url: Search URL containing the literal placeholder query "lésion"
            (i.e. the percent-decoded URL); the placeholder is replaced by
            ``term``.
        term: Term to look up. It is URL-quoted, with spaces sent as "+".
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list with one entry per domain found in the
        ``<p class="result__domaines">`` elements. Labels joined with "|" are
        split into separate entries; duplicates are kept.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # Local import so this cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    # Build the request from the `url` argument rather than the notebook-level
    # `url_encoded`, so the function honours its own signature.
    search_url = url.replace("lésion", quote_plus(term))
    response = requests.get(search_url, timeout=timeout)
    # NOTE: the HTTP status is not checked here; the body is parsed whatever
    # the status code is.
    soup = bs(response.content, "lxml")

    domains_names = []
    for domaine in soup.find_all("p", class_="result__domaines"):
        # split() returns the whole text as a single item when there is no "|".
        domains_names.extend(domaine.get_text(strip=True).split("|"))

    return domains_names

Here we ran into the problem of temporary IP blocks: our IP address got blocked while scraping.

We used these two sources to try solving this problem:

https://www.scraperapi.com/blog/5-tips-for-web-scraping/  
https://medium.com/swlh/improve-your-web-scraper-with-limited-retry-loops-python-35e21730cbf5

In [28]:
import time
from random import randrange

# Domain labels that mark a term as medical / health related.
MEDICAL_DOMAINS = {'médecine', 'pharmacologie', 'biologie', 'entraînement physique', 'psychologie', 'chimie', 'sport'}
# Give up on a term after this many failed requests instead of retrying forever.
MAX_RETRIES = 5

medical_exist = []
failed_terms = []  # terms that could not be fetched; re-run these later
for i, term in enumerate(allTerms[6000:7000]):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Fetch once per term: with scrap() inside the any(...) generator,
            # a new HTTP request was issued for every domain being tested.
            domains = scrap(url_encoded, term)
        except requests.exceptions.RequestException as e:
            random_sleep_except = randrange(240, 360)
            print("I've encountered an error ({}) on attempt {}/{}! I'll pause for {:.1f} minutes and try again \n".format(
                e, attempt, MAX_RETRIES, random_sleep_except / 60))
            time.sleep(random_sleep_except)
            continue
        if MEDICAL_DOMAINS.intersection(domains):
            medical_exist.append(term)
        print(i, " ", term)
        # Random delay between requests to lower the risk of an IP block.
        time.sleep(randrange(10, 25))
        break
    else:
        # All attempts raised: record the term and move on to the next one.
        failed_terms.append(term)
        print(i, " ", term, "-> skipped after", MAX_RETRIES, "failed attempts")

0   outcome risk
1   multidisciplinaires
2   mcauley jh
3   ornelas j
4   costa t
5   maher
6   miake
7   jt
8   bienfaits
9   niveau de la re
10   cupping 2018
11   possibilits
12   wong
13   charge du patient lombalgique
14   analyse de dcision
15   weng
16   dlai de consultation
17   of science
18   insidieux
19   scientifiques
20   promoteur haute
21   relle
22   douleur radiculaire
23   cismef cmainfobase
24   layouni
25   socio
26   sfetd
27   sihawong
28   spinale
29   magazines top
30   chronique non moteur
31   muoz
32   doiton
33   interdiction
34   sciences
35   kallewaard jw
36   anesth
37   salix
38   ozone therapy
39   dbut
40   musculoskelet sci pract
41   contenu
42   lger
43   impairments
44   analgsiques non opiodes
45   patients
46   retard
47   biothrapie
48   patiente
49   vaste
50   cage toward thoracique
51   sommaire
52   institut national de recherche
53   antiinflammatoires
54   conventionnelle
55   amrique latine
56   ralistes
57   ncb
58   pain detect
59   v

In [None]:
scrap(url_encoded, "infiltrations pidurales de of science")

In [32]:

len(medical_exist)

292

In [2]:
import json


def _read_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# One variable per saved file (presumably one file per scraping batch).
medicalTerms_1 = _read_json('medicalTerms_1.json')
medicalTerms_2 = _read_json('medicalTerms_2.json')
medicalTerms_3 = _read_json('medicalTerms_3.json')
medicalTerms_4 = _read_json('medicalTerms_4.json')
medicalTerms_5 = _read_json('medicalTerms_5.json')
medicalTerms_6 = _read_json('medicalTerms_6.json')
medicalTerms_7 = _read_json('medicalTerms_7.json')
medicalTerms_8 = _read_json('medicalTerms_8.json')

In [3]:
medicalTerms = medicalTerms_1 + medicalTerms_2 + medicalTerms_3 + medicalTerms_4 + medicalTerms_5 + medicalTerms_6 + medicalTerms_7 + medicalTerms_8

In [6]:
# Persist the merged list as a single JSON file.
with open('medicalTerms.json', 'w', encoding='utf-8') as out_file:
    json.dump(medicalTerms, out_file)

In [3]:
# Reload the merged list, using the same encoding the file was written with.
with open('medicalTerms.json', encoding='utf-8') as f:
    medicalTerms = json.load(f)

In [4]:
len(medicalTerms)

3629

In [7]:
import pandas as pd

# Put the medical terms next to an equally long slice of allTerms. The slice
# length comes from medicalTerms instead of a hard-coded 3629, because pandas
# raises ValueError when the two columns differ in length.
df = pd.DataFrame({'Medical Terms': medicalTerms, 'All Terms': allTerms[:len(medicalTerms)]})

df.to_excel("medicalTerms VS AllTerms.xlsx")