# Scraping _OPEN MEDICAMENTS_ (via API)

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import string
import re
import itertools
from decimal import Decimal

In [2]:
regex = re.compile('\d+')
regex_nom = re.compile('[A-Z ]+')
regex_dose = re.compile('\d+,?\d* ?m?[g|l]\/?m?l?')
regex_ans = re.compile('(\d+) ans')
regex_kg = re.compile('(\d+) kg')

In [3]:
def get_info(codeCIS):
    url = 'https://www.open-medicaments.fr/api/v1/medicaments/' + codeCIS
    result = requests.get(url)
    response_result = json.loads(result.text)
    
    try:
        libelle = response_result['presentations'][0]['libelle']
        quantite = int(regex.findall(libelle)[0])
    except:
        quantite = np.nan
        
    try:
        date = response_result['presentations'][0]['dateDeclarationCommercialisation']
    except:
        date = ''

    try:
        denomination = response_result['denomination']
    except:
        denomination = ''
        
    try:
        nom = regex_nom.findall(denomination)[0]
        etiquette = denomination.replace(nom + ',', '').replace(nom, '')
    except:
        nom = ''
        
    try:
        nom = regex_nom.findall(denomination)[0]
    except:
        nom = ''
    
    try:
        dose_unite = regex_dose.findall(denomination)
        unite = dose_unite[0].split(' ')[1]
        dose = Decimal(dose_unite[0].split(' ')[0].replace(',', '.'))
        if unite == 'g': dose *= 1000
    except:
        dose = np.nan
    
    try:
        indication = response_result['indicationsTherapeutiques']
        indic_ans = regex_ans.findall(indication)
        indic_ans = np.nan if indic_ans == [] else indic_ans[0]
        indic_kg = regex_kg.findall(indication)
        indic_kg = np.nan if indic_kg == [] else indic_kg[0]
    except:
        indic_ans = np.nan
        indic_kg = np.nan
    
    return pd.DataFrame([[codeCIS,
                          nom,
                          etiquette,
                          dose,
                          quantite,
                          pd.to_datetime(date),
                          indic_ans,
                          indic_kg]],
                        columns=col)

In [4]:
def get_codesCIS():
    codesCIS = []
    for letter in string.ascii_lowercase:
        search_url =  "https://www.open-medicaments.fr/api/v1/medicaments?query=" + letter
        result_search = requests.get(search_url)
        response = json.loads(result_search.text)
        codesCIS.append(pd.DataFrame.from_dict(response).codeCIS.values.tolist())

    return list(itertools.chain.from_iterable(codesCIS))

In [5]:
def get_infos_medicaments():
    infos = pd.DataFrame(columns=col)
    for medicament in get_codesCIS():
        infos = infos.append(get_info(medicament), ignore_index=True)
    infos.set_index('codeCIS', inplace=True)
    
    return infos

In [6]:
col = ["codeCIS", "nom", "etiquette", "dose", "quantite", "date", "indic_age_min", "indic_kg_min"]

results = get_infos_medicaments()

In [7]:
results

Unnamed: 0_level_0,nom,etiquette,dose,quantite,date,indic_age_min,indic_kg_min
codeCIS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
66862795,ALPHACAINE N,solution injectable à usage dentaire,,1,1981-01-01,,
63092688,ACECLOFENAC MYLAN,"100 mg, comprimé pelliculé",100,30,2008-04-08,,
62887979,ACETYLLEUCINE BIOGARAN,"500 mg, comprimé",500,30,2012-02-01,,
65319558,ACECLOFENAC QUALIMED,"100 mg, comprimé pelliculé",100,30,2012-07-17,,
69771614,ACARBOSE SANDOZ,"50 mg, comprimé",50,90,2015-08-21,,
60946941,ACARBOSE MYLAN,"100 mg, comprimé sécable",100,90,2014-02-26,,
65999182,ADAPALENE TEVA,"0,1 POUR CENT, gel",,30,2013-07-17,,
61683272,ACICLOVIR ALMUS,"200 mg, comprimé",200,25,2015-02-05,,
64728712,ABUFENE,"400 mg, comprimé",400,60,2013-03-27,,
68933636,ACTIVIR,"5 POUR CENT, crème",,1,1997-01-19,,


In [8]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 520 entries, 66862795 to 63083116
Data columns (total 7 columns):
nom              520 non-null object
etiquette        520 non-null object
dose             339 non-null object
quantite         493 non-null object
date             493 non-null datetime64[ns]
indic_age_min    59 non-null object
indic_kg_min     6 non-null object
dtypes: datetime64[ns](1), object(6)
memory usage: 32.5+ KB


In [10]:
results.to_csv('/Users/antoinehirtz/Documents/openmedicaments.csv', encoding='utf-8-sig')