# Imports


In [None]:
import pandas as pd

In [None]:
import json

# Open the JSON file.
# Note: inside Jupyter the path needs the leading ../, unlike in the .py file.
file_path = '../data/pubmed.json'
# JSON text is UTF-8; passing the encoding explicitly keeps the decoding
# independent of the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as f:
    try:
        data = json.load(f)
    except json.JSONDecodeError as e:
        # `data` is left undefined when the file is not valid JSON.
        print(f"Erreur lors du chargement du JSON: {e}")
    else:
        print("JSON chargé avec succès!")
        print(data)
    
# print(data)

In [None]:
import json

def load_data(loc_drugs='data/drugs.csv',
              loc_clinicalTrials='data/clinical_trials.csv',
              loc_pubmedCSV='data/pubmed.csv',
              loc_pubmedJSON='data/pubmed.json'):
    """Load the drugs, clinical-trials and PubMed inputs as DataFrames.

    The PubMed data is split across a CSV file and a JSON file; both parts
    are concatenated (CSV rows first) into a single frame with a fresh
    0..n-1 index.

    Args:
        loc_drugs: Path of the drugs CSV file.
        loc_clinicalTrials: Path of the clinical-trials CSV file.
        loc_pubmedCSV: Path of the PubMed CSV file.
        loc_pubmedJSON: Path of the PubMed JSON file.

    Returns:
        A tuple ``(df_drugs, df_clinical_trials, df_pubmeds)``.

    Raises:
        json.JSONDecodeError: If the PubMed JSON file is not valid JSON.
    """
    df_drugs = pd.read_csv(loc_drugs)
    df_clinical_trials = pd.read_csv(loc_clinicalTrials)
    df_pubmed_csv = pd.read_csv(loc_pubmedCSV)

    # read_csv decodes as UTF-8 by default; read the JSON the same way
    # instead of relying on the locale-dependent default of open().
    with open(loc_pubmedJSON, 'r', encoding='utf-8') as f:
        df_pubmed_json = pd.DataFrame(json.load(f))

    df_pubmeds = pd.concat([df_pubmed_csv, df_pubmed_json], ignore_index=True)

    return df_drugs, df_clinical_trials, df_pubmeds

# Load the three tables; the notebook needs the ../ prefix on every data path.
df_drugs, df_clinical_trials, df_pubmeds = load_data(
    loc_drugs='../data/drugs.csv',
    loc_clinicalTrials='../data/clinical_trials.csv',
    loc_pubmedCSV='../data/pubmed.csv',
    loc_pubmedJSON='../data/pubmed.json',
)

In [None]:
# Display the loaded drugs table as the cell output.
df_drugs

In [None]:
# Display the loaded clinical-trials table as the cell output.
df_clinical_trials

In [None]:
# Display the combined PubMed table as the cell output.
df_pubmeds

In [None]:
def _parse_dates(values):
    """Parse each entry of *values* on its own; unparseable entries become NaT."""
    # The date columns mix several formats. Parsing the whole column at once
    # lets pandas infer one format from the first entry and coerce every
    # entry written differently to NaT, so each value is parsed separately.
    parsed = values.map(lambda value: pd.to_datetime(value, errors='coerce'))
    # Second pass only guarantees a datetime dtype (e.g. for an empty column).
    return pd.to_datetime(parsed, errors='coerce')


def clean_data(df_drugs, df_clinical_trials, df_pubmed):
    """Return cleaned copies of the three input tables.

    The input frames are left untouched; callers must use the returned frames.

    Args:
        df_drugs: Drugs table. Rows containing any null value are dropped.
        df_clinical_trials: Clinical-trials table with the columns
            ``scientific_title``, ``journal`` and ``date``. Rows with a null
            in one of these columns, or whose title contains no ASCII letter,
            are dropped.
        df_pubmed: PubMed table with the columns ``title``, ``journal`` and
            ``date``. Rows with a null in one of these columns are dropped.

    Returns:
        A tuple ``(df_drugs, df_clinical_trials, df_pubmed)`` of cleaned
        frames. In the last two, ``date`` is converted to datetime, with NaT
        for values that cannot be parsed.
    """
    # Clinical trials: required fields present and a title with real text.
    trials = df_clinical_trials.dropna(subset=['scientific_title', 'journal', 'date'])
    has_letters = trials['scientific_title'].astype(str).str.contains(r'[a-zA-Z]')
    # Explicit copy so the column assignment below never targets a slice.
    trials = trials[has_letters].copy()
    trials['date'] = _parse_dates(trials['date'])

    # PubMed: required fields present.
    pubmed = df_pubmed.dropna(subset=['title', 'journal', 'date']).copy()
    pubmed['date'] = _parse_dates(pubmed['date'])

    # Drugs: every field is required.
    drugs = df_drugs.dropna()

    return drugs, trials, pubmed

In [None]:
# Clean the three tables, then report for each one whether any null value remains.
df_drugs, df_clinical_trials, df_pubmeds = clean_data(df_drugs, df_clinical_trials, df_pubmeds)

if df_drugs.isna().values.any():
    print('df_drugs contient des valuers nulles')
else:
    print('df_drugs ne contient pas de valuers nulles')

if df_clinical_trials.isna().values.any():
    print('df_clinical_trials contient des valuers nulles')
else:
    print('df_clinical_trials ne contient pas de valuers nulles')

if df_pubmeds.isna().values.any():
    print('df_pubmeds contient des valuers nulles')
else:
    print('df_pubmeds ne contient pas de valuers nulles')

In [None]:
# Display the drugs table after cleaning.
df_drugs

In [None]:
# Display the clinical-trials table after cleaning.
df_clinical_trials

In [None]:
# Keep only the trials whose title contains at least one ASCII letter, then display them.
filtered_df = df_clinical_trials.loc[
    df_clinical_trials["scientific_title"].astype(str).str.contains(r'[a-zA-Z]')
]
filtered_df

In [None]:
# Display the PubMed table after cleaning.
df_pubmeds

In [None]:
import pandas as pd
# Small hand-written fixtures for trying out the transformation functions.

# Two drugs with their ATC codes.
df_d = pd.DataFrame(
    {
        'atccode': ['A04AD', 'S03AA'],
        'drug': ['DIPHENHYDRAMINE', 'TETRACYCLINE'],
    }
)

# Two clinical trials whose titles both mention diphenhydramine.
df_ct = pd.DataFrame(
    {
        'id': ['NCT01967433', 'NCT04189588'],
        'scientific_title': [
            'utilisation de Diphenhydramine',
            'Hydrochloride  vs Diphenhydramine',
        ],
        'date': ['2020-01-01', '2020-01-01'],
        'journal': ['Journal 1', 'Journal 2'],
    }
)

# Two PubMed articles whose titles both mention diphenhydramine.
df_p = pd.DataFrame(
    {
        'id': [1, 2],
        'title': [
            'qu est ce-que la diphenhydramine',
            'l evaluation de la diphenhydramine',
        ],
        'date': ['01/01/2019', '01/01/2019'],
        'journal': ['Journal 3', 'Journal 4'],
    }
)

In [None]:
import re

def find_mentions(title, drugs_df):
    """Return the drugs of *drugs_df* that are mentioned in *title*.

    A drug counts as mentioned when its name appears in the title as a whole
    word, ignoring case.

    Args:
        title: Text to search. A non-string value (None, NaN) yields an empty
            list instead of raising.
        drugs_df: DataFrame with a ``drug`` column holding the drug names.
            Non-string entries in that column are skipped.

    Returns:
        The matching drug names, spelled as in ``drugs_df`` and in its row
        order.
    """
    if not isinstance(title, str):
        return []
    return [
        drug
        for drug in drugs_df['drug']
        if isinstance(drug, str)
        # re.escape keeps punctuation in a drug name from acting as regex syntax.
        and re.search(r'\b' + re.escape(drug) + r'\b', title, re.IGNORECASE)
    ]

def transform_data(drugs_df, clinical_trials_df, pubmed_df):
    """Build the list of drug mentions found in trial and article titles.

    Clinical trials are scanned first (column ``scientific_title``), then the
    PubMed articles (column ``title``). Every drug that ``find_mentions``
    reports for a title produces one dict with the keys ``drug``, ``source``
    (``'clinical_trials'`` or ``'pubmed'``), ``title``, ``date`` and
    ``journal``.

    Returns:
        The list of mention dicts, in scan order.
    """
    # (source label, table, name of the column holding the title)
    sources = (
        ('clinical_trials', clinical_trials_df, 'scientific_title'),
        ('pubmed', pubmed_df, 'title'),
    )

    records = []
    for source_name, frame, title_column in sources:
        for _, entry in frame.iterrows():
            heading = entry[title_column]
            records.extend(
                {
                    'drug': drug_name,
                    'source': source_name,
                    'title': heading,
                    'date': entry['date'],
                    'journal': entry['journal'],
                }
                for drug_name in find_mentions(heading, drugs_df)
            )

    return records


In [None]:
# Manual check: the title ends with the drug name in lowercase, so the printed
# list is expected to contain DIPHENHYDRAMINE.
title = "A 44-year-old man with erythema of the face diphenhydramine"
mentions = find_mentions(title, df_d)
print('DIPHENHYDRAMINE', mentions, "DIPHENHYDRAMINE should be found in the title")


In [None]:
# Run the transformation on the three sample frames and print the returned mentions.
print(transform_data(df_d, df_ct, df_p))

In [None]:
from collections import Counter

def journal_with_most_mentions(data):
    """Return the journal that mentions the largest number of different drugs.

    Each drug is counted once per journal, however many titles or sources
    mention it there, so a journal that repeats one drug does not outrank a
    journal that covers several.

    Args:
        data: Iterable of mention dicts with at least the keys ``journal``
            and ``drug``.

    Returns:
        The journal name, or ``None`` when *data* is empty. When several
        journals tie, the one encountered first in *data* wins.
    """
    drugs_by_journal = {}
    for mention in data:
        drugs_by_journal.setdefault(mention['journal'], set()).add(mention['drug'])

    if not drugs_by_journal:
        return None
    # max() keeps the first maximum, and dicts iterate in insertion order.
    return max(drugs_by_journal, key=lambda journal: len(drugs_by_journal[journal]))

In [None]:
# Read the mentions produced by the pipeline.
with open('../output/result.json', 'r') as f:
    data = json.load(f)

# Extract the name of the journal that mentions the most different drugs.
most_mentioned_journal = journal_with_most_mentions(data)
print("Journal with most mentions:", most_mentioned_journal)

In [None]:
def find_related_drugs(medication, data):
    """Return the drugs that share a PubMed journal with *medication*.

    Only mentions whose ``source`` is ``'pubmed'`` are considered, both for
    locating the journals of *medication* and for collecting the other drugs.

    Args:
        medication: Drug name to look up, spelled as in the ``drug`` field.
        data: Iterable of mention dicts with the keys ``drug``, ``source``
            and ``journal``. Journal values must be hashable.

    Returns:
        A set with the other drugs mentioned by those journals; empty when
        *medication* has no PubMed mention.
    """
    pubmed_mentions = [mention for mention in data if mention['source'] == 'pubmed']

    # Collect the journals once instead of rescanning every mention for each
    # occurrence of the medication.
    journals = {
        mention['journal']
        for mention in pubmed_mentions
        if mention['drug'] == medication
    }

    return {
        mention['drug']
        for mention in pubmed_mentions
        if mention['journal'] in journals and mention['drug'] != medication
    }

In [None]:
# Read the pipeline output and list the drugs related to ATROPINE.
with open('../output/result.json', 'r') as f:
    data = json.load(f)

related_drugs = find_related_drugs('ATROPINE', data)
print("Related drugs for ATROPINE:", related_drugs)

In [None]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame(
    {
        'id': [1, 1, 2, None],
        'val1': [10, None, 20, None],
        'val2': [None, 30, None, 40],
        'val3': ['oui', None, 'non', 'non'],
    }
)
print(df)

# Fill each NaN with a non-NaN value from the rows of the same id group
# (forward fill first, then backward fill).
df_filled = df.groupby('id', dropna=False).transform(
    lambda column: column.ffill().bfill()
)

# Keep a single row per group with a non-NaN id, taking the value columns
# from the filled frame.
result = (
    df.drop_duplicates('id')
    .dropna(subset=['id'])
    .assign(**{name: df_filled[name] for name in ('val1', 'val2', 'val3')})
)

print(result)


In [None]:
import pandas as pd

# Exemple de DataFrame
df = pd.DataFrame(
    {
        'id': [1, 1, 2, None],
        'val1': [10, None, 20, None],
        'val2': [None, 30, None, 40],
        'val3': ['oui', None, 'non', 'non'],
    }
)
print(df)

# Compare three aggregations of val2/val3 per (id, val1) group.
for how in ('first', 'last', 'sum'):
    print(df.groupby(['id', 'val1'], as_index=False).agg({'val2': how, 'val3': how}))

In [None]:
# Print the GroupBy object itself; no aggregation is applied here.
print(df.groupby(['id','val1'], as_index=False))

In [None]:
import pandas as pd

# Exemple de DataFrame
df = pd.DataFrame(
    {
        'id': [1, 1, 2, None],
        'val1': [10, None, 20, None],
        'val2': [None, 30, None, 40],
        'val3': ['oui', None, 'non', 'non'],
    }
)


In [None]:
# Concatenate the val3 strings of each id group. Missing entries are dropped
# first because str.join raises TypeError on a None/NaN element.
df = df.groupby('id').agg({'val3': lambda values: ''.join(values.dropna())}).reset_index()

In [1]:
import pandas as pd
# Reload the raw clinical-trials CSV and display it as the cell output.
df_clinical_trials = pd.read_csv('../data/clinical_trials.csv')
df_clinical_trials


Unnamed: 0,id,scientific_title,date,journal
0,NCT01967433,Use of Diphenhydramine as an Adjunctive Sedati...,1 January 2020,Journal of emergency nursing
1,NCT04189588,Phase 2 Study IV QUZYTTIR™ (Cetirizine Hydroch...,1 January 2020,Journal of emergency nursing
2,NCT04237090,,1 January 2020,Journal of emergency nursing
3,NCT04237091,Feasibility of a Randomized Controlled Clinica...,1 January 2020,Journal of emergency nursing
4,NCT04153396,Preemptive Infiltration With Betamethasone and...,1 January 2020,Hôpitaux Universitaires de Genève
5,NCT03490942,Glucagon Infusion in T1D Patients With Recurre...,25/05/2020,
6,,Glucagon Infusion in T1D Patients With Recurre...,25/05/2020,Journal of emergency nursing
7,NCT04188184,Tranexamic Acid Versus Epinephrine During Expl...,27 April 2020,Journal of emergency nursing\xc3\x28


In [2]:
import numpy as np
import math

def _fill_from(incomplete, candidate):
    """Fill the gaps of *incomplete* with the values of *candidate*.

    Both arguments are lists of cell values in the same column order. The
    rows are treated as describing the same record when they agree on at
    least one column and never disagree where both hold a value.

    Returns:
        The merged list of values, or ``None`` when the rows conflict, share
        no value, or *candidate* cannot fill any gap.
    """
    merged = []
    agreements = 0
    gaps_filled = 0
    for own, other in zip(incomplete, candidate):
        if pd.isna(own):
            merged.append(other)
            if not pd.isna(other):
                gaps_filled += 1
        elif pd.isna(other):
            merged.append(own)
        elif own == other:
            merged.append(own)
            agreements += 1
        else:
            return None
    return merged if agreements and gaps_filled else None


# For every row with a missing value, look for the first later row that
# describes the same record and use it to fill the gaps column by column.
# NOTE: rows are addressed with .loc[i], so the frame must have the default
# 0..n-1 index (as it does right after read_csv).
completed_rows = []
row_count = len(df_clinical_trials)
for i in range(row_count):
    current = df_clinical_trials.loc[i]
    if not current.isnull().any():
        continue
    current_values = current.to_list()
    for j in range(i + 1, row_count):
        merged = _fill_from(current_values, df_clinical_trials.loc[j].to_list())
        if merged is not None:
            completed_rows.append(merged)
            break

# Append the completed rows at the end of the frame; the partial originals are kept.
for row in completed_rows:
    df_clinical_trials.loc[len(df_clinical_trials)] = row

df_clinical_trials = df_clinical_trials.drop_duplicates()

print(df_clinical_trials)
    

            id                                   scientific_title  \
0  NCT01967433  Use of Diphenhydramine as an Adjunctive Sedati...   
1  NCT04189588  Phase 2 Study IV QUZYTTIR™ (Cetirizine Hydroch...   
2  NCT04237090                                                      
3  NCT04237091  Feasibility of a Randomized Controlled Clinica...   
4  NCT04153396  Preemptive Infiltration With Betamethasone and...   
5  NCT03490942  Glucagon Infusion in T1D Patients With Recurre...   
6          NaN  Glucagon Infusion in T1D Patients With Recurre...   
7  NCT04188184  Tranexamic Acid Versus Epinephrine During Expl...   
8  NCT03490942  Glucagon Infusion in T1D Patients With Recurre...   

             date                               journal  
0  1 January 2020          Journal of emergency nursing  
1  1 January 2020          Journal of emergency nursing  
2  1 January 2020          Journal of emergency nursing  
3  1 January 2020          Journal of emergency nursing  
4  1 January 2020  

In [1]:
# Show the type of a string literal as the cell output.
type('lquf')

str