# **Making Dataset A and H**

In [7]:
import pandas as pd

# Specify encoding as ISO-8859-1 (Latin-1) or Windows-1252
dfa = pd.read_csv("/content/Classe_A_per_principio_attivo_15-10-2024.csv",  delimiter=';', encoding='ISO-8859-1')  # or encoding='latin1'
dfh = pd.read_csv("/content/Classe_H_per_principio_attivo_15-10-2024.csv",  delimiter=';', encoding='ISO-8859-1')  # or encoding='latin1'
dfh = dfh.rename(columns={'Codice \nAIC': 'AIC'})

dfa['Class'] = 'A'
dfh['Class'] = 'H'

columns_to_keep = ['Principio Attivo', 'Descrizione Gruppo', 'Denominazione e Confezione', 'Titolare AIC', 'AIC', 'Codice Gruppo Equivalenza', 'Class']

# Concatenate the two dataframes
df_merged = pd.concat([dfa, dfh], ignore_index=True)

df_merged = df_merged[columns_to_keep]


# Check for duplicate rows
duplicates = df_merged[df_merged.duplicated()]
print("Duplicate rows:")
print(duplicates)

# Check for missing values (NaN) in any column
missing_values = df_merged[df_merged.isnull().any(axis=1)]
print("\nRows with missing values:")
print(missing_values)

# You can also check for missing values per column
print("\nMissing values per column:")
print(df_merged.isnull().sum())


# prompt: add these columns to dataset in order (4.1 Indicazioni terapeutiche, 4.2 Posologia e modo di somministrazione, 4.3 Contraindications, 4.4 Special warnings and precautions for use, 4.5 Interactions with other medicinal products, 4.6 Fertility, pregnancy and lactation, 4.7 Effects on ability to drive and use machines, 4.8 Undesirable effects (side effects), 4.9 Overdose, 6.2 Incompatibilities, )

# Add new columns with default empty values
new_columns = [
    '4.1 Indicazioni terapeutiche',
    '4.2 Posologia e modo di somministrazione',
    '4.3 Contraindications',
    '4.4 Special warnings and precautions for use',
    '4.5 Interactions with other medicinal products',
    '4.6 Fertility, pregnancy and lactation',
    '4.7 Effects on ability to drive and use machines',
    '4.8 Undesirable effects (side effects)',
    '4.9 Overdose',
    '6.2 Incompatibilities'
]

for col in new_columns:
    df_merged[col] = ''

df_merged['URL'] = ''

df_merged.to_csv('filtered_dataset_A_H.csv', index=False)

Duplicate rows:
Empty DataFrame
Columns: [Principio Attivo, Descrizione Gruppo, Denominazione e Confezione, Titolare AIC, AIC, Codice Gruppo Equivalenza, Class]
Index: []

Rows with missing values:
Empty DataFrame
Columns: [Principio Attivo, Descrizione Gruppo, Denominazione e Confezione, Titolare AIC, AIC, Codice Gruppo Equivalenza, Class]
Index: []

Missing values per column:
Principio Attivo              0
Descrizione Gruppo            0
Denominazione e Confezione    0
Titolare AIC                  0
AIC                           0
Codice Gruppo Equivalenza     0
Class                         0
dtype: int64


# **Making Dataset A**

In [None]:
import pandas as pd

# Specify encoding as ISO-8859-1 (Latin-1) or Windows-1252
dfa = pd.read_csv("/content/Classe_A_per_principio_attivo_15-10-2024.csv",  delimiter=';', encoding='ISO-8859-1')  # or encoding='latin1'

# prompt: i need rows from dfa from 1800 to 2131
dfa = dfa.iloc[1798:2130]

dfa['Class'] = 'A'

columns_to_keep = ['Principio Attivo', 'Descrizione Gruppo', 'Denominazione e Confezione', 'Titolare AIC', 'AIC', 'Codice Gruppo Equivalenza', 'Class']

# Create a new DataFrame with only the specified columns
dfa = dfa[columns_to_keep]

# Check for duplicate rows
duplicates = dfa[dfa.duplicated()]
print("Duplicate rows:")
print(duplicates)

# Check for missing values (NaN) in any column
missing_values = dfa[dfa.isnull().any(axis=1)]
print("\nRows with missing values:")
print(missing_values)

# You can also check for missing values per column
print("\nMissing values per column:")
print(dfa.isnull().sum())


# prompt: add these columns to dataset in order (4.1 Indicazioni terapeutiche, 4.2 Posologia e modo di somministrazione, 4.3 Contraindications, 4.4 Special warnings and precautions for use, 4.5 Interactions with other medicinal products, 4.6 Fertility, pregnancy and lactation, 4.7 Effects on ability to drive and use machines, 4.8 Undesirable effects (side effects), 4.9 Overdose, 6.2 Incompatibilities, )

# Add new columns with default empty values
new_columns = [
    '4.1 Indicazioni terapeutiche',
    '4.2 Posologia e modo di somministrazione',
    '4.3 Contraindications',
    '4.4 Special warnings and precautions for use',
    '4.5 Interactions with other medicinal products',
    '4.6 Fertility, pregnancy and lactation',
    '4.7 Effects on ability to drive and use machines',
    '4.8 Undesirable effects (side effects)',
    '4.9 Overdose',
    '6.2 Incompatibilities'
]

for col in new_columns:
    dfa[col] = ''

dfa['URL'] = ''

dfa.to_csv('filtered_dataset_A.csv', index=False)


(332, 7)

# **Extracting PDFs**

In [None]:
import requests
!pip install pypdf
import pandas as pd
import io
import re
import csv
from pypdf import PdfReader
import unicodedata

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept": "application/json"
}

def get_codice_sis_and_atc(aic_code):
    search_url = f"https://api.aifa.gov.it/aifa-bdf-eif-be/1.0.0/formadosaggio/ricerca?query={aic_code}&spellingCorrection=true&page=0"
    try:
        response = requests.get(search_url, headers=HEADERS)
        if "application/json" not in response.headers.get("Content-Type", ""):
            print(f"Non-JSON response for AIC {aic_code}")
            return None, None
        data = response.json()
        content = data["data"]["content"][0]
        medicinale = content.get("medicinale", {})
        codice_sis = medicinale.get("codiceSis", None)
        codice_atc_list = content.get("codiceAtc", [])
        codice_atc = codice_atc_list[0] if codice_atc_list else None
        return codice_sis, codice_atc
    except Exception as e:
        print(f"Error processing AIC {aic_code}: {e}")
        return None, None

df = pd.read_csv('/content/filtered_A_H.csv')
df['AIC'] = df['AIC'].astype(str).str.zfill(9)
df[['codiceSis', 'codiceAtc']] = df['AIC'].apply(lambda x: pd.Series(get_codice_sis_and_atc(x)))
df['codiceSis'] = df['codiceSis'].fillna('Not available')
df['codiceAtc'] = df.apply(lambda row: 'Not available' if row['codiceSis'] == 'Not available' else row['codiceAtc'], axis=1)
df['AIC_trimmed'] = df['AIC'].str[:5]

def clean_codiceSis(code):
    try:
        return str(int(float(code)))
    except:
        return str(code)

df['codiceSis_clean'] = df['codiceSis'].apply(clean_codiceSis)
df['URL'] = (
    "https://api.aifa.gov.it/aifa-bdf-eif-be/1.0.0/organizzazione/"
    + df['codiceSis_clean']
    + "/farmaci/"
    + df['AIC_trimmed']
    + "/stampati?ts=RCP"
)

df['URL'] = df.apply(lambda row: 'Not available' if row['codiceSis'] == 'Not available' else row['URL'], axis=1)
df = df.drop(columns=['codiceSis_clean', 'AIC_trimmed'])
df.to_csv('filtered_dataset_A_last.csv', index=False)
print(f"Saved dataset with URLs, total rows: {len(df)}")

df = pd.read_csv('filtered_dataset_A_last.csv')

SECTION_FIRST_LETTERS = {
    "4\\.1": "i",
    "4\\.2": "p",
    "4\\.3": "c",
    "4\\.4": "a",
    "4\\.5": "i",
    "4\\.6": "f",
    "4\\.7": "e",
    "4\\.8": "e",
    "4\\.9": "s",
    "5\\.1": "p",
    "6\\.2": "i",
    "6\\.3": "p",
}

def find_header_span(full_text: str, section_number: str):
    first_letter = SECTION_FIRST_LETTERS.get(section_number.lower(), ".")
    pattern = rf"^ *{section_number} +[{first_letter}{first_letter.upper()}][^\n]*"
    regex = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
    m = regex.search(full_text)
    if m:
        return m.start(), m.end()
    else:
        return -1, -1

def extract_section(full_text: str, start_section: str, end_section: str = None) -> str:
    start_span = find_header_span(full_text, start_section)
    if start_span[0] == -1:
        return None
    header_line_end = full_text.find("\n", start_span[1])
    if header_line_end == -1:
        header_line_end = start_span[1]
    slice_start = header_line_end + 1

    if end_section is None:
        return None

    end_span = find_header_span(full_text, end_section)
    if end_span[0] == -1:
        return None

    slice_end = end_span[0]
    return full_text[slice_start:slice_end]

sections = {
    "4.1 Indicazioni terapeutiche": ("4\\.1", "4\\.2"),
    "4.2 Posologia e modo di somministrazione": ("4\\.2", "4\\.3"),
    "4.3 Contraindications": ("4\\.3", "4\\.4"),
    "4.4 Special warnings and precautions for use": ("4\\.4", "4\\.5"),
    "4.5 Interactions with other medicinal products": ("4\\.5", "4\\.6"),
    "4.6 Fertility, pregnancy and lactation": ("4\\.6", "4\\.7"),
    "4.7 Effects on ability to drive and use machines": ("4\\.7", "4\\.8"),
    "4.8 Undesirable effects (side effects)": ("4\\.8", "4\\.9"),
    "4.9 Overdose": ("4\\.9", "5\\.1"),
    "6.2 Incompatibilities": ("6\\.2", "6\\.3"),
}

invalid_pdf_indices = []

for idx, row in df.iterrows():
    codiceSis_raw = row['codiceSis']
    AIC = row['AIC']

    try:
        codiceSis = str(int(float(codiceSis_raw)))
    except Exception as e:
        print(f"Skipping row {idx} due to invalid codiceSis: {codiceSis_raw} ({e})")
        invalid_pdf_indices.append(idx)
        continue

    AIC_trimmed = str(AIC)[:5]
    pdf_url = f"https://api.aifa.gov.it/aifa-bdf-eif-be/1.0.0/organizzazione/{codiceSis}/farmaci/{AIC_trimmed}/stampati?ts=RCP"
    print(f"Row {idx}: Downloading PDF from {pdf_url}")

    try:
        response = requests.get(pdf_url)
        if response.status_code != 200:
            print(f"Row {idx}: Failed to download PDF. Status code: {response.status_code}")
            invalid_pdf_indices.append(idx)
            continue
        pdf_bytes = response.content
        if not pdf_bytes:
            print(f"Row {idx}: PDF content is empty.")
            invalid_pdf_indices.append(idx)
            continue
        reader = PdfReader(io.BytesIO(pdf_bytes))
    except Exception as e:
        print(f"Row {idx}: Error downloading or parsing PDF: {e}")
        invalid_pdf_indices.append(idx)
        continue

    full_text = ""
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        text = page.extract_text() or ""
        full_text += text + "\n"


    def clean_section(raw):
        if raw is None or raw.strip() == "":
            return "not available"
        cleaned = raw.replace("\n", "").replace("\uf0b7", "")
        return cleaned

    for col_name, (start_tag, end_tag) in sections.items():
        raw = extract_section(full_text, start_tag, end_tag)
        clean = clean_section(raw)
        df.loc[df['AIC'] == AIC, col_name] = clean
        print(f"Updated '{col_name}' for AIC={AIC}: {clean[:40]}...")

    df['codiceSis'] = df['codiceSis'].apply(clean_codiceSis)

if invalid_pdf_indices:
    print(f"Marking {len(invalid_pdf_indices)} rows with broken PDF links as 'not available'...")
    for idx in invalid_pdf_indices:
        for col_name in sections.keys():
            df.at[idx, col_name] = "not available"

def remove_unwanted_symbols(text):
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize("NFKD", text)
    return re.sub(r"[^a-zA-Z0-9àèéìòùçÀÈÉÌÒÙÇ.,;:!?@#\$%\^&\*\(\)\[\]\{\}\-_\+=\\\/<>\s]", '', text)

cols_to_clean = [
    "4.1 Indicazioni terapeutiche",
    "4.2 Posologia e modo di somministrazione",
    "4.3 Contraindications",
    "4.4 Special warnings and precautions for use",
    "4.5 Interactions with other medicinal products",
    "4.6 Fertility, pregnancy and lactation",
    "4.7 Effects on ability to drive and use machines",
    "4.8 Undesirable effects (side effects)",
    "4.9 Overdose",
    "6.2 Incompatibilities",
]

for col in cols_to_clean:
    df[col] = df[col].apply(remove_unwanted_symbols)


df.to_csv('Final_Dataset.csv', index=False, encoding='utf-8', sep=',', quotechar='"', quoting=csv.QUOTE_ALL)
print("Saved  extracted sections to Final_Dataset.csv")

Error processing AIC 045461011: list index out of range
Error processing AIC 025298086: list index out of range
Error processing AIC 025298050: list index out of range
Error processing AIC 025298074: list index out of range
Error processing AIC 037290018: list index out of range
Error processing AIC 025298124: list index out of range
Error processing AIC 033672027: list index out of range
Error processing AIC 046280018: list index out of range
Error processing AIC 044579011: list index out of range
Error processing AIC 044579035: list index out of range
Error processing AIC 035129030: list index out of range
Error processing AIC 043030218: list index out of range
Error processing AIC 047062068: list index out of range
Error processing AIC 047062029: list index out of range
Error processing AIC 025513045: list index out of range
Error processing AIC 039670082: list index out of range
Error processing AIC 043348010: list index out of range
Error processing AIC 043663032: list index out o

  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean
  df.loc[df['AIC'] == AIC, col_name] = clean


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Updated '4.6 Fertility, pregnancy and lactation' for AIC=45545062: Gravidanza  Non ci sono rischi noti per ...
Updated '4.7 Effects on ability to drive and use machines' for AIC=45545062:  Acido Folico EG  non ha effetto o ha un...
Updated '4.8 Undesirable effects (side effects)' for AIC=45545062:  La frequenza degli effetti indesiderati...
Updated '4.9 Overdose' for AIC=45545062:  È improbabile che siano necessarie proc...
Updated '6.2 Incompatibilities' for AIC=45545062:  Non applicabile.  ...
Row 180: Downloading PDF from https://api.aifa.gov.it/aifa-bdf-eif-be/1.0.0/organizzazione/68/farmaci/36345/stampati?ts=RCP
Updated '4.1 Indicazioni terapeutiche' for AIC=36345015:  Prevenzione primaria dei dif etti del t...
Updated '4.2 Posologia e modo di somministrazione' for AIC=36345015:  Posologia  1 compressa al giorno ininte...
Updated '4.3 Contraindications' for AIC=36345015: Ipersensibilità al principio attivo o a ...
Up