<a href="https://colab.research.google.com/github/IdaWiweka/datathon/blob/IdaWiweka/Datathon_DataProcessing_3_extraction_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login
from google.colab import userdata
huggingface_token = userdata.get('huggingface_llama32')

# Login using the token
login(huggingface_token)


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os
import csv
import json
import torch
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bars
from google.colab import files
from transformers import pipeline


# Define the folder path
folder_path = '/content/drive/My Drive/processed_batches/Dataset_id'
file_path = '/content/drive/My Drive/processed_batches/Dataset_id/content_extraction_1.csv'

# Read the CSV file
file_name = "epiwatch-latest.csv"

data = pd.read_csv(os.path.join(folder_path, file_name))
# Add a new column with the row index numbers (if needed)
#data['id'] = range(1, len(data) + 1)
data = data.head(10)
# Set the device
device = 0 if torch.cuda.is_available() else -1  # 0 for CUDA, -1 for CPU

# Parameters
parameters = {
    "model_name": "meta-llama/Llama-3.2-3B-Instruct",
    "device": device,
    "torch_dtype": torch.bfloat16,
    "temperature": 0.1,
    "top_p": 0.9,
    "max_new_tokens": 128000,
    "text_column": "Translated_Content",  # Replace with the actual column name in your CSV
    "id_column": "report_id"        # Replace with the actual column name in your CSV
}

def extract_information_from_dataframe(
    dataframe,
    model_name,
    device,
    torch_dtype,
    temperature,
    top_p,
    max_new_tokens,
    text_column,
    id_column
):
    """
    Extract structured information from a DataFrame using a language model pipeline.
    """
    # Set the device
    # Initialize the pipeline
    generator = pipeline("text-generation", model=model_name, device=device, torch_dtype=torch_dtype)

    # Prepare results list
    results = []

    # Iterate through the DataFrame rows
    for _, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Processing rows"):
        data = row["Translated_Content"]
        unique_id = row["report_id"]

        # Prepare the prompt
        prompt = [
                {
                    "role": "system",
                    "content": """You are an epidemiologist. Your task is to summary information from unstructured text data.\n

                                  Specifically, you will identify and summary:\n
                                  - The number of new case(s). Summary the number of new case(s) from the context. Determine wether the number of the case(s) is represent of the new case(s) from the context. if none = NA.\n
                                  - The total of cases. Summary the total cases from the context. Determine wether the number of the total cases is represent of the total cases from the context. if none = NA.\n
                                  - The number of mortality (dead or kill) cases. Extract the number of dead or kill cases from the related disease. Do not extract any non dead/kill cases if there is no dead cases. if none = NA.\n
                                  - The name of the disease(s), if none = NA.\n
                                  - The name of the syndrome(s), if none = NA.\n
                                  - Determine the type of transmission such as "human to human" or "animal to human" or "animal to animal", if none = NA.
                                  - The location(s) where the case(s) occurred, if none = NA.\n
                                  - Determine if it is an "outbreak", "increase", "decrease", "decline", "new", "rise", or "alert" or any synonym of those words, if none = NA.\n

                                  There are seven example of diseases:\n
                                  Influenza (many strains), Covid-19, Mpox, Legionnaires', Dengue, Measles, Cholera. However, it also potentially any other diseases outside the list.\n

                                  The syndromes refer to more generalised symptoms, usually recorded when the disease is unknown. Common syndromes include: Acute gastroenteritis, Severe acute respiratory syndrome, Febrile syndromes, Pneumonia, Influenza-like illness.\n


                                  The Output:\n
                                  "new case(s)": ,\n
                                  "total cases": ,\n
                                  "mortality": ,\n
                                  "disease": ,\n
                                  "syndrome": ,\n
                                  "type of transimission": ,\n
                                  "location": ,\n
                                  "keywords": ,\n
                                  "other cases/disease:" \n


                                  Return your response in JSON format without any additional explanation.""",
                },
                {
                    "role": "user",
                    "content": f"""Here is the data:
                                  "{data}"
                                  summary the required information.""",
                },
            ]

        # Generate the response
        generation = generator(
            prompt,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens
        )

        # Extract the JSON-like respons
        assistant_response = generation[0]["generated_text"]
        assistant_content = assistant_response[2]['content']
        # Add results to the list
        results.append({
            id_column: unique_id,
            "extracted_info": assistant_content
        })

    return results

# Example usage
results = extract_information_from_dataframe(
    dataframe=data,
    **parameters
)

# Print the results in a readable format
print(json.dumps(results, indent=4))



def parse_extracted_info(info):
    try:
        # Remove backticks and strip any extra spaces
        cleaned_info = info.strip("```").strip()
        # Convert the string to a dictionary
        return json.loads(cleaned_info)
    except json.JSONDecodeError:
        return {}

# Create a DataFrame
df = pd.DataFrame(results)

# Parse the 'extracted_info' column into dictionaries
df['parsed_info'] = df['extracted_info'].apply(parse_extracted_info)

# Expand the parsed_info dictionary into separate columns
parsed_df = pd.json_normalize(df['parsed_info'])

# Combine the parsed information with the original ID column
final_df = pd.concat([df[['report_id']], parsed_df], axis=1)

print(final_df)

# Convert 'report-id' column in df1 to string (object) type
data['report_id'] = data['report_id'].astype(int)

# Merge the two DataFrames on 'report-id'
merged_df = pd.merge(data, final_df, on='report_id', how='inner')

print(merged_df)

# Save and download the file
merged_df.to_csv('extracted.csv', index=False)
files.download('extracted.csv')
# Save the DataFrame to a CSV file
merged_df.to_csv(file_path, index=False)

print(f"File saved to {file_path}")
print("DataFrame has been saved to 'csv'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Processing rows:   0%|          | 0/10 [00:00<?, ?it/s]


KeyError: 'Translated_Content'