<a href="https://colab.research.google.com/github/IdaWiweka/datathon/blob/IdaWiweka/Datathon_DataProcessing_2_extraction_title.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from huggingface_hub import login
from google.colab import userdata
# Fetch the Hugging Face access token stored under the Colab secret
# named 'huggingface_llama32' (keeps the token out of the notebook source).
huggingface_token = userdata.get('huggingface_llama32')

# Authenticate this session against the Hugging Face Hub with that token.
login(token=huggingface_token)


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths in this notebook
# ('/content/drive/My Drive/...') are resolved relative to this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
import os
import pandas as pd
import json
import torch
from transformers import pipeline
from tqdm import tqdm


# --- Input / output locations on the mounted Google Drive ---
folder_path = '/content/drive/My Drive/processed_batches/Dataset_id'
file_path = '/content/drive/My Drive/processed_batches/Dataset_id/title_extraction_1.csv'

# Source CSV, located inside folder_path
file_name = "epiwatch-latest.csv"
data = pd.read_csv(os.path.join(folder_path, file_name))

# Work on a small sample: keep only rows whose report_id is between 1 and 20
filtered_data = data.loc[data['report_id'].between(1, 20)]

# Device index handed to the pipeline: 0 for CUDA, -1 for CPU
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Keyword arguments unpacked into extract_information_from_dataframe(...)
parameters = dict(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    device=device,
    torch_dtype=torch.bfloat16,
    temperature=0.2,
    top_p=0.9,
    max_new_tokens=128,
    text_column="title",       # Replace with the actual column name in your CSV
    id_column="report_id",     # Replace with the actual column name in your CSV
)

def extract_information_from_dataframe(
    dataframe,
    model_name,
    device,
    torch_dtype,
    temperature,
    top_p,
    max_new_tokens,
    text_column,
    id_column
):
    """
    Extract structured information from a DataFrame using a language model pipeline.

    Each row's text is sent to a chat-style "text-generation" pipeline together
    with a fixed few-shot system prompt, and the assistant's reply is collected.

    Args:
        dataframe: pandas DataFrame holding the rows to process.
        model_name: model identifier passed to ``pipeline(model=...)``.
        device: device index passed to ``pipeline(device=...)``.
        torch_dtype: dtype passed to ``pipeline(torch_dtype=...)``.
        temperature: sampling temperature for generation.
        top_p: nucleus-sampling threshold for generation.
        max_new_tokens: upper bound on generated tokens per row.
        text_column: name of the column containing the text to analyse.
        id_column: name of the column containing the row identifier; also
            used as the key for the identifier in each result dict.

    Returns:
        list[dict]: one dict per row, ``{id_column: <row id>,
        "extracted_info": <assistant reply as a string>}``, in row order.
    """
    # Initialize the pipeline
    generator = pipeline("text-generation", model=model_name, device=device, torch_dtype=torch_dtype)

    # The system prompt does not depend on the row, so build it once.
    # All few-shot examples use the same set of keys ("location" singular,
    # no extra keys) so the model is steered toward one consistent schema.
    system_prompt = """You are an epidemiologist. Your task is to extract information from unstructured text data.\n

                                  Specifically, you will identify and extract:\n
                                  - The number of cases (incidence or prevalence), if none = NA.\n
                                  - The number of mortality (dead or kill), if none = NA.\n
                                  - The name of the disease(s), if none = NA.\n
                                  - The name of the syndrome(s), if none = NA.\n
                                  - The location(s) where the case(s) occurred, if none = NA.\n
                                  - Determine if it is an "outbreak", "increase", "decrease", "decline", "new", "rise", or "alert" or any synonym of those words, if none = NA.\n

                                  There are seven example of diseases:\n
                                  Influenza (many strains), Covid-19, Mpox, Legionnaires', Dengue, Measles, Cholera\n
                                  However, it also potentially any other diseases outside the list.\n

                                  The syndromes refer to more generalised symptoms, usually recorded when the disease is unknown. Common syndromes include\n
                                  Acute gastroenteritis, Severe acute respiratory syndrome, Febrile syndromes, Pneumonia, Influenza-like illness.\n


                                  Example 1:\n
                                  The data: "271 new cases of corona virus infection in Odisha, two patients killed - Navbharat Times"\n
                                  The Output:\n
                                  "cases": "271",\n
                                  "dead": "2",\n
                                  "disease": "corona virus infection",\n
                                  "syndrome": "NA",\n
                                  "location": "Odisha",\n
                                  "keywords": "new"\n

                                  Example 2:\n
                                  The data: "Alert in Europe for new cases of avian influenza | Agrofy News"\n
                                  The Output:\n
                                  "cases": "NA",\n
                                  "dead": "NA",\n
                                  "disease": "avian influenza",\n
                                  "syndrome": "NA",\n
                                  "location": "Europe",\n
                                  "keywords": "new, alert"\n

                                  Example 3:\n
                                  The data: "CIDRAP - India's COVID-19 total tops 5 million as cases rise in Europe"\n
                                  The Output:\n
                                  "cases": "5000000",\n
                                  "dead": "NA",\n
                                  "disease": "COVID-19",\n
                                  "syndrome": "NA",\n
                                  "location": "Europe",\n
                                  "keywords": "rise"\n

                                  Example 4:\n
                                  The data: "New Crown Pneumonia in the United States confirmed 6.59 million deaths over 195,000;"\n
                                  The Output:\n
                                  "cases": "6590000",\n
                                  "dead": "195000",\n
                                  "disease": "NA",\n
                                  "syndrome": "Pneumonia",\n
                                  "location": "United States",\n
                                  "keywords": "new"\n

                                  Example 5:\n
                                  The data: "Nearly half a million children in the United States have been infected with the new coronavirus;\n"
                                  The Output:\n
                                  "cases": "500000",\n
                                  "dead": "NA",\n
                                  "disease": "coronavirus",\n
                                  "syndrome": "NA",\n
                                  "location": "United States",\n
                                  "keywords": "new"\n

                                  Example 6:\n
                                  The data: "US - Approx. a daily increase of 46,425 cases, 1,076 deaths for totals of 6,649,458 cases,Â 197,223 deathsÂ - Fauci says enough people have to take a Covid-19 vaccine to be efficient - September 17, 2020;\n"
                                  The Output:\n
                                  "cases": "46425",\n
                                  "dead": "1076",\n
                                  "disease": "Covid-19",\n
                                  "syndrome": "NA",\n
                                  "location": "US",\n
                                  "keywords": "daily increase"\n

                                  Example 7:\n
                                  The data: "Also the scourge of infectious diseases: 38 people suffering from fever in 15 days ...;\n"
                                  The Output:\n
                                  "cases": "38",\n
                                  "dead": "NA",\n
                                  "disease": "NA",\n
                                  "syndrome": "fever",\n
                                  "location": "NA",\n
                                  "keywords": "NA"\n

                                  Return your response in JSON format without any additional explanation.\n"""

    # Prepare results list
    results = []

    # Iterate through the DataFrame rows
    for _, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Processing rows"):
        # Honour the caller-supplied column names instead of hard-coded ones.
        title_text = row[text_column]
        unique_id = row[id_column]

        # Prepare the prompt (system instructions + this row's text)
        prompt = [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""Here is the data:\n
                                  "{title_text}"
                                  Extract the required information.\n""",
            },
        ]

        # Generate the response
        generation = generator(
            prompt,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens
        )

        # For chat-style input, "generated_text" is the message list with the
        # assistant reply appended; take the last message rather than relying
        # on a fixed position.
        conversation = generation[0]["generated_text"]
        assistant_content = conversation[-1]["content"]

        # Add results to the list
        results.append({
            id_column: unique_id,
            "extracted_info": assistant_content
        })

    return results

# Run the extraction over the filtered sample; model and sampling settings
# come from the `parameters` dict defined in the configuration section.
results = extract_information_from_dataframe(filtered_data, **parameters)

# Pretty-print the raw model replies (one entry per processed row)
results_as_json = json.dumps(results, indent=4)
print(results_as_json)



def parse_extracted_info(info):
    """
    Parse a model reply into a dictionary.

    The reply is expected to contain one JSON object, optionally wrapped in a
    Markdown code fence (with or without a language tag such as "json") or
    surrounded by extra text. Only the span from the first "{" to the last
    "}" is decoded, so fences and stray prose around the object are ignored.

    Args:
        info: the raw reply; anything that is not a string (e.g. NaN/None)
            is treated as unparseable.

    Returns:
        dict: the decoded JSON object, or an empty dict when no JSON object
        can be decoded (missing braces, truncated output, invalid JSON, or a
        JSON value that is not an object).
    """
    if not isinstance(info, str):
        return {}

    # Locate the outermost braces; this skips code fences and language tags
    # that a plain strip of backtick characters would leave behind.
    start = info.find("{")
    end = info.rfind("}")
    if start == -1 or end < start:
        return {}

    try:
        parsed = json.loads(info[start:end + 1])
    except json.JSONDecodeError:
        return {}

    # Downstream code expands the result into columns, so only dicts are useful.
    return parsed if isinstance(parsed, dict) else {}

# Turn the list of {report_id, extracted_info} records into a DataFrame
df = pd.DataFrame(data=results)

# Decode each raw reply string into a dictionary ({} when it cannot be parsed)
df['parsed_info'] = df['extracted_info'].map(parse_extracted_info)

# Spread the dictionaries out so that every extracted key becomes a column
parsed_df = pd.json_normalize(df['parsed_info'])

# Put the report ID next to the extracted columns (rows align by position)
id_only = df[['report_id']]
final_df = pd.concat([id_only, parsed_df], axis="columns")

print(final_df)

# Cast 'report_id' in the source data to int so its dtype matches the
# merge key in final_df
data['report_id'] = data['report_id'].astype(int)

# Keep only the reports present in both frames, joined on 'report_id'
merged_df = data.merge(final_df, on='report_id', how='inner')

print(merged_df)

# Persist the merged table as CSV at file_path (on the mounted Drive)
merged_df.to_csv(file_path, index=False)

print(f"File saved to {file_path}")
print("DataFrame has been saved to 'csv'")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   5%|▌         | 1/20 [00:01<00:26,  1.42s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:  10%|█         | 2/20 [00:02<00:26,  1.47s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:  15%|█▌        | 3/20 [00:04<00:25,  1.49s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:  20%|██        | 4/20 [00:06<00:24,  1.52s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:  25%|██▌       | 5/20 [00:07<00:22,  1.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:  30%|███       | 6/20 [00:09<00:21,  1.53s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:  35%|███▌      | 7/20 [00:10<00:18,  1.45s/it]Setting

[
    {
        "report_id": 1,
        "extracted_info": "{\"cases\": \"90123\", \"dead\": \"NA\", \"disease\": \"NA\", \"syndrome\": \"NA\", \"location\": \"NA\", \"keywords\": \"NA\"}"
    },
    {
        "report_id": 2,
        "extracted_info": "{\"cases\": \"NA\", \"dead\": \"NA\", \"disease\": \"NA\", \"syndrome\": \"NA\", \"location\": \"Ludhiana District\", \"keywords\": \"NA\"}"
    },
    {
        "report_id": 3,
        "extracted_info": "{\"cases\": \"5000000\", \"dead\": \"NA\", \"disease\": \"COVID-19\", \"syndrome\": \"NA\", \"locations\": \"Europe\", \"keywords\": \"rise\"}"
    },
    {
        "report_id": 4,
        "extracted_info": "{\"cases\": \"426\", \"dead\": \"26\", \"disease\": \"NA\", \"syndrome\": \"NA\", \"location\": \"LOUISIANA\", \"keywords\": \"new\"}"
    },
    {
        "report_id": 5,
        "extracted_info": "{\"cases\": \"498\", \"dead\": \"9\", \"disease\": \"COVID-19\", \"syndrome\": \"NA\", \"location\": \"New Jersey\", \"keywords\": \"new


