# Data Analysis with LLM
This notebook is an attempt to use an LLM to analyze text data from the Toxic Sites Identification Program (TSIP). The data is a collection of documents that describe the background of each site and its health effects on the surrounding population. The goal is to use the LLM to identify the most important words in the text data, and then to use these words to compare site 1888 with other sites that share the same key pollutant.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from typing import Optional, Union
import json

### Data Collection

In [24]:
def get_data_from(site_id: int, timeout: float = 30.0) -> Optional[dict]:
    """Fetch the record of a single site from the contaminatedsites.org API.

    Args:
        site_id: Numeric identifier of the site to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on an unresponsive
            server, which would stall every loop that calls this function.

    Returns:
        The ``'response'`` member of the JSON payload as a ``dict`` when the
        server answers with status 200, otherwise ``None`` (a message with
        the status code and site id is printed in that case).

    Raises:
        requests.exceptions.RequestException: On connection problems,
            including ``requests.exceptions.Timeout`` when the server does
            not answer within ``timeout`` seconds.
    """
    url = f"https://www.contaminatedsites.org/api/v1/site/{site_id}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code} | Site id: {site_id}")
        return None
    # Copy into a plain dict so callers get a standalone mapping.
    return dict(response.json()['response'])

def return_lst_of_sites(file_path: str) -> list[int]:
    """Read site ids from a text file that holds one integer per line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``ValueError``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The site ids in file order.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    with open(file_path, 'r') as file:
        # int() already ignores surrounding whitespace, including the newline.
        return [int(line) for line in file if line.strip()]

def get_site_data_from(file_path: str) -> pd.DataFrame:
    """Load a JSON file and wrap its parsed content in a DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``pd.DataFrame`` built directly from the decoded JSON value.
    """
    with open(file_path, 'r') as source:
        records = json.load(source)
        frame = pd.DataFrame(records)
        return frame

def compile_text(dataframe: pd.DataFrame) -> str:
    """Concatenate the text fields of every row into one string.

    Each row contributes its ``name``, ``description`` and ``health_impact``
    values on separate lines, followed by an empty line.

    Args:
        dataframe: Frame with ``name``, ``description`` and ``health_impact``
            columns holding strings.

    Returns:
        The combined text, or an empty string when the frame has no rows.
    """
    # str.join builds the result in one pass instead of repeatedly
    # re-allocating a growing string with ``+=``.
    return ''.join(
        site['name'] + '\n' + site['description'] + '\n' + site['health_impact'] + '\n\n'
        for site in dataframe.to_dict('records')
    )

def store_data(data: Union[dict, list[dict]], file_path: str):
    """Serialize ``data`` as JSON (4-space indent) into ``file_path``.

    The target file is opened in write mode, so existing content is replaced.

    Args:
        data: The value to serialize.
        file_path: Destination path of the JSON file.
    """
    with open(file_path, mode='w') as destination:
        json.dump(obj=data, fp=destination, indent=4)

In [11]:
# get data for site 1888
site_1888 = get_data_from(1888)
# get_data_from returns None when the request fails; skip the write in that
# case so a previously stored file is not overwritten with `null`.
if site_1888 is not None:
    store_data(site_1888, 'data/site_1888.json')

int

In [15]:
# get list of sites from data/sites_of_interest/high_bsi.txt
sites_high_bsi = return_lst_of_sites('data/sites_of_interest/high_bsi.txt')
sites_data = []
for site in sites_high_bsi:
    site_data = get_data_from(site)
    # get_data_from returns None for a failed request; keep only real
    # records so the stored JSON contains no `null` entries.
    if site_data is not None:
        sites_data.append(site_data)
store_data(sites_data, 'data/tsip_deep_data.json')

In [29]:
# get list of sites from data/sites_of_interest/kp_12_high_bsi.txt
sites_kp_12_high_bsi = return_lst_of_sites('data/sites_of_interest/kp_12_high_bsi.txt')
sites_data_kp_12 = []
for site in sites_kp_12_high_bsi:
    site_data = get_data_from(site)
    # get_data_from returns None for a failed request; a `null` entry in the
    # stored JSON would break the later per-site text export, which
    # concatenates the 'name', 'description' and 'health_impact' strings.
    if site_data is not None:
        sites_data_kp_12.append(site_data)
store_data(sites_data_kp_12, 'data/tsip_deep_data_kp_12.json')

Store the title, description, and health impacts of each kp_12 site in its own text file.

In [22]:
# Load the stored kp_12 records and write one labelled text file per site.
path = 'data/tsip_deep_data_kp_12.json'
df = get_site_data_from(path)
for site in df.to_dict('records'):
    site_id = site['id']
    # Prefix each field with a label so the sections stay identifiable.
    title = 'Site Title: ' + site['name']
    description = 'Site Description: ' + site['description']
    health_effects = 'Health Impacts: ' + site['health_impact']
    sections = [title, description, health_effects]
    with open(f'data/text/health_impact/raw/{site_id}.txt', 'w') as text_file:
        # One section per line, no trailing newline.
        text_file.write('\n'.join(sections))

Compile all the health effects data into one text file.

In [25]:
# Write the combined text of every row in df to a single file.
with open('data/text/health_impact/compiled.txt', 'w') as compiled_file:
    compiled_text = compile_text(df)
    compiled_file.write(compiled_text)

### Data Preprocessing

# Conclusion