<a href="https://colab.research.google.com/github/Honkware/FalconFBI/blob/main/FalconFBI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
# tqdm is listed because the import cell uses tqdm.notebook.
%pip install -q httpx beautifulsoup4 pandas nltk torch transformers auto-gptq einops tqdm

In [35]:
import html
import logging
import os
import re
import sys
from functools import partial
from multiprocessing import Pool

import httpx
import pandas as pd
from IPython.display import display, HTML

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

# Fetch the NLTK "stopwords" corpus; stopwords.words('english') needs it on disk.
nltk.download('stopwords')

# Site root WITHOUT a trailing slash: every use in this notebook appends a path
# that already starts with "/", so a trailing slash here produced "//" in URLs.
base_url = 'https://www.fbi.gov'


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Scraping

def _section_text(soup, section_class):
    """Return the stripped text of the first <p> inside ``.section_class``, or ''."""
    paragraph = soup.select_one(f'.{section_class} p')
    return paragraph.text.strip() if paragraph is not None else ''


def get_individual_data(url):
    """Scrape one wanted-person profile page into a flat record.

    Args:
        url: Absolute profile URL, or a site-relative path starting with ``/``
            (resolved against ``base_url``).

    Returns:
        dict with the keys ``Name``, ``Image``, ``Aliases``, ``Reward``,
        ``Remarks`` and ``Caution``. ``Name`` and ``Image`` are HTML snippets
        that link back to the profile page (scraped text is HTML-escaped);
        the other values are plain text, ``''`` when the section is absent.
        On any failure the error is logged and every value is ``''``.
    """
    blank_record = dict.fromkeys(
        ('Name', 'Image', 'Aliases', 'Reward', 'Remarks', 'Caution'), ''
    )
    try:
        if url.startswith('/'):
            # Tolerate a base_url with or without a trailing slash (avoids "//").
            url = base_url.rstrip('/') + url

        # httpx does not follow redirects unless asked to; fail loudly on
        # non-success responses instead of parsing an error page.
        response = httpx.get(url, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip before splitting so a heading that starts with a newline still
        # yields its first line of text rather than an empty string.
        name = soup.select_one('.documentFirstHeading').text.strip().split('\n')[0].strip()

        # Optional sections: a missing section (or one without a <p>) becomes ''
        # instead of discarding the whole record.
        aliases = _section_text(soup, 'wanted-person-aliases')
        reward = _section_text(soup, 'wanted-person-reward')
        remarks = _section_text(soup, 'wanted-person-remarks')
        caution = _section_text(soup, 'wanted-person-caution')

        image_url = soup.select_one(".wanted-person-mug img")["src"]

        # Scraped values are interpolated into HTML, so escape them.
        safe_url = html.escape(url, quote=True)
        safe_image_url = html.escape(image_url, quote=True)
        image_link = f'<a href="{safe_url}" target="_blank"><img src="{safe_image_url}" width="50px"></a>'

        return {
            'Name': f'<a href="{safe_url}" target="_blank">{html.escape(name)}</a>',
            'Image': image_link,
            'Aliases': aliases,
            'Reward': reward,
            'Remarks': remarks,
            'Caution': caution,
        }
    except Exception as e:
        # Best-effort scraping: one bad page must not abort the whole run.
        logging.error(f"Error occurred while scraping individual data: {str(e)}")
        return blank_record


def scrape_individuals():
    """Scrape the Ten Most Wanted listing and every profile it links to.

    Returns:
        list of record dicts as produced by ``get_individual_data``, one per
        profile link found on the listing page. Returns ``[]`` when the listing
        cannot be fetched/parsed (the error is logged) or contains no links.
    """
    try:
        # Tolerate a base_url with or without a trailing slash (avoids "//").
        listing_url = f"{base_url.rstrip('/')}/wanted/topten"

        # httpx does not follow redirects unless asked to; fail loudly on
        # non-success responses instead of parsing an error page.
        response = httpx.get(listing_url, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Relative hrefs are resolved later by get_individual_data.
        individual_urls = [a['href'] for a in soup.select('.portal-type-person .title a')]
        if not individual_urls:
            return []

        with Pool() as pool:
            return pool.map(get_individual_data, individual_urls)
    except Exception as e:
        logging.error(f"Error occurred while scraping individuals: {str(e)}")
        return []


# Passing `columns=` fixes the column order AND keeps the frame well-formed when
# scraping returned no rows (selecting columns from an empty, column-less frame
# would raise KeyError).
df = pd.DataFrame(
    scrape_individuals(),
    columns=['Name', 'Image', 'Aliases', 'Reward', 'Remarks', 'Caution'],
)


In [40]:
# Generation

# Configure the logger before touching NLTK so the level applies from the start.
nltk_logger = logging.getLogger('nltk')
nltk_logger.setLevel(logging.ERROR)

# quiet=True suppresses the "[nltk_data] ..." progress lines; setting the logger
# level alone did not keep them out of the cell output.
nltk.download('stopwords', quiet=True)

stopwords_set = set(stopwords.words('english'))
stemmer = PorterStemmer()

# GPTQ-quantized checkpoint; tokenizer and weights are loaded from the same repo id.
quantized_model_dir = "TheBloke/WizardLM-Uncensored-Falcon-7B-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=False)
model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    device="cuda:0",
    use_triton=False,
    use_safetensors=True,
    torch_dtype=torch.float32,
    trust_remote_code=True
)


def preprocess_row(row):
    """Normalise the free-text fields of one scraped record.

    Stop words are removed from every text field. Remarks, reward and caution
    are additionally lower-cased and stemmed; aliases keep their original
    spelling.

    Args:
        row: Mapping-like record providing the keys ``Remarks``, ``Aliases``,
            ``Reward``, ``Caution`` and ``Image``.

    Returns:
        Tuple ``(remarks, aliases, reward, caution, image)`` where the first
        four items are the filtered strings and ``image`` is ``row['Image']``
        unchanged.
    """
    raw_remarks = row['Remarks']
    raw_aliases = row['Aliases']
    raw_reward = row['Reward']
    raw_caution = row['Caution']

    def is_content_word(token):
        # Stop-word membership is tested case-insensitively.
        return token.lower() not in stopwords_set

    def stem_content_words(text):
        stems = []
        for token in text.split():
            if is_content_word(token):
                stems.append(stemmer.stem(token.lower()))
        return ' '.join(stems)

    cleaned_remarks = stem_content_words(raw_remarks)
    # Aliases are names: drop stop words but leave case and spelling alone.
    cleaned_aliases = ' '.join(filter(is_content_word, raw_aliases.split()))
    cleaned_reward = stem_content_words(raw_reward)
    cleaned_caution = stem_content_words(raw_caution)

    return cleaned_remarks, cleaned_aliases, cleaned_reward, cleaned_caution, row['Image']


def generate_reports(df):
    """Preprocess every row of ``df``, then generate and display one report each.

    Raises the root logger level to WARNING, shows a progress bar for each of
    the two phases, and renders every report inline as soon as it is produced.

    Args:
        df: DataFrame of scraped records; each row is passed to
            ``preprocess_row`` and ``generate_report``.

    Returns:
        List of the generated report HTML strings, in row order.
    """
    logging.getLogger().setLevel(logging.WARNING)

    row_count = len(df)

    # Phase 1: text clean-up for all rows.
    cleaned_rows = []
    for _, record in tqdm(df.iterrows(), total=row_count, desc="Preprocessing"):
        cleaned_rows.append(preprocess_row(record))

    # Phase 2: one model call per row, displayed as it completes.
    rendered = []
    progress = tqdm(total=row_count, desc="Generating Reports", bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')
    with progress:
        for cleaned, record in zip(cleaned_rows, df.itertuples()):
            markup = generate_report(tokenizer, model, cleaned, record)
            rendered.append(markup)
            display(HTML(markup))
            progress.update(1)

    return rendered

def generate_report(tokenizer, model, preprocessed_row, row,
                    max_prompt_tokens=512, max_new_tokens=256):
    """Generate an HTML report card for one individual.

    Args:
        tokenizer: Tokenizer providing ``encode_plus``, ``decode`` and
            ``eos_token_id``.
        model: Causal LM providing ``generate``.
        preprocessed_row: Tuple ``(remarks, aliases, reward, caution, image)``
            as returned by ``preprocess_row``; ``image`` is an HTML snippet.
        row: Record with a ``Name`` attribute (plain text or an HTML anchor).
        max_prompt_tokens: Prompt is truncated to this many tokens.
        max_new_tokens: Upper bound on generated tokens. ``max_length`` is not
            used because it also counts prompt tokens, so a long prompt would
            leave little or no room for output.

    Returns:
        HTML string containing the image snippet, a heading and the generated
        text (HTML-escaped).
    """
    remarks, aliases, reward, caution, image_html = preprocessed_row

    # The scraper stores the name as an <a> element. Keep that markup for the
    # heading, but give the model plain text only.
    name_html = row.Name
    plain_name = html.unescape(re.sub(r'<[^>]+>', '', name_html)).strip()

    if aliases:
        prompt_person = f"{aliases} ({plain_name})"
        heading = f"{html.escape(aliases)} ({name_html})"
    else:
        prompt_person = plain_name
        heading = name_html

    text_inputs = f"## SUSPECT INFO:{prompt_person}\n{remarks}\n{reward}\n{caution}\n## SUMMARY:"
    encoding = tokenizer.encode_plus(
        text_inputs,
        max_length=max_prompt_tokens,
        truncation=True,
        padding="longest",
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to("cuda:0")
    attention_mask = encoding["attention_mask"].to("cuda:0")
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id
        )

    # The output sequence starts with the prompt tokens. Decode only what was
    # generated after them, rather than searching the text for "## SUMMARY:",
    # which raises if truncation cut the marker off.
    prompt_length = input_ids.shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    # image_html is inserted as-is: the scraper already builds it as a linked
    # thumbnail, so wrapping it in another <a> would nest anchors.
    report_html = f"""
    <div style="border: 1px solid black; padding: 10px; margin-bottom: 20px;">
        <div>
            {image_html}
            <h3>{heading}</h3>
        </div>
        <p><strong>Generated Report:</strong></p>
        <p>{html.escape(generated_text)}</p>
        <div style="clear: both;"></div>
    </div>
    """

    return report_html

# Run the pipeline: each report is displayed as it is generated, and the
# returned list holds the HTML string of every report.
reports = generate_reports(df)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing:   0%|          | 0/10 [00:00<?, ?it/s]

Generating Reports:   0%|          | 0/10