<a href="https://colab.research.google.com/github/Honkware/FalconFBI/blob/main/FalconFBI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook depends on (run once per fresh runtime).
!pip install httpx beautifulsoup4 pandas nltk torch transformers auto-gptq einops

In [89]:
import html
import logging
import os
import sys
from functools import partial
from multiprocessing import Pool
from urllib.parse import urljoin

import httpx
import pandas as pd
from IPython.display import display, HTML

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

# Fetch the NLTK stop-word corpus once; a second call in the same cell only
# repeats the "already up-to-date" check.
nltk.download('stopwords')

# Site root WITHOUT a trailing slash: every consumer appends a path that starts
# with '/', e.g. f'{base_url}/wanted/topten', so a trailing slash here would
# produce 'https://www.fbi.gov//wanted/topten'.
base_url = 'https://www.fbi.gov'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [116]:
# Scraping

def extract_data(element, default=''):
    """Return the whitespace-stripped text of ``element``.

    Falls back to ``default`` when ``element`` is falsy (for example ``None``
    from a selector that matched nothing).
    """
    if not element:
        return default
    return element.get_text(strip=True)

def get_individual_data(url):
    """Scrape one wanted-person profile page into a flat record.

    Args:
        url: Address of the profile page, absolute or relative to ``base_url``.

    Returns:
        dict with the keys 'Name', 'Image', 'Aliases', 'Reward', 'Remarks' and
        'Caution', plus one key per label found in the page's description
        table. 'Name' and 'Image' hold HTML snippets that link to the profile
        ('Image' is '' when the page has no mugshot); the other values are
        plain text, '' when the corresponding section is absent.

    Raises:
        httpx.HTTPStatusError: if the final response is not successful.
    """
    # Resolve against the site root so relative hrefs can be fetched; an
    # already-absolute URL passes through urljoin unchanged.
    profile_url = urljoin(base_url, url)
    response = httpx.get(profile_url, follow_redirects=True)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # extract_data tolerates a missing element, so a page without a heading
    # yields '' instead of raising AttributeError.
    name = extract_data(soup.select_one('.documentFirstHeading'))
    aliases = extract_data(soup.select_one('.wanted-person-aliases p'))
    reward = extract_data(soup.select_one('.wanted-person-reward p'))
    remarks = extract_data(soup.select_one('.wanted-person-remarks p'))
    caution = extract_data(soup.select_one('.wanted-person-caution p'))

    # Scraped text is untrusted: escape everything interpolated into markup.
    link_href = html.escape(profile_url, quote=True)

    mug = soup.select_one('.wanted-person-mug img')
    image_src = mug.get('src', '') if mug else ''
    if image_src:
        image_url = html.escape(urljoin(profile_url, image_src), quote=True)
        image_link = f'<a href="{link_href}" target="_blank"><img src="{image_url}" width="50px"></a>'
    else:
        image_link = ''

    # Two-column rows of the description table become label -> value pairs;
    # repeated labels are merged into one '; '-separated value.
    description = {}
    description_table = soup.select_one('.wanted-person-description table')
    if description_table:
        for row in description_table.select('tr'):
            columns = row.select('td')
            if len(columns) != 2:
                continue
            column_name = extract_data(columns[0])
            column_value = extract_data(columns[1])
            if column_name in description:
                description[column_name] += f"; {column_value}"
            else:
                description[column_name] = column_value

    result = {
        'Name': f'<a href="{link_href}" target="_blank">{html.escape(name)}</a>',
        'Image': image_link,
        'Aliases': aliases,
        'Reward': reward,
        'Remarks': remarks,
        'Caution': caution
    }

    # A 'Name' row in the description table must not replace the linked name.
    for key, value in description.items():
        if key != 'Name':
            result[key] = value

    return result


def scrape_individuals():
    """Scrape the FBI top-ten listing and every profile it links to.

    Returns:
        pandas.DataFrame with one row per listed individual. The columns are
        'Name', 'Image', 'Aliases', 'Reward', 'Remarks', 'Caution' followed by
        the remaining scraped description fields in alphabetical order. A
        field that a profile lacks is '' (not NaN), so truthiness checks on
        cell values behave as expected. An empty listing yields an empty frame
        that still has the six core columns.

    Raises:
        httpx.HTTPStatusError: if the listing request is not successful.
    """
    core_columns = ['Name', 'Image', 'Aliases', 'Reward', 'Remarks', 'Caution']

    # The leading '/' makes the join correct whether or not base_url ends in '/'.
    listing_url = urljoin(base_url, '/wanted/topten')
    response = httpx.get(listing_url, follow_redirects=True)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Resolve relative hrefs against the listing page; dict.fromkeys drops
    # duplicate links while keeping the page order.
    individual_urls = list(dict.fromkeys(
        urljoin(listing_url, anchor['href'])
        for anchor in soup.select('.portal-type-person .title a')
        if anchor.get('href')
    ))

    individual_data = [get_individual_data(url) for url in individual_urls]

    scraped_keys = {key for record in individual_data for key in record}
    columns = core_columns + sorted(scraped_keys - set(core_columns))

    # Normalise every record to the full column set up front so the frame
    # never contains NaN placeholders.
    rows = [{column: record.get(column, '') for column in columns} for record in individual_data]
    return pd.DataFrame(rows, columns=columns)


# Scrape the listing and render it inline. escape=False keeps the HTML stored
# in the cells (links, images) as live markup instead of literal text.
if __name__ == '__main__':
    df = scrape_individuals()
    display(
        HTML(
            df.to_html(index=False, escape=False)
        )
    )




Name,Image,Aliases,Reward,Remarks,Caution,Build,Complexion,Date(s) of Birth Used,Eyes,Hair,Height,Languages,Nationality,Occupation,Place of Birth,Race,Scars and Marks,Sex,Weight
RUJA IGNATOVA,,"Dr. Ruja Ignatova, Ruja Plamenova Ignatova, Ruja P. Ignatova, ""CryptoQueen""","The FBI is offering a reward of up to $250,000 for information leading to the arrest of Ruja Ignatova.",Ignatova is believed to travel with armed guards and/or associates. Ignatova may have had plastic surgery or otherwise altered her appearance.,"Ruja Ignatova is wanted for her alleged participation in a large-scale fraud scheme. Beginning in approximately 2014, Ignatova and others are alleged to have defrauded billions of dollars from investors all over the world. Ignatova was the founder of OneCoin Ltd., a Bulgaria-based company that marketed a purported cryptocurrency. In order to execute the scheme, Ignatova allegedly made false statements and representations to individuals in order to solicit investments in OneCoin. She allegedly instructed victims to transmit investment funds to OneCoin accounts in order to purchase OneCoin packages, causing victims to send wire transfers representing these investments. Throughout the scheme, OneCoin is believed to have defrauded victims out of more than $4 billion.",,,"May 30, 1980",Brown,Dark Brown to Black,,"English, German, Bulgarian",,,Bulgaria,White,,Female,
DONALD EUGENE FIELDS II,,"Don Fields, Donald Eugene Fields Jr., Eugene Fields","The FBI is offering a reward of up to $250,000 for information leading to the arrest of Donald Eugene Fields II.","Donald Eugene Fields II was last known to reside in Franklin County, Missouri. He has family in Missouri and Kentucky. He is known to visit casinos and has traveled to Florida in the past.","Donald Eugene Fields II is wanted for the alleged sex trafficking of at least one child in Missouri between approximately 2013 and 2017. It is alleged that he did knowingly attempt to recruit, entice, harbor, transport, provide, obtain, maintain, patronize and solicit a person whom he believed was under the age of l8 years and would be caused to engage in a commercial sex act.",,,"July 9, 1964",Hazel,Brown,"6'0"" to 6'4""",,,"Tree trimmer, Former resale shop owner, Independently sold used cars",Kentucky,White,"Fields II has a scar on his chest, his groin, his left calf, on both legs and both knees. He also has a tribal print tattoo on his right shoulder.",Male,219 to 235 pounds
ARNOLDO JIMENEZ,,"Arnoldo Gimenez, Arnoldo Rochel Jimenez","The FBI is offering a reward of up to $250,000 for information leading to the arrest of Arnoldo Jimenez.","Jimenez may have fled to Durango, Mexico, specifically in the area of Santiago Papasquiaro. He may also frequent Reynosa, Tamaulipas, Mexico. He has previously resided in Chicago, Illinois.","Arnoldo Jimenez is wanted for allegedly killing his wife on May 12, 2012, the day after their wedding. He allegedly stabbed his wife to death in his black, four-door, 2006 Maserati, then allegedly dragged her body into the bathroom tub of her apartment in Burbank, Illinois. Jimenez was charged with first degree murder by the Circuit Court of Cook County, Illinois, and a state warrant was issued for his arrest on May 15, 2012. A federal arrest warrant was issued by the United States District Court, Northern District of Illinois, Eastern Division, on May 17, 2012, after Jimenez was charged federally with unlawful flight to avoid prosecution.",,,"February 19, 1982",Brown,Black,"6'0""",,American,,Texas,White (Hispanic),,Male,200 to 225 pounds
OMAR ALEXANDER CARDENAS,,,"The FBI is offering a reward of up to $250,000 for information leading to the arrest of Omar Alexander Cardenas.",Cardenas often has a beard and wears prescription glasses.,"Omar Alexander Cardenas is wanted for his alleged involvement in the murder of a man that occurred on August 15, 2019, in a large outdoor shopping center in Sylmar, California, immediately next to Los Angeles. It is alleged that he fired several rounds from a semi-automatic handgun at the victim, striking him in the head and causing his death. A local arrest warrant was issued for Cardenas on April 3, 2020, in the Superior Court of Los Angeles County after he was charged locally with murder. A federal arrest warrant from the United States District Court, Central District of California, was issued for Cardenas on September 2, 2021, after he was charged with unlawful flight to avoid prosecution.",,,"March 23, 1995",Brown,Dark Brown,"Approximately 5'6"" to 5'7""",,American,,California,White (Hispanic),,Male,Approximately 240 to 300 pounds
ALEXIS FLORES,,"Mario Flores, Mario Roberto Flores, Mario F. Roberto, Alex Contreras, Alesis Contreras","The FBI is offering a reward of up to $250,000 for information leading directly to the arrest of Alexis Flores.",Flores has ties to Honduras.,"Alexis Flores is wanted for his alleged involvement in the kidnapping and murder of a five-year-old girl in Philadelphia, Pennsylvania. The girl was reported missing in late July of 2000, and was later found strangled to death in a nearby apartment in early August of 2000.",Slim,Light,"July 18, 1975, July 18, 1982, September 15, 1980, July 17, 1982",Brown,Black,"5'4""",,Honduran,Handyman,Honduras,White (Hispanic),Flores has scars on his forehead and right cheek.,Male,130 to 140 pounds
YULAN ADONAY ARCHAGA CARIAS,,"Alexander Mendoza, Yulan Andony Archaga Carias, “Porky”","The United States Government is offering a reward of up to $5,000,000 for information leading to the arrest and/or conviction of Yulan Adonay Archaga Carias.",Archaga Carias is believed to only speak Spanish.,"Yulan Adonay Archaga Carias is charged federally in the Southern District of New York with racketeering conspiracy, cocaine importation conspiracy, and possession and conspiracy to possess machine guns. As the alleged leader of MS-13 for all of Honduras, Archaga Carias allegedly controlled MS-13 criminal activity in Honduras and provided support and resources to the MS-13 enterprise in Central America and the United States with firearms, narcotics, and cash. Archaga Carias is also allegedly responsible for supporting multi-ton loads of cocaine through Honduras to the United States and for ordering and participating in murders of rival gang members and others associated with MS-13. This case is being investigated as part of Joint Task Force Vulcan.",,,"February 13, 1982, January 21, 1982",Brown,Black,"5'5""",,Honduran,,"San Pedro Sula, Cortes, Honduras",White (Hispanic),,Male,160 pounds
BHADRESHKUMAR CHETANBHAI PATEL,,Bhadreshkumar C. Patel,"The FBI is offering a reward of up to $250,000 for information leading to the arrest of Bhadreshkumar Chetanbhai Patel.","Patel was last known to be in the Newark, New Jersey, area.","Bhadreshkumar Chetanbhai Patel is wanted for allegedly killing his wife by striking her multiple times with an object while they were both working at a donut shop in Hanover, Maryland, on April 12, 2015. A local arrest warrant was issued in the District Court of Maryland for Anne Arundel County on April 13, 2015, and Patel was charged with first degree murder, second degree murder, first degree assault, second degree assault, and dangerous weapon with intent to injure. A federal arrest warrant was issued in the United States District Court, District of Maryland, Baltimore, Maryland, on April 20, 2015, after Patel was charged with unlawful flight to avoid prosecution.",,,"May 15, 1990",Brown,Brown,"5'9""",,Indian,Employee of donut shop,"Kantrodi Ta Viramgam, Gujarat, India",,,Male,165 pounds
WILVER VILLEGAS-PALOMINO,,"Carlos El Puerco, El Puerco, Wilver Villegas, Wilver Palomino",The United States Department of State's Narcotics Rewards Program is offering a reward of up to $5 million for information leading to the arrest and/or conviction of Wilver Villegas-Palomino.,,"Wilver Villegas-Palomino is a ranking member of the National Liberation Army (ELN) and is wanted for drug-trafficking activities for the ELN Northeastern War Front in the Catatumbo region of Colombia and in Venezuela. A federal arrest warrant was issued for Villegas-Palomino in the United States District Court, Southern District of Texas, Houston Division, on February 13, 2020, after he was charged with narco-terrorism, international cocaine distribution conspiracy, and international cocaine distribution.",,,"October 21, 1981",Brown,Black,"Approximately 5'7"" to 5'9""",Spanish,Colombian,,"Curumani, Colombia",White (Hispanic),,Male,Approximately 190 pounds
ALEJANDRO ROSALES CASTILLO,,"Alexandro Castillo, Alex Castillo, Alejandro Rosales, Alejandro Castillo, Alejandro Rosales-Castillo, Alejandro Rosalescastillo","The FBI is offering a reward of up to $250,000 for information leading directly to the arrest of Alejandro Rosales Castillo.","Castillo's last known residence was in Charlotte, North Carolina. He has ties to Phoenix, Arizona. Castillo was seen crossing into Mexico and may reside in San Francisco de los Romo, Aguascalientes, or Pabellón de Arteaga, Aguascalientes. He also may have traveled to the Mexican states of Guanajuato or Veracruz.","Alejandro Rosales Castillo is wanted for his alleged involvement in the murder of a co-worker in Charlotte, North Carolina, in 2016. The female victim’s vehicle was located at a bus station in Phoenix, Arizona, on August 15, 2016. On August 17, 2016, the victim’s body was located in a wooded area in Cabarrus County, North Carolina, with a gunshot wound to the head.",,,"November 26, 1998",Brown,Black,"5'6""","English, Spanish",American,,Arizona,White (Hispanic),,Male,Approximately 180 to 190 pounds
JOSE RODOLFO VILLARREAL-HERNANDEZ,,"""El Gato""",The United States Department of State’s Transnational Organized Crime Rewards Program is offering a reward of up to $1 million for information leading directly to the arrest of Jose Rodolfo Villarreal-Hernandez.,"Jose Rodolfo Villarreal-Hernandez has ties to or may visit Monterrey, Mexico and Mexico City, Mexico.","Jose Rodolfo Villarreal-Hernandez, also known as “El Gato,” is wanted for his alleged involvement in the interstate stalking and conspiracy to commit murder-for-hire of a 43-year-old male victim on May 22, 2013, in Southlake, Texas. A federal arrest warrant for these charges was issued on June 20, 2018. Villarreal-Hernandez allegedly holds an active leadership position in the Beltran Leyva drug-trafficking organization within the region of San Pedro Garza Garcia, Nuevo Leon, Mexico.",,,"January 16, 1978",Brown,Black,"5'4""",Spanish,Mexican,,Mexico,White (Hispanic),,Male,165 pounds


In [115]:
# Generation

# Text-normalisation helpers used by preprocess_field.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Only let errors from the 'nltk' logger through.
nltk_logger = logging.getLogger('nltk')
nltk_logger.setLevel(logging.ERROR)

# Identifier of the GPTQ-quantized checkpoint; the tokenizer and the model
# are both loaded from it.
quantized_model_dir = "TheBloke/WizardLM-Uncensored-Falcon-7B-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# checkpoint run locally - confirm the source is trusted.
model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    device="cuda:0",
    use_triton=False,
    use_safetensors=True,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)

def preprocess_field(field):
    """Lower-case, stop-word-filter and stem a text field.

    Non-string input yields ''. A string is split on whitespace; tokens whose
    lower-cased form is in ``stopwords_set`` are dropped, the remaining
    lower-cased tokens are stemmed with ``stemmer`` and joined with single
    spaces.
    """
    if not isinstance(field, str):
        return ''

    kept = []
    for token in field.split():
        lowered = token.lower()
        if lowered not in stopwords_set:
            kept.append(stemmer.stem(lowered))
    return ' '.join(kept)

def generate_report(tokenizer, model, row):
    """Generate a model-written report for one scraped individual.

    Args:
        tokenizer: Callable returning "input_ids" and "attention_mask" tensors
            for a prompt; also provides ``decode`` and ``eos_token_id``.
        model: Language model exposing ``generate``.
        row: One record of the scraped table (e.g. a DataFrame row) holding at
            least 'Name', 'Aliases' and 'Image'. 'Name' and 'Image' may be
            HTML snippets.

    Returns:
        (report_html, text_inputs): the HTML card for display and the exact
        prompt that was sent to the model.
    """
    # 'Name' is stored as an HTML link; the prompt needs its visible text only.
    name_html = row['Name'] if isinstance(row['Name'], str) else ''
    name_text = BeautifulSoup(name_html, 'html.parser').get_text(strip=True)

    # Missing cells may arrive as NaN, which is truthy and has no str methods.
    aliases = row['Aliases'] if isinstance(row['Aliases'], str) else ''
    image_html = row['Image'] if isinstance(row['Image'], str) else ''

    if aliases:
        person = f"{aliases} ({name_text})"
        person_html = f"{html.escape(aliases)} ({name_html})"
    else:
        person = name_text
        person_html = name_html

    # Build the prompt from whatever columns the row actually has: the scraped
    # description fields vary between runs, so a fixed field list can raise
    # KeyError or silently drop data.
    prompt_lines = [f"Suspect Info:{person}"]
    for field, value in row.items():
        if field == 'Image':
            continue  # markup only; carries no information for the model
        if field == 'Name':
            value = name_text
        if isinstance(value, str) and value.strip():
            prompt_lines.append(f"{field}: {preprocess_field(value)}")
    text_inputs = "\n".join(prompt_lines) + "\nComprehensive Report:"

    encoding = tokenizer(
        text_inputs,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to("cuda:0")
    attention_mask = encoding["attention_mask"].to("cuda:0")
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=500,
            do_sample=True,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id
        )

    # The output sequence starts with the prompt tokens; decode only what
    # follows them rather than searching the decoded text for the prompt's
    # last line, which raises ValueError if decoding does not reproduce it.
    prompt_length = input_ids.shape[-1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    # The image snippet already links to the profile, so it is not wrapped in
    # another anchor. Model output is escaped before it is placed in markup.
    report_html = f"""
    <div style="border: 1px solid black; padding: 10px; margin-bottom: 20px;">
        <div>
            {image_html}
            <h3>{person_html}</h3>
        </div>
        <p><strong>Generated Report:</strong></p>
        <p>{html.escape(generated_text)}</p>
        <div style="clear: both;"></div>
    </div>
    """

    return report_html, text_inputs

def generate_reports(df, batch_size=4):
    """Generate and display a report for every row of ``df``.

    Rows are visited in order, in consecutive slices of ``batch_size`` rows.
    Each report is displayed as soon as it is produced and the progress bar
    advances by one per row. The root logger's level is set to WARNING.

    Args:
        df: Table of scraped individuals, one per row.
        batch_size: Number of rows per slice.

    Returns:
        (html_reports, text_reports): two lists in row order, holding the
        first and second element of each ``generate_report`` result.
    """
    logging.getLogger().setLevel(logging.WARNING)
    html_reports, text_reports = [], []

    progress = tqdm(
        total=len(df),
        desc="Generating Reports",
        bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}',
    )
    with progress as pbar:
        for start in range(0, len(df), batch_size):
            stop = start + batch_size
            batch_results = []

            for _, row in df[start:stop].iterrows():
                report_html, report_text = generate_report(tokenizer, model, row)
                batch_results.append((report_html, report_text))
                display(HTML(report_html))
                pbar.update(1)

            # Results reach the output lists only once the whole slice is done.
            html_reports.extend(html_part for html_part, _ in batch_results)
            text_reports.extend(text_part for _, text_part in batch_results)

    return html_reports, text_reports

# Generate and display a report for every row of df; keep the HTML cards and
# the prompts that produced them.
html_reports, text_reports = generate_reports(df)




Generating Reports:   0%|          | 0/10