# 📁 Project: RiskGuardian AI — Real-Time Risk Detection and Compliance Monitor

In [26]:
# === 00_setup.py ===
# Environment & data pipeline setup for RiskGuardian
import os
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

# Setup folders in project root
Path("../data/raw").mkdir(parents=True, exist_ok=True)
Path("../data/processed").mkdir(parents=True, exist_ok=True)
Path("../models").mkdir(parents=True, exist_ok=True)
Path("../artifacts").mkdir(parents=True, exist_ok=True)
Path("../data/logs").mkdir(parents=True, exist_ok=True)

print("✅ Folder structure created.")


✅ Folder structure created.


In [48]:
# Example: SEC EDGAR 10-K Filing Data (Public Companies)
# === Function to Fetch and Save SEC Filing Data ===
def fetch_sec_10k_10q_from_index(cik: str, company_name: str, user_agent: str = "RiskGuardianBot/0.1"):
    BASE_URL = "https://www.sec.gov/Archives/edgar/full-index/2024/QTR1/company.idx"
    HEADERS = {"User-Agent": "RiskGuardianBot/1.0 (abdoulkarim.toure@gmail.com)"}

    response = requests.get(BASE_URL, headers=HEADERS)
    if response.status_code == 200:
        text = response.text
        lines = text.splitlines()[11:] # Skip header lines
        matches = [line for line in lines if cik in line and ("10-K" in line or "10-Q" in line)]

        if matches:
            line = matches[0]  # Get the first match
            parts = line.split()
            form_type = parts[1] # e.g., "10-K" or "10-Q"
            path = parts[-1]  # e.g., "0001018724-24-000001.txt"
            doc_url = f"https://www.sec.gov/Archives/{path}"

            html_resp = requests.get(doc_url, headers=HEADERS)
            if html_resp.status_code == 200:
                soup = BeautifulSoup(html_resp.text, "html.parser")
                filing_text = soup.get_text(separator="\n")
                with open(f"../data/raw/{company_name}_filing.txt", "w", encoding="utf-8") as f:
                    f.write(filing_text)
                print(f"✅ {company_name} {form_type} filing saved.")
            else:
                print(f"❌ Filing download failed: {html_resp.status_code}")
        else:
            print(f"⚠️ No 10-K or 10-Q found in index for {company_name}.")
            pd.DataFrame([[company_name, cik, "No 10-K/10-Q found"]],
                         columns=["company", "cik", "status"]).to_csv("../data/logs/skipped_companies.csv", mode='a', header=not os.path.exists("../data/logs/skipped_companies.csv"), index=False)
    else:
        print(f"❌ Index download failed: {response.status_code}")
        pd.DataFrame([[company_name, cik, f"Fetch error {response.status_code}"]],
                     columns=["company", "cik", "status"]).to_csv("../data/logs/skipped_companies.csv", mode='a', header=not os.path.exists("../data/logs/skipped_companies.csv"), index=False)


        

In [49]:
# === Loop over multiple companies ===
companies = {
    "Apple": "0000320193",
    "Microsoft": "0000789019",
    "Meta": "0001326801",
    "Amazon": "0001018724",
    "Google": "0001652044",
    "Rise": "0001640967",
    "Tesla": "0001318605",
    "Nvidia": "0001045810"
}

for name, cik in companies.items():
    fetch_sec_10k_10q_from_index(cik, name)

✅ Apple Inc. filing saved.
⚠️ No 10-K or 10-Q found in index for Microsoft.
✅ Meta Platforms, filing saved.
✅ Amazon COM filing saved.
✅ Google Inc. filing saved.
⚠️ No 10-K or 10-Q found in index for Rise.
⚠️ No 10-K or 10-Q found in index for Tesla.
✅ Nvidia CORP filing saved.


In [52]:
# === 01_preprocess.py ===
# Basic cleaning and prep of fetched filings
import os
import pandas as pd

for company in companies:
    path = f"../data/raw/{company}_filing.txt"
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        with open(f"../data/processed/{company}_filing_clean.txt", "w", encoding="utf-8") as f:
            for line in lines:
                f.write(line.strip() + "\n")
        print(f"✅ Cleaned filing text for {company}.")
    else:
        print(f"⚠️ Skipping {company}: No raw filing found.")

✅ Cleaned filing text for Apple.
⚠️ Skipping Microsoft: No raw filing found.
✅ Cleaned filing text for Meta.
✅ Cleaned filing text for Amazon.
✅ Cleaned filing text for Google.
⚠️ Skipping Rise: No raw filing found.
⚠️ Skipping Tesla: No raw filing found.
✅ Cleaned filing text for Nvidia.
