In [80]:
import pandas as pd
import numpy as np
import json
import requests # For simulating API calls
from bs4 import BeautifulSoup # For simulating HTML table extraction
import datetime
import random
import string

In [81]:
# --- Configuration for Data Generation ---
NUM_CUSTOMERS = 12820 # Number of dummy customer records
NUM_TRANSACTIONS = 500000 # Number of dummy transaction records

In [82]:
# Configuration for loading common names from Excel
COMMON_NAMES_FILE = 'Name_list.xlsx'
COMMON_NAMES_SHEET = 'Names' # Assuming names are in a sheet named 'Names'

# List of common full names (will be loaded from Excel or use default if file not found)
# This list will be populated by the load_common_names_from_excel function
COMMON_FULL_NAMES = []

In [83]:
# List of countries and their simulated risk levels
# In a real scenario, these would come from external geopolitical risk data providers
COUNTRY_RISK_MAP = {
    'IRAN': 'HIGH', 'NORTH KOREA': 'HIGH', 'SYRIA': 'HIGH', 'CUBA': 'HIGH', 'VENEZUELA': 'HIGH',
    'RUSSIA': 'MEDIUM', 'CHINA': 'MEDIUM', 'INDIA': 'LOW', 'USA': 'LOW', 'UK': 'LOW',
    'GERMANY': 'LOW', 'FRANCE': 'LOW', 'BRAZIL': 'MEDIUM', 'SOUTH AFRICA': 'MEDIUM',
    'NIGERIA': 'MEDIUM', 'AFGHANISTAN': 'HIGH', 'YEMEN': 'HIGH', 'SOMALIA': 'HIGH',
    'LEBANON': 'MEDIUM', 'PAKISTAN': 'MEDIUM'
}
HIGH_RISK_COUNTRIES = [country for country, risk in COUNTRY_RISK_MAP.items() if risk == 'HIGH']
MEDIUM_RISK_COUNTRIES = [country for country, risk in COUNTRY_RISK_MAP.items() if risk == 'MEDIUM']
LOW_RISK_COUNTRIES = [country for country, risk in COUNTRY_RISK_MAP.items() if risk == 'LOW']

In [84]:
# --- Helper function to load common names from Excel ---
# Added 'sheet_name' parameter back to the function definition
def load_common_names_from_excel(filepath, sheet_name):
    """
    Loads a list of full names from an Excel file.
    Assumes the names are in a column named 'Sanctioned_name' in the specified sheet.
    """
    try:
        # Read the Excel file, specifying the sheet name
        names_df = pd.read_excel(filepath, sheet_name=sheet_name)
        # Changed expected column name to 'Sanctioned_name' as per your request
        if 'Sanctioned_name' in names_df.columns:
            print(f"Successfully loaded common names from '{filepath}' sheet '{sheet_name}'.")
            return names_df['Sanctioned_name'].astype(str).tolist()
        else:
            # Corrected error message to use the passed sheet_name
            print(f"Error: 'Sanctioned_name' column not found in '{sheet_name}' of '{filepath}'.")
            return []
    except FileNotFoundError:
        print(f"Warning: Common names file '{filepath}' not found. Using default hardcoded names.")
        return [
            'John Smith', 'Jane Johnson', 'Michael Williams', 'Emily Brown', 'David Jones',
            'Sarah Garcia', 'Chris Miller', 'Anna Davis', 'Robert Rodriguez', 'Maria Martinez',
            'William Taylor', 'Olivia Wilson', 'James Moore', 'Sophia White', 'Benjamin Green',
            'Isabella Hall', 'Lucas King', 'Mia Wright', 'Henry Lopez', 'Charlotte Hill'
        ]
    except Exception as e:
        print(f"An error occurred while loading common names from Excel: {e}")
        return [
            'John Smith', 'Jane Johnson', 'Michael Williams', 'Emily Brown', 'David Jones',
            'Sarah Garcia', 'Chris Miller', 'Anna Davis', 'Robert Rodriguez', 'Maria Martinez',
            'William Taylor', 'Olivia Wilson', 'James Moore', 'Sophia White', 'Benjamin Green',
            'Isabella Hall', 'Lucas King', 'Mia Wright', 'Henry Lopez', 'Charlotte Hill'
        ]

# Load common names at the start
# This line is now correct and uses the string variables defined above
COMMON_FULL_NAMES = load_common_names_from_excel(COMMON_NAMES_FILE, COMMON_NAMES_SHEET)
if not COMMON_FULL_NAMES:
    print("FATAL: COMMON_FULL_NAMES list is empty after attempting to load from Excel and using default. Exiting.")
    exit() # Exit if no names are available



Successfully loaded common names from 'Name_list.xlsx' sheet 'Names'.


In [85]:
# --- Step 1: Sanctions Data (Using your provided UK Sanctions List CSV) ---

print("--- Step 1: Loading Sanctions Data from UK Sanctions List_mean.csv ---")

sanctions_csv_path = 'C:/Users/hp952/UK Sanctions List_mean.csv'

try:
    # Try reading with a compatible encoding
    sanctions_df = pd.read_csv(sanctions_csv_path, header=0, encoding='ISO-8859-1')  # or 'cp1252'
    print(f"Successfully loaded {sanctions_csv_path}. Shape: {sanctions_df.shape}")
    print("First 5 rows of raw sanctions data:")
    print(sanctions_df.head())

    # Renaming columns (optional: use if original columns differ from expected names)
    sanctions_df.columns = sanctions_df.columns.str.strip().str.replace(' ', '_').str.title()

    # Clean and standardize key fields
    sanctions_df['Sanctioned_Name'] = sanctions_df['Sanctioned_Name'].astype(str).str.upper().str.strip()
    sanctions_df['Sanctioned_Address'] = sanctions_df['Sanctioned_Address'].astype(str).str.upper().str.strip()
    sanctions_df['Sanctioned_Dob'] = pd.to_datetime(sanctions_df['Sanctioned_Dob'], errors='coerce').dt.strftime('%y')
    sanctions_df['Sanctioned_Nationality'] = sanctions_df['Sanctioned_Nationality'].astype(str).str.upper().str.strip()
    sanctions_df['Sanctioned_Type'] = sanctions_df['Sanctioned_Type'].astype(str).str.upper().str.strip()
    sanctions_df['Sanctioned_Id'] = sanctions_df['Sanctioned_Id'].astype(str)

    # Clean: Drop rows where name is missing or unknown
    sanctions_df_cleaned = sanctions_df[
        (sanctions_df['Sanctioned_Name'] != 'UNKNOWN SANCTIONED NAME') &
        (sanctions_df['Sanctioned_Name'].str.strip() != '') &
        (sanctions_df['Sanctioned_Name'].str.lower() != 'nan')
    ].copy()

    # Final selected columns
    sanctions_df_cleaned = sanctions_df_cleaned[[
        'Sanctioned_Id', 'Sanctioned_Name', 'Sanctioned_Address',
        'Sanctioned_Dob', 'Sanctioned_Nationality', 'Sanctioned_Type'
    ]]

    print("\n✅ Cleaned Sanctions Data (first 5 rows):")
    print(sanctions_df_cleaned.head())

except FileNotFoundError:
    print(f"❌ Error: The file '{sanctions_csv_path}' was not found.")
    sanctions_df_cleaned = pd.DataFrame()

except Exception as e:
    print(f"❌ An error occurred while processing the sanctions CSV: {e}")
    sanctions_df_cleaned = pd.DataFrame()


--- Step 1: Loading Sanctions Data from UK Sanctions List_mean.csv ---
Successfully loaded C:/Users/hp952/UK Sanctions List_mean.csv. Shape: (12377, 6)
First 5 rows of raw sanctions data:
  Sanctioned_ID             Sanctioned_name Sanctioned_Address Sanctioned_DOB  \
0     Cust00001  ZADACHIN ANDREI ANDREEVICH                NaN           1990   
1     Cust00002                         1-P        KAFIA KINGI           1994   
2     Cust00003                         1-P        KAFIA KINGI           1993   
3     Cust00004                         1-P        KAFIA KINGI           1995   
4     Cust00005                         1-P        KAFIA KINGI           1992   

  Sanctioned_Nationality         Sanctioned_Type  
0                 Russia  Primary name variation  
1                      0                     AKA  
2                      0                     AKA  
3                      0                     AKA  
4                      0                     AKA  

✅ Cleaned Sanction

In [86]:


# --- Step 2: Simulating Customer Data (CSV and JSON) ---
print("\n--- Step 2: Simulating Customer Data ---")

def generate_dummy_customer_data(num_customers):
    """Generates a DataFrame of dummy customer data."""
    customers = []
    for i in range(1, num_customers + 1):
        customer_id = f'CUST{i:05d}'
        # Use a single full name from the new list
        customer_name = random.choice(COMMON_FULL_NAMES)
        customer_address = f"{random.randint(100, 999)} {random.choice(['Main St', 'Oak Ave', 'Pine Ln'])}"
        customer_dob = (datetime.date(1950, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 50))).strftime('%Y-%m-%d')
        customer_nationality = random.choice(list(COUNTRY_RISK_MAP.keys()))
        customer_country = random.choice(list(COUNTRY_RISK_MAP.keys()))
        customer_industry = random.choice(['Financial Services', 'Retail', 'Technology', 'Manufacturing', 'Healthcare'])
        onboarding_date = (datetime.date(2020, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 3))).strftime('%Y-%m-%d')

        # Introduce some 'risky' customers that might match sanctions list
        if i % 10 == 0 and not sanctions_df_cleaned.empty: # Every 10th customer, try to make a fuzzy match
            sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
            customer_name = sanctioned_entity['Sanctioned_Name'].replace('A', 'a', 1).replace('E', 'e', 1) # Slight variation
            customer_address = sanctioned_entity['Sanctioned_Address'].replace('ST', 'Street', 1) # Slight variation
            customer_dob = sanctioned_entity['Sanctioned_Dob'] # Exact DOB match
            customer_nationality = sanctioned_entity['Sanctioned_Nationality']
            customer_country = sanctioned_entity['Sanctioned_Nationality'] # Assume country is same as nationality for simplicity

        customers.append({
            'Customer_Id': customer_id,
            'Customer_Name': customer_name,
            'Customer_Address': customer_address,
            'Customer_Dob': customer_dob,
            'Customer_Nationality': customer_nationality,
            'Customer_Country': customer_country,
            'Customer_Industry': customer_industry,
            'Onboarding_Date': onboarding_date
        })
    return pd.DataFrame(customers)

customer_df = generate_dummy_customer_data(NUM_CUSTOMERS)
print(f"Generated {NUM_CUSTOMERS} dummy customer records. Shape: {customer_df.shape}")
print("First 5 rows of dummy customer data:")
print(customer_df.head())

# Save as CSV
customer_df.to_csv('customer_data.csv', index=False)
print("Customer data saved to 'customer_data.csv'")

# Save as JSON (example of another format)
customer_df.to_json('customer_data.json', orient='records', indent=4)
print("Customer data saved to 'customer_data.json'")

# --- Loading Customer Data from JSON (Example) ---
# This simulates loading from a JSON data provider
print("\nLoading customer data from 'customer_data.json' (simulated JSON ingestion):")
customer_df_from_json = pd.read_json('customer_data.json')
print(customer_df_from_json.head(2))



--- Step 2: Simulating Customer Data ---
Generated 12820 dummy customer records. Shape: (12820, 8)
First 5 rows of dummy customer data:
  Customer_Id                 Customer_Name Customer_Address Customer_Dob  \
0   CUST00001           GHARIB FADIL MAHMUD      190 Oak Ave   1959-10-13   
1   CUST00002  MANTUROV DENIS VALENTINOVICH      162 Oak Ave   1991-02-13   
2   CUST00003                   TRINITI JSC      515 Main St   1955-02-19   
3   CUST00004    SAVELYEV OLEG GENRIKHOVICH      191 Oak Ave   1971-12-13   
4   CUST00005     TAYMAZOV ARTUR BORISOVICH      611 Oak Ave   1977-08-01   

  Customer_Nationality Customer_Country   Customer_Industry Onboarding_Date  
0               RUSSIA          SOMALIA  Financial Services      2022-04-05  
1         SOUTH AFRICA             CUBA              Retail      2022-01-04  
2                INDIA          NIGERIA              Retail      2022-02-27  
3                INDIA          GERMANY       Manufacturing      2021-05-28  
4         

In [87]:

# --- Step 3: Simulating Transaction Data (CSV and API) ---
print("\n--- Step 3: Simulating Transaction Data ---")

def generate_dummy_transaction_data(num_transactions, customer_ids):
    """Generates a DataFrame of dummy transaction data."""
    transactions = []
    for i in range(1, num_transactions + 1):
        transaction_id = f'TXN{i:07d}'
        customer_id = random.choice(customer_ids)
        transaction_date = (datetime.date(2024, 1, 1) + datetime.timedelta(days=random.randint(0, 150))).strftime('%Y-%m-%d')
        amount = round(random.uniform(100, 100000), 2)
        currency = random.choice(['USD', 'EUR', 'GBP', 'INR'])
        transaction_type = random.choice(['Wire Transfer', 'ACH', 'Card Payment', 'Cash Deposit', 'Cash Withdrawal'])
        
        # Introduce some 'risky' transactions
        counterparty_name = random.choice(COMMON_FULL_NAMES) # Use the new full names list
        counterparty_country = random.choice(LOW_RISK_COUNTRIES) # Default to low risk

        if random.random() < 0.1: # 10% chance for a high-risk country transaction
            counterparty_country = random.choice(HIGH_RISK_COUNTRIES)
            if random.random() < 0.3 and not sanctions_df_cleaned.empty: # 30% chance for a counterparty name similar to sanctioned
                sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
                counterparty_name = sanctioned_entity['Sanctioned_Name'].replace('O', 'o', 1) # Slight variation

        transactions.append({
            'Transaction_ID': transaction_id,
            'Customer_ID': customer_id,
            'Transaction_Date': transaction_date,
            'Amount': amount,
            'Currency': currency,
            'Counterparty_Name': counterparty_name,
            'Counterparty_Country': counterparty_country,
            'Transaction_Type': transaction_type
        })
    return pd.DataFrame(transactions)

transaction_df = generate_dummy_transaction_data(NUM_TRANSACTIONS, customer_df['Customer_Id'].tolist())
print(f"Generated {NUM_TRANSACTIONS} dummy transaction records. Shape: {transaction_df.shape}")
print("First 5 rows of dummy transaction data:")
print(transaction_df.head())

# Save as CSV
transaction_df.to_csv('transaction_data.csv', index=False)
print("Transaction data saved to 'transaction_data.csv'")

# --- Simulating API Data Provider for Transaction Data ---
# In a real scenario, you'd make an actual HTTP request to an API endpoint.
# Here, we'll simulate an API response by converting our DataFrame to JSON.

def get_transactions_from_api_mock(customer_id=None, limit=100):
    """
    Simulates fetching transaction data from an API.
    In a real API, you'd use requests.get(api_url, params={'customer_id': customer_id, 'limit': limit})
    """
    if customer_id:
        filtered_txns = transaction_df[transaction_df['Customer_ID'] == customer_id].head(limit)
    else:
        filtered_txns = transaction_df.head(limit)
    
    # Simulate API response structure (e.g., JSON)
    return {'status': 'success', 'data': filtered_txns.to_dict(orient='records')}

print("\nSimulating API call for transactions (e.g., for CUST00001):")
api_response = get_transactions_from_api_mock(customer_id='CUST00001', limit=5)
if api_response['status'] == 'success':
    api_transactions_df = pd.DataFrame(api_response['data'])
    print(api_transactions_df)
else:
    print("API call failed (simulated).")




--- Step 3: Simulating Transaction Data ---
Generated 500000 dummy transaction records. Shape: (500000, 8)
First 5 rows of dummy transaction data:
  Transaction_ID Customer_ID Transaction_Date    Amount Currency  \
0     TXN0000001   CUST01799       2024-04-13  67473.47      GBP   
1     TXN0000002   CUST01728       2024-03-13  91786.34      GBP   
2     TXN0000003   CUST10895       2024-04-06  41728.63      INR   
3     TXN0000004   CUST06337       2024-03-30  38780.99      GBP   
4     TXN0000005   CUST10354       2024-01-21  65506.06      EUR   

                                   Counterparty_Name Counterparty_Country  \
0  OPERATING ORGANIZATION OF ZAPORIZHZHIA NUCLEAR...               FRANCE   
1                        ABDIKADIR ABDIKADIR MOHAMED          AFGHANISTAN   
2                                       FREE DONBASS                   UK   
3                           B&H WEST COUNTRY SECTION                   UK   
4                                   BIN MARWAN BILAL      

In [88]:
# --- Step 4: Simulating External Data Sources ---
print("\n--- Step 4: Simulating External Data Sources (Geopolitical Risk, Adverse Media) ---")

# Geopolitical Risk Data (Simple Dictionary Lookup)
# This would typically come from a structured file or API from a risk data provider.
# For simplicity, we'll use our COUNTRY_RISK_MAP directly.
print("\nSimulated Geopolitical Risk Data (from COUNTRY_RISK_MAP):")
for country, risk in COUNTRY_RISK_MAP.items():
    print(f"  {country}: {risk}")

# Adverse Media Screening (AMS) Data (Simulated API/CSV)
# In reality, this would involve searching news articles for a name and getting a sentiment/risk score.
def get_adverse_media_hit_mock(name):
    """Simulates an adverse media screening API call."""
    mock_media_data = {
        'JOHN DOE': {'hit': True, 'severity': 'HIGH', 'keywords': ['fraud', 'sanctions evasion']},
        'JANE SMITH': {'hit': True, 'severity': 'MEDIUM', 'keywords': ['money laundering investigation']},
        'ALIBABA': {'hit': False}, # No hit for this one
        'VLADIMIR PUTIN': {'hit': True, 'severity': 'CRITICAL', 'keywords': ['sanctioned leader', 'corruption']},
        'KIM JONG-UN': {'hit': True, 'severity': 'CRITICAL', 'keywords': ['nuclear program', 'human rights violations']},
    }
    # Normalize name for lookup
    clean_name = name.upper().strip()
    return mock_media_data.get(clean_name, {'hit': False})

print("\nSimulating Adverse Media Screening for 'JOHN DOE':")
print(get_adverse_media_hit_mock('JOHN DOE'))
print("\nSimulating Adverse Media Screening for 'Robert Johnson':")
print(get_adverse_media_hit_mock('Robert Johnson'))



--- Step 4: Simulating External Data Sources (Geopolitical Risk, Adverse Media) ---

Simulated Geopolitical Risk Data (from COUNTRY_RISK_MAP):
  IRAN: HIGH
  NORTH KOREA: HIGH
  SYRIA: HIGH
  CUBA: HIGH
  VENEZUELA: HIGH
  RUSSIA: MEDIUM
  CHINA: MEDIUM
  INDIA: LOW
  USA: LOW
  UK: LOW
  GERMANY: LOW
  FRANCE: LOW
  BRAZIL: MEDIUM
  SOUTH AFRICA: MEDIUM
  NIGERIA: MEDIUM
  AFGHANISTAN: HIGH
  YEMEN: HIGH
  SOMALIA: HIGH
  LEBANON: MEDIUM
  PAKISTAN: MEDIUM

Simulating Adverse Media Screening for 'JOHN DOE':
{'hit': True, 'severity': 'HIGH', 'keywords': ['fraud', 'sanctions evasion']}

Simulating Adverse Media Screening for 'Robert Johnson':
{'hit': False}


In [89]:

# --- Step 5: HTML Table Extraction (Less common for sensitive data, but for demonstration) ---
print("\n--- Step 5: HTML Table Extraction (Demonstration Only) ---")
# This is typically used for publicly available, less sensitive data like stock tables.
# Not recommended for core sanctions or AML data due to lack of reliability and legal issues.

# Example: Simulate a simple HTML table
html_content = """
<html>
<body>
  <h1>Public Data Table</h1>
  <table id="myTable">
    <thead>
      <tr><th>Name</th><th>City</th><th>Status</th></tr>
    </thead>
    <tbody>
      <tr><td>Alice</td><td>New York</td><td>Active</td></tr>
      <tr><td>Bob</td><td>London</td><td>Inactive</td></tr>
      <tr><td><td>Charlie</td><td>Paris</td><td>Active</td></tr>
    </tbody>
  </table>
</body>
</html>
"""

# Using BeautifulSoup to parse HTML and extract table
try:
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'id': 'myTable'})
    
    if table:
        headers = [th.text for th in table.find('thead').find_all('th')]
        rows = []
        for tr in table.find('tbody').find_all('tr'):
            rows.append([td.text for td in tr.find_all('td')])
        
        html_df = pd.DataFrame(rows, columns=headers)
        print("\nData extracted from simulated HTML table:")
        print(html_df)
    else:
        print("No table found with id 'myTable' in simulated HTML.")
except Exception as e:
    print(f"Error during HTML parsing: {e}")


# --- Summary of Loaded DataFrames ---
print("\n--- Summary of Loaded DataFrames ---")
print(f"Sanctions Data (cleaned): {sanctions_df_cleaned.shape[0]} rows")
print(f"Customer Data (simulated): {customer_df.shape[0]} rows")
print(f"Transaction Data (simulated): {transaction_df.shape[0]} rows")

print("\nData collection simulation complete. You now have cleaned sanctions data and generated dummy customer and transaction data ready for the next steps.")



--- Step 5: HTML Table Extraction (Demonstration Only) ---
Error during HTML parsing: 3 columns passed, passed data had 4 columns

--- Summary of Loaded DataFrames ---
Sanctions Data (cleaned): 12377 rows
Customer Data (simulated): 12820 rows
Transaction Data (simulated): 500000 rows

Data collection simulation complete. You now have cleaned sanctions data and generated dummy customer and transaction data ready for the next steps.
