# save pdf to csv success

In [30]:
import requests
from bs4 import BeautifulSoup
import PyPDF2
import pandas as pd
from urllib.parse import urljoin

# Base URL of the webpage with PDF links
base_url = "https://www.bancaditalia.it/compiti/vigilanza/provvedimenti-sanzionatori/index.html?page="

# Initialize an empty list to store all PDF links
all_pdf_links = []

# Loop through all pages
for page_num in range(1, 19):  # Assuming there are 18 pages(be 19 for next try)
    # Construct the URL for the current page
    page_url = base_url + str(page_num)

    # Send a GET request to the webpage
    response = requests.get(page_url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all PDF links on the current page
    for link in soup.find_all('a', href=True):
        pdf_link = link['href']
        if pdf_link.startswith('/compiti/vigilanza/provvedimenti-sanzionatori/documenti/'):
            # Join the relative URL with the base URL
            pdf_link = urljoin('https://www.bancaditalia.it', pdf_link)
            all_pdf_links.append(pdf_link)

# Create a Pandas DataFrame to store the extracted text content
df = pd.DataFrame(columns=['PDF_Link', 'Text_Content'])

# Loop through each PDF link and extract text content
for pdf_link in all_pdf_links:
    # Send a GET request to the PDF link
    pdf_response = requests.get(pdf_link, stream=True)

    # Open the PDF file in binary mode
    with open('temp.pdf', 'wb') as f:
        f.write(pdf_response.content)

    # Read the PDF file using PyPDF2
    with open('temp.pdf', 'rb') as f:
        pdf = PyPDF2.PdfReader(f)
        text_content = ''
        for page in pdf.pages:
            text_content += page.extract_text()

    # Add the extracted text content to the DataFrame
    new_row = {'PDF_Link': pdf_link, 'Text_Content': text_content}
    df.loc[len(df)] = new_row

    # Remove the temporary PDF file
    import os
    os.remove('temp.pdf')

# Save the DataFrame to a CSV file with utf-8 encoding
df.to_csv('pdf_text_content.csv', index=False, encoding='utf-8-sig')

# df.to_json('pdf_text_content.json', orient='records', lines=True, force_ascii=False)


# local llmFull code without date

In [18]:
import pandas as pd
import requests

# Set the LM Studio server URL and port
lmstudio_url = "http://localhost:1234/v1"

# Define a function to send a request to the LM Studio server with token limit
def extract_sanction_info(text):
    max_tokens = 200  # Appropriate token limit based on previous tests
    sanction_info = []

    url = f"{lmstudio_url}/chat/completions"
    prompt = f"Please provide a concise, structured format: Sanctioned entity, amount, and reason based on the following text: \n\n{text}"
    payload = {
        "model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": max_tokens
    }

    try:
        response = requests.post(url, json=payload, headers={"Authorization": "Bearer lm-studio"})
        response_json = response.json()
    except requests.RequestException as e:
        print("Request failed:", e)
        return sanction_info  # Return empty list if there is a request issue

    if "choices" in response_json and len(response_json["choices"]) > 0:
        completion_text = response_json["choices"][0]["message"]["content"]
        for line in completion_text.splitlines():
            line = line.strip()
            if line.startswith("* Sanctioned Entity:"):
                sanction_info.append(line.split(":")[1].strip())
            elif line.startswith("* Amount:"):
                sanction_info.append(line.split(":")[1].strip())
            elif line.startswith("* Reason:"):
                reason = line.split(":")[1].strip()
                sanction_info.append(reason)

    return sanction_info

# Load the CSV file
df = pd.read_csv('pdf_text_content.csv')

# Apply the extraction function to each row in the DataFrame
# Assuming the relevant text is in the second column of the DataFrame
df['Sanction_Info'] = df.iloc[:, 1].apply(extract_sanction_info)

# Print the resulting DataFrame with extracted information
print(df)


                                            PDF_Link  \
0  https://www.bancaditalia.it/compiti/vigilanza/...   
1  https://www.bancaditalia.it/compiti/vigilanza/...   
2  https://www.bancaditalia.it/compiti/vigilanza/...   
3  https://www.bancaditalia.it/compiti/vigilanza/...   
4  https://www.bancaditalia.it/compiti/vigilanza/...   
5  https://www.bancaditalia.it/compiti/vigilanza/...   
6  https://www.bancaditalia.it/compiti/vigilanza/...   
7  https://www.bancaditalia.it/compiti/vigilanza/...   
8  https://www.bancaditalia.it/compiti/vigilanza/...   
9  https://www.bancaditalia.it/compiti/vigilanza/...   

                                        Text_Content  \
0                                                ...   
1                                                ...   
2                                                ...   
3                                                ...   
4                                                ...   
5                                              

make last column in separate columns

In [29]:
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

lmstudio_url = "http://localhost:1234/v1"

def extract_sanction_info(text):
    max_tokens = 200
    url = f"{lmstudio_url}/chat/completions"
    prompt = f"Please provide a concise, structured format: Sanctioned entity, amount, and reason based on the following text: \n\n{text}"
    payload = {
        "model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": max_tokens
    }
    try:
        response = requests.post(url, json=payload, headers={"Authorization": "Bearer lm-studio"})
        response_json = response.json()
        return response_json
    except requests.RequestException as e:
        print(f"Request failed for {text[:30]}...: {str(e)}")
        return None

def process_text_data(text_data):
    entity = ""
    amount = 0
    reason = ""
    if text_data:
        for choice in text_data.get("choices", []):
            completion_text = choice["message"]["content"]
            for line in completion_text.splitlines():
                line = line.strip()
                if line.startswith("* Sanctioned Entity:"):
                    entity = line.split(":")[1].strip()
                elif line.startswith("* Amount:"):
                    # Remove currency symbols and convert to float
                    amount_str = line.split(":")[1].strip()
                    amount = float(amount_str.replace('€', '').replace(',', '').strip())
                elif line.startswith("* Reason:"):
                    reason = line.split(":")[1].strip()
    return entity, amount, reason

df = pd.read_csv('pdf_text_content.csv')
texts = df.iloc[:, 1].tolist()

results = []

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(extract_sanction_info, text): text for text in texts}
    for future in as_completed(futures):
        response_data = future.result()
        entity, amount, reason = process_text_data(response_data)
        results.append((entity, amount, reason))

# Assuming results is a list of tuples (entity, amount, reason)
df_result = pd.DataFrame(results, columns=['Sanctioned_Entity', 'Sanctioned_Amount', 'Sanction_Reason'])
# Combine the original DataFrame with the results
df_final = pd.concat([df, df_result], axis=1)

print(df_final)


                                            PDF_Link  \
0  https://www.bancaditalia.it/compiti/vigilanza/...   
1  https://www.bancaditalia.it/compiti/vigilanza/...   
2  https://www.bancaditalia.it/compiti/vigilanza/...   
3  https://www.bancaditalia.it/compiti/vigilanza/...   
4  https://www.bancaditalia.it/compiti/vigilanza/...   
5  https://www.bancaditalia.it/compiti/vigilanza/...   
6  https://www.bancaditalia.it/compiti/vigilanza/...   
7  https://www.bancaditalia.it/compiti/vigilanza/...   
8  https://www.bancaditalia.it/compiti/vigilanza/...   
9  https://www.bancaditalia.it/compiti/vigilanza/...   

                                        Text_Content  \
0                                                ...   
1                                                ...   
2                                                ...   
3                                                ...   
4                                                ...   
5                                              

# local llmFull code with date(to retry)

In [18]:
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import re

lmstudio_url = "http://localhost:1234/v1"

def clean_amount(amount_str):
    # Debug: Print original amount string
    print("Original Amount String:", amount_str)
    # Remove euro sign and spaces
    cleaned_str = amount_str.replace('€', '').replace(' ', '')
    # Remove all non-numeric characters except digits and commas or dots if they act as thousands separators
    cleaned_str = re.sub(r'[^\d,.]', '', cleaned_str)

    # Remove thousands separators and prepare for summing
    # Assume any ',' or '.' followed by exactly three digits is a thousands separator
    cleaned_numbers = re.sub(r'(\d)[,.](\d{3})(?=\b|\D)', r'\1\2', cleaned_str)

    # Find all separate numbers (assuming they are full integers)
    numbers = re.findall(r'\d+', cleaned_numbers)

    # Convert all found numbers to integers and sum them
    total_amount = sum(int(num) for num in numbers)
    print("Aggregated Amount:", total_amount)
    return total_amount


def extract_sanction_info(text):
    max_tokens = 100
    url = f"{lmstudio_url}/chat/completions"
    prompt = (f"Text: {text}\n"
              "Please extract and return:\n"
              "- Date of Sanction:\n"
              "- Sanctioned Entity:\n"
              "- Amount:\n"
              "- Reason of Sanction (less than 20 words):\n"
              "- Category of Reason(one key words) :\n"
              "- analyze and assign the category of reason, if not found assign other.\n"
              "- When multi num value, aggregate.\n"
              "- Use corp name to represent, not individual name.\n"
              "- Stop when all info obtained.\n")
    payload = {
        "model": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": max_tokens,
        "stop_sequences": ["\n\n"]
    }
    response = requests.post(url, json=payload, headers={"Authorization": "Bearer lm-studio"})
    data = response.json()
    # Debug: Print full response data
    print("Response Data:", data)
    return data

def process_text_data(text_data):
    entity, amount, reason, date_sanction, category_reason = "", 0, "", None, ""
    
    if text_data and 'choices' in text_data:
        completion_text = text_data['choices'][0]['message']['content']
        print("Completion Text:", completion_text)  # Debugging
        
        for line in completion_text.splitlines():
            if "Date of Sanction:" in line:
                date_str = line.split(":")[1].strip()
                try:
                    date_sanction = datetime.strptime(date_str, '%d.%m.%Y')
                    print("Parsed Date of Sanction:", date_sanction)
                except ValueError as e:
                    print("Date parsing error:", e)
            elif "Sanctioned Entity:" in line:
                entity = line.split(":")[1].strip()
                print("Parsed Sanctioned Entity:", entity)
            elif "Amount:" in line:
                amount_str = line.split(":")[1].strip()
                amount = clean_amount(amount_str)
            elif "Reason of Sanction" in line:
                parts = line.split("Reason of Sanction")
                if len(parts) > 1:
                    reason = parts[1].split(":", 1)[1].strip() if ':' in parts[1] else parts[1].strip()
                else:
                    reason = "Reason not specified or improperly formatted"
                print("Parsed Reason of Sanction:", reason)
            elif "Category of Reason" in line:
                parts = line.split("Category of Reason")
                if len(parts) > 1:
                    category_reason = parts[1].split(":", 1)[1].strip() if ':' in parts[1] else parts[1].strip()
                else:
                    category_reason = "Category not specified or improperly formatted"
                print("Parsed Category of Reason:", category_reason)
                
    return entity, amount, reason, date_sanction, category_reason



df = pd.read_csv('pdf_text_content.csv')
# texts = df.iloc[:10, 1].tolist()  # Only select the first 1 records


texts = df.iloc[:, 1].tolist() #COMPLETE 


results = []
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(extract_sanction_info, text) for text in texts]
    for future in as_completed(futures):
        response_data = future.result()
        result = process_text_data(response_data)
        print("Processed Result:", result)
        results.append(result)

# Extracting the additional column from the tuple result
df_result = pd.DataFrame(results, columns=['Sanctioned_Entity', 'Sanctioned_Amount', 'Sanction_Reason', 'Date_of_Sanction', 'Category_of_Reason'])
print(df_result)



Response Data: {'id': 'chatcmpl-g8g4fk46cf7v87bcd7kk9j', 'object': 'chat.completion', 'created': 1714064651, 'model': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q2_K.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': ' Date of Sanction: 6.2.2024\nSanctioned Entity: Generalfinance SpA\nAmount: €30,000\nReason of Sanction: Violations of anti-money laundering and counter-terrorist financing regulations\nCategory of Reason: Anti-Money Laundering/Counter-Terrorist Financing (AML/CTF)'}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 88, 'completion_tokens': 88, 'total_tokens': 176}}
Completion Text:  Date of Sanction: 6.2.2024
Sanctioned Entity: Generalfinance SpA
Amount: €30,000
Reason of Sanction: Violations of anti-money laundering and counter-terrorist financing regulations
Category of Reason: Anti-Money Laundering/Counter-Terrorist Financing (AML/CTF)
Parsed Date of Sanction: 2024-02-06 00:00:00
Parsed Sanctioned Entity: Generalfina

compute
24.7 s for 10 records, model bartowski/stable-code-instruct-3b-GGUF/stable-code-instruct-3b-Q8_0.gguf

## NEW categorical stack

In [21]:
import plotly.express as px
import pandas as pd

# Ensure Date_of_Sanction is in datetime format
df_result['Date_of_Sanction'] = pd.to_datetime(df_result['Date_of_Sanction'])

# Extract the month from the Date_of_Sanction column
df_result['Month'] = df_result['Date_of_Sanction'].dt.to_period('M')

# Group by Month and Category_of_Reason, summing the sanctioned amounts
df_reason_monthly = df_result.groupby(['Month', 'Category_of_Reason'])['Sanctioned_Amount'].sum().reset_index()

# Convert the Month back to string for better plotting
df_reason_monthly['Month'] = df_reason_monthly['Month'].astype(str)

# Plot the stacked bar chart
fig = px.bar(df_reason_monthly, x='Month', y='Sanctioned_Amount', color='Category_of_Reason',
             title="Monthly Sanctioned Amounts by Category of Sanction",
             labels={"Month": "Month of Sanction", "Sanctioned_Amount": "Sanctioned Amount (€)",
                     "Category_of_Reason": "Category of Sanction"})
fig.update_layout(
    width=1200,  # Set a fixed width of 800 pixels
    height=500  # Set a fixed height
)

fig.show()


## group by first word of category

In [27]:
import plotly.express as px
import pandas as pd

# Ensure Date_of_Sanction is in datetime format
df_result['Date_of_Sanction'] = pd.to_datetime(df_result['Date_of_Sanction'])

# Extract the month from the Date_of_Sanction column
df_result['Month'] = df_result['Date_of_Sanction'].dt.to_period('M')

# Extract the first word from the Category_of_Reason column
df_result['First_Word'] = df_result['Category_of_Reason'].str.split().str[0]

# Group by Month and the first word of Category_of_Reason, summing the sanctioned amounts
df_reason_monthly = df_result.groupby(['Month', 'First_Word'])['Sanctioned_Amount'].sum().reset_index()

# Convert the Month back to string for better plotting
df_reason_monthly['Month'] = df_reason_monthly['Month'].astype(str)

# Plot the stacked bar chart
fig = px.bar(df_reason_monthly, x='Month', y='Sanctioned_Amount', color='First_Word',
             title="Monthly Sanctioned Amounts by First Word of Category of Sanction",
             labels={"Month": "Month of Sanction", "Sanctioned_Amount": "Sanctioned Amount (€)",
                     "First_Word": "First Word of Category"})
fig.update_layout(
    width=800,  # Set a fixed width of 800 pixels
    height=500  # Set a fixed height
)

fig.show()


## dynamic scatter plot

In [23]:
import plotly.express as px

fig = px.scatter(df_result, x='Date_of_Sanction', y='Sanctioned_Amount',
                 size='Sanctioned_Amount', 
                 color='Sanctioned_Entity',
                 hover_name='Sanctioned_Entity', size_max=60,
                 title="Sanction Amounts Over Time by Entity",
                 labels={"Date_of_Sanction": "Date of Sanction", "Sanctioned_Amount": "Sanctioned Amount (€)"})
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))
fig.update_layout(
    width=1200,  # Set a fixed width
    height=500  # Set a fixed height
)
fig.show()


## monthly trend data

In [24]:
import plotly.graph_objects as go
import plotly.express as px

def truncate_legend(label):
    max_length = 20
    return label[:max_length] + '...' if len(label) > max_length else label

# Convert 'Date_of_Sanction' column to datetime
df_result['Date_of_Sanction'] = pd.to_datetime(df_result['Date_of_Sanction'])

# Extract month and year from 'Date_of_Sanction' column
df_result['Month'] = df_result['Date_of_Sanction'].dt.strftime('%Y-%m')

# Group by month and entity, summing the sanctioned amounts
df_monthly = df_result.groupby(['Month', 'Sanctioned_Entity'])['Sanctioned_Amount'].sum().reset_index()

# Group by month and count the number of sanctions
sanction_count = df_result.groupby('Month').size().reset_index(name='Sanction_Count')

# Plot the stacked bar chart
fig = px.bar(df_monthly, x='Month', y='Sanctioned_Amount', color='Sanctioned_Entity',
             title="Monthly Sanctioned Amounts by Entity",
             labels={"Month": "Month of Sanction", "Sanctioned_Amount": "Sanctioned Amount (€)"})
fig.update_layout(
    width=800,  # Set a fixed width of 800 pixels
    height=500,  # Set a fixed height
)

# Add line chart for monthly count on the right side
fig.add_trace(
    go.Scatter(x=sanction_count['Month'], y=sanction_count['Sanction_Count'], 
               mode='lines', name='Sanction Count', yaxis='y2')
)

# Update layout to include secondary y-axis
fig.update_layout(
    yaxis2=dict(
        title='Sanction Count',  # Label for secondary y-axis
        overlaying='y',  # Overlay the secondary y-axis on the primary one
        side='right'  # Position the secondary y-axis on the right side
    )
)

# Truncate legend labels
fig.for_each_trace(lambda trace: trace.update(name=truncate_legend(trace.name)))

fig.show()


## yearly trend data

In [25]:
import plotly.express as px
from natsort import natsorted  # Import natural sorting function

def truncate_legend(label):
    max_length = 20
    return label[:max_length] + '...' if len(label) > max_length else label

# Convert 'Date_of_Sanction' column to datetime
df_result['Date_of_Sanction'] = pd.to_datetime(df_result['Date_of_Sanction'])

# Extract year from 'Date_of_Sanction' column
df_result['Year'] = df_result['Date_of_Sanction'].dt.strftime('%Y')

# Group by year and entity, summing the sanctioned amounts
df_yearly = df_result.groupby(['Year', 'Sanctioned_Entity'])['Sanctioned_Amount'].sum().reset_index()

# Convert 'Year' column to integers
df_yearly['Year'] = df_yearly['Year'].astype(int)

# Sort the DataFrame by the 'Year' column
df_yearly = df_yearly.sort_values(by='Year')

# Group by year and count the number of sanctions
sanction_count = df_result.groupby('Year').size().reset_index(name='Sanction_Count')

# Plot the stacked bar chart
fig = px.bar(df_yearly, x='Year', y='Sanctioned_Amount', color='Sanctioned_Entity',
             title="Yearly Sanctioned Amounts by Entity",
             labels={"Year": "Year of Sanction", "Sanctioned_Amount": "Sanctioned Amount (€)"})

# Add line chart for yearly count
fig.add_scatter(x=sanction_count['Year'], y=sanction_count['Sanction_Count'], 
                mode='lines', name='Sanction Count', yaxis='y2')  # Specify y2 for secondary y-axis

fig.update_layout(
    width=800,  # Set a fixed width of 400 pixels
    height=500,  # Set a fixed height
    yaxis2=dict(
        title='Sanction Count',  # Label for secondary y-axis
        overlaying='y',  # Overlay the secondary y-axis on the primary one
        side='right'  # Position the secondary y-axis on the right side
    )
)

# Truncate legend labels
fig.for_each_trace(lambda trace: trace.update(name=truncate_legend(trace.name)))

fig.show()


# option of text searching

In [29]:
import pandas as pd

# Load the CSV file to examine its contents
data = pd.read_csv('pdf_text_content.csv')

# Display the first few rows of the dataframe and the column names to understand its structure
data.head(), data.columns


(                                            PDF_Link  \
 0  https://www.bancaditalia.it/compiti/vigilanza/...   
 1  https://www.bancaditalia.it/compiti/vigilanza/...   
 2  https://www.bancaditalia.it/compiti/vigilanza/...   
 3  https://www.bancaditalia.it/compiti/vigilanza/...   
 4  https://www.bancaditalia.it/compiti/vigilanza/...   
 
                                         Text_Content  
 0                                                ...  
 1                                                ...  
 2                                                ...  
 3                                                ...  
 4                                                ...  ,
 Index(['PDF_Link', 'Text_Content'], dtype='object'))

In [3]:
import re


In [30]:
import datetime

# Function to normalize date format to YYYY-MM-DD
def normalize_date(date_str):
    for fmt in ("%d.%m.%Y", "%d/%m/%Y", "%d-%m-%Y"):
        try:
            return datetime.datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None  # Return None if no format matches

# Revised function to handle amount conversion and ensure proper date formatting
def extract_sanction_details_revised(text):
    # Regex pattern to extract the sanctioned entity, sanction amount, and sanction date
    entity_pattern = re.compile(r"(\b[A-Z\s]+ SPA\b)")
    amount_pattern = re.compile(r"€\s?([\d.,]+)")
    reason_pattern = re.compile(r"irregolarità(?:\sdi\sseguito\sindicate|:\s|sopra\sindicate)(?:\sdi\sseguito\sindicate|:\s|sopra\sindicate)?\s([^–]+)")
    date_pattern = re.compile(r"(\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b)")

    # Extracting data using regex
    entity = re.search(entity_pattern, text)
    amount = re.search(amount_pattern, text)
    reason = re.search(reason_pattern, text)
    date = re.search(date_pattern, text)

    # Extract entity, amount, reason, and date if available
    entity = entity.group(1).strip() if entity else None
    amount = amount.group(1).replace(' ', '').replace(',', '.') if amount else None
    amount = float(amount.replace('.', '', amount.count('.') - 1).strip('.')) if amount else None  # Remove thousands separator, keep decimal
    reason = reason.group(1).strip() if reason else None
    date = normalize_date(date.group(1).strip()) if date else None

    return entity, amount, reason, date

# Apply the revised extraction function to the dataframe
extracted_data_revised = data['Text_Content'].apply(lambda x: extract_sanction_details_revised(x))

# Create a new dataframe with the extracted information
sanction_details_df_revised = pd.DataFrame(extracted_data_revised.tolist(), columns=['Sanctioned Entity', 'Sanction Amount', 'Sanction Reason', 'Sanction Date'])

# Save the new dataframe to a CSV file
output_file_path_revised = 'Sanction_Details_with_Date.csv'
sanction_details_df_revised.to_csv(output_file_path_revised, index=False)

sanction_details_df_revised.head(), sanction_details_df_revised.isnull().sum(), output_file_path_revised


(    Sanctioned Entity  Sanction Amount  \
 0  GENERALFINANCE SPA              3.0   
 1  GENERALFINANCE SPA          30000.0   
 2   SOLUTION BANK SPA              3.0   
 3                None             14.0   
 4                None          75000.0   
 
                                      Sanction Reason Sanction Date  
 0  : \n- carenze in materia di adeguata verifica,...    2019-03-26  
 1  : \n- omesse/errate segnalazioni all’Organismo...    2024-02-06  
 2                                               None    2019-03-26  
 3  : \n- carenze nell’organizzazione e nei contro...    2024-03-01  
 4                                               None    2024-01-09  ,
 Sanctioned Entity    75
 Sanction Amount      21
 Sanction Reason      95
 Sanction Date         5
 dtype: int64,
 'Sanction_Details_with_Date.csv')