# In [None]:
import pandas as pd
import re
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import threading
import os
import shutil

# Keywords for NACE classification.
# Maps each NACE code to the title keywords that trigger it. The analysis
# walks this mapping in insertion order, so the order of the keys is also the
# order in which matched codes are listed.
nace_keywords = {
    "A": [
        "Agriculture",
        "Food",
        "Palm Oil",
    ],
    "B": [
        "Mining",
        "Oil",
        "Petroleum",
        "Raw Material",
    ],
    "C13": [
        "Textiles",
        "Fashion",
        "Clothing",
    ],
    "C20": [
        "Chemicals",
        "Pharmaceutical",
    ],
    "C22": [
        "Plastics",
    ],
    "C26": [
        "Electronics",
        "Technology",
    ],
    "D": [
        "Utilities",
        "Waste",
        "Energy",
        "Power",
    ],
    "G": [
        "Supply Chain",
        "Retail",
        "Trade",
    ],
    "H": [
        "Transport",
    ],
    "K": [
        "Financial",
        "Corporate Responsibility",
        "Investment",
    ],
}

# Trustworthiness scoring
def trust_score(author: str) -> int:
    """Rate how trustworthy a source is, based on its author/publisher string.

    The author text is searched case-insensitively for known organisation
    names. A name only counts when it appears as a whole word, so short
    acronyms such as "UN", "EU" or "NGO" no longer fire on unrelated text
    ("Unknown", "Reuters", "Mango").

    Args:
        author: Author or publisher. Any value is accepted; it is converted
            with ``str()`` before matching.

    Returns:
        10 if a high-trust name is found, 7 if a medium-trust name is found,
        otherwise 4.
    """
    trusted_high = ["UN", "OECD", "ILO", "EU", "Bundesministerium", "University", "Max Planck"]
    trusted_medium = ["NGO", "Fairtrade", "McKinsey", "Oxfam", "Vision"]
    author = str(author)

    def mentions_any(names):
        # \b anchors keep the match on word boundaries; re.escape guards
        # against names that might contain regex metacharacters.
        return any(
            re.search(rf"\b{re.escape(name)}\b", author, re.IGNORECASE)
            for name in names
        )

    if mentions_any(trusted_high):
        return 10
    if mentions_any(trusted_medium):
        return 7
    return 4

# Check if entry is already shown in the GUI
def entry_already_in_gui(title, author):
    """Return True if the table already holds a row with this title and author.

    Every top-level row of the module-level ``tree`` widget is inspected; a
    row matches when it has at least two cell values and the first two equal
    ``title`` and ``author`` respectively.
    """
    displayed_rows = (tree.item(item_id)["values"] for item_id in tree.get_children())
    return any(
        len(cells) >= 2 and cells[0] == title and cells[1] == author
        for cells in displayed_rows
    )

# Analyze input Excel file
def analyze_file(path):
    """Classify the sources listed in an Excel workbook and persist the result.

    Each row's title is matched against ``nace_keywords`` (whole-word,
    case-insensitive) and its author is rated with ``trust_score``. Rows whose
    (Title, Author) pair is already shown in the GUI, or already occurred
    earlier in the same workbook, are skipped and reported in an info dialog.
    The new rows are appended to ``nace_source_evaluation.xlsx`` in the
    current working directory.

    Args:
        path: Path of the workbook to read. It must contain the columns
            "Title" and "Author".

    Returns:
        A DataFrame with the newly analyzed rows (columns Title, Author,
        NACE_Assignment, Trust_Score, Timestamp), or None when the workbook
        could not be read, lacks a required column, or contained no new rows.
    """
    try:
        df_input = pd.read_excel(path)
    except Exception as e:
        messagebox.showerror("Error loading file", str(e))
        return None

    # Report a missing column the same way as an unreadable file instead of
    # letting the row loop die with a KeyError.
    missing = [col for col in ("Title", "Author") if col not in df_input.columns]
    if missing:
        messagebox.showerror(
            "Error loading file",
            "Missing required column(s): " + ", ".join(missing),
        )
        return None

    filename = "nace_source_evaluation.xlsx"
    if os.path.exists(filename):
        df_existing = pd.read_excel(filename)
    else:
        df_existing = pd.DataFrame()

    # Compile the keyword patterns once per call rather than once per row;
    # re.escape keeps keywords literal even if one contains a metacharacter.
    code_patterns = [
        (code, [re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE) for kw in keywords])
        for code, keywords in nace_keywords.items()
    ]

    results = []
    skipped_titles = []
    seen_in_batch = set()

    for _, row in df_input.iterrows():
        title = str(row["Title"])
        author = str(row["Author"])

        # The GUI shows each (title, author) pair only once, so a pair that
        # repeats inside this workbook must not be written out twice either.
        key = (title, author)
        if key in seen_in_batch or entry_already_in_gui(title, author):
            skipped_titles.append(title)
            continue
        seen_in_batch.add(key)

        # Assign NACE codes
        matched_codes = [
            code
            for code, patterns in code_patterns
            if any(pattern.search(title) for pattern in patterns)
        ]
        nace_codes = ", ".join(matched_codes) if matched_codes else "Undefined"

        # Key order below is the column order the GUI table relies on.
        results.append({
            "Title": title,
            "Author": author,
            "NACE_Assignment": nace_codes,
            "Trust_Score": trust_score(author),
            "Timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        })

    if skipped_titles:
        info_text = "The following titles have already been analyzed and were skipped:\n\n" + "\n".join(skipped_titles)
        messagebox.showinfo("Info", info_text)

    if not results:
        return None

    df_output = pd.DataFrame(results)

    if not df_existing.empty:
        df_total = pd.concat([df_existing, df_output], ignore_index=True)
    else:
        df_total = df_output

    df_total.to_excel(filename, index=False)
    return df_output

# Load and process file from GUI
def load_file():
    """Button callback: pick a workbook, analyze it, show and archive it.

    Does nothing when the dialog is cancelled or when ``analyze_file`` returns
    None. Otherwise the new rows are added to the table, the workbook is moved
    into the ``processed`` folder, and a completion dialog is shown.
    """
    selected = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    if not selected:
        return

    new_rows = analyze_file(selected)
    if new_rows is None:
        return

    for _, record in new_rows.iterrows():
        if entry_already_in_gui(record["Title"], record["Author"]):
            continue
        tree.insert("", tk.END, values=list(record))

    archive_dir = "processed"
    os.makedirs(archive_dir, exist_ok=True)
    destination = os.path.join(archive_dir, os.path.basename(selected))

    # A failed move is only reported on stdout; the completion dialog below
    # is shown either way.
    try:
        shutil.move(selected, destination)
    except Exception as e:
        print(f"Error moving file: {e}")

    messagebox.showinfo("Done", "Analysis completed and saved as 'nace_source_evaluation.xlsx'")

# GUI setup: main window, header with the load button, and the result table.
root = tk.Tk()
root.title("NACE Source Analysis Tool")
root.geometry("1200x600")
root.configure(bg="#f0f4f8")

# Header area
frame_top = tk.Frame(root, bg="#f0f4f8")
frame_top.pack(pady=20)

lbl_title = tk.Label(
    frame_top,
    text="NACE Classification and Trust Evaluation",
    font=("Arial", 16, "bold"),
    bg="#f0f4f8",
)
lbl_title.pack(pady=5)

btn_load = tk.Button(
    frame_top,
    text="Load & Analyze Excel File",
    command=load_file,
    font=("Arial", 12),
    bg="#4CAF50",
    fg="white",
    padx=10,
    pady=5,
)
btn_load.pack()

# Result table
frame_table = tk.Frame(root)
frame_table.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

style = ttk.Style()
style.configure("Treeview.Heading", font=("Arial", 11, "bold"))
style.configure("Treeview", font=("Arial", 10))

# Column order matches the order of the fields in each analyzed row.
columns = ["Title", "Author", "NACE_Assignment", "Trust_Score", "Timestamp"]
tree = ttk.Treeview(
    frame_table,
    columns=columns,
    show="headings",
    height=20,
)

for col in columns:
    tree.heading(col, text=col)
    tree.column(col, anchor="w", width=230)
tree.pack(fill=tk.BOTH, expand=True)

# Watchdog handler
class ExcelHandler(FileSystemEventHandler):
    """Analyze and archive every new ``.xlsx`` workbook in the watched folder."""

    def on_created(self, event):
        """Handle a "created" event from the observer.

        Directories, non-``.xlsx`` files and Office lock files are ignored.
        For a workbook that yields new rows, the GUI update is scheduled via
        ``tree.after`` and the file is then moved into the ``processed``
        folder.

        Args:
            event: The watchdog event; ``event.src_path`` is the created path.
        """
        if event.is_directory:
            return

        name = os.path.basename(event.src_path)
        # While a workbook is open, Excel keeps a "~$<name>.xlsx" lock file
        # next to it. It carries the .xlsx suffix but is not a workbook.
        if name.startswith("~$"):
            return
        # Accept ".XLSX" and similar spellings as well.
        if not name.lower().endswith(".xlsx"):
            return

        df = analyze_file(event.src_path)
        if df is None:
            return

        # Hand the table update over to Tk's event loop.
        tree.after(0, update_gui, df)

        processed_folder = "processed"
        os.makedirs(processed_folder, exist_ok=True)
        target_path = os.path.join(processed_folder, name)
        try:
            shutil.move(event.src_path, target_path)
        except Exception as e:
            print(f"Error moving file: {e}")

# Update GUI with new data
def update_gui(df):
    """Append the rows of ``df`` to the table, skipping pairs already shown.

    A row is skipped when ``entry_already_in_gui`` reports that its "Title"
    and "Author" values are already displayed.
    """
    for _, record in df.iterrows():
        if entry_already_in_gui(record["Title"], record["Author"]):
            continue
        tree.insert("", tk.END, values=list(record))

# Folder monitoring
def monitor_folder(path):
    """Start a watchdog observer that passes new files in ``path`` to ExcelHandler.

    The folder is created first if it does not exist yet. Only ``path``
    itself is watched, not its subfolders.

    Args:
        path: Directory to watch for newly created files.
    """
    # Watching a path that does not exist is expected to fail when the
    # observer starts, and here that would happen unnoticed inside a daemon
    # thread. Create the drop folder up front instead.
    os.makedirs(path, exist_ok=True)

    event_handler = ExcelHandler()
    observer = Observer()
    observer.schedule(event_handler, path=path, recursive=False)
    observer.start()

# Start watching the "incoming_sources" folder on a daemon thread, then enter
# the Tk event loop.
monitor_thread = threading.Thread(target=monitor_folder, args=("incoming_sources",), daemon=True)
monitor_thread.start()

root.mainloop()
