# In [3]:  (Jupyter notebook cell marker left over from the export)
import pandas as pd
import re
import tkinter as tk
from datetime import datetime
from tkinter import filedialog, messagebox, ttk
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import threading
import os
import shutil
from newspaper import Article
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Ensure model directory exists
# (this only creates the folder; the model files themselves must already be there)
os.makedirs("models", exist_ok=True)

# Load AI
# Both artifacts are loaded once at import time. `vectorizer` is later used via
# .transform() and `nace_model` via .predict() to propose a NACE code for a text.
# joblib.load raises if either file is missing, so the script cannot start without them.
# NOTE(review): joblib files are pickle-based - only load model files from a trusted source.
vectorizer = joblib.load("models/nace_vectorizer.pkl")
nace_model = joblib.load("models/nace_model.pkl")


# Combined risk keyword dictionary: English and German
# Structure: risk parameter -> level label -> list of trigger words/phrases.
# guess_risk_parameters_from_text() lowercases the text before matching, so all
# keywords are written in lowercase. It walks the levels in the order written
# here and stops at the first keyword hit, i.e. the first-listed level wins when
# keywords of several levels occur in the same text.
risk_keywords_combined = {
    "Severity": {
        "HIGH": ["critical", "severe", "catastrophic", "unacceptable", "kritisch", "schwerwiegend", "katastrophal", "inakzeptabel"],
        "MEDIUM": ["moderate", "considerable", "noticeable", "moderat", "erheblich", "merklich"],
        "LOW": ["minor", "negligible", "minimal", "geringfügig", "unbedeutend", "minimal"]
    },
    "Probability": {
        "HIGH": ["likely", "frequent", "common", "probable", "wahrscheinlich", "häufig", "regelmäßig", "oft"],
        "MEDIUM": ["possible", "occasional", "might happen", "möglich", "gelegentlich", "denkbar"],
        "LOW": ["unlikely", "rare", "seldom", "unwahrscheinlich", "selten", "kaum"]
    },
    "Scope": {
        "GLOBAL": ["worldwide", "global", "international", "weltweit", "global", "international"],
        "REGIONAL": ["regional", "continental", "european", "asian", "regional", "kontinental", "europaweit"],
        "LOCAL": ["local", "community", "municipal", "national", "lokal", "gemeindebezogen", "kommunal", "national"]
    },
    "Urgency": {
        "IMMEDIATE": ["urgent", "immediately", "as soon as possible", "critical", "sofort", "dringend", "unverzüglich", "kritisch"],
        "MEDIUM TERM": ["soon", "in the near future", "within months", "zeitnah", "in nächster zeit", "innerhalb von monaten"],
        "LONG TERM": ["long term", "future", "eventually", "in the coming years", "langfristig", "zukünftig", "in den kommenden jahren"]
    },
    "NACE_Relevant": {
        "HIGH": ["direct impact", "core industry", "sector-critical", "direkte auswirkung", "zentrale branche", "branchenkritisch"],
        "MEDIUM": ["related sector", "adjacent industry", "verwandte branche", "benachbarter sektor"],
        "LOW": ["minor connection", "indirect relevance", "geringe verbindung", "indirekte relevanz"]
    },
    "Company_Influence": {
        "HIGH": ["company decision", "internal policy", "corporate governance", "unternehmensentscheidung", "interne richtlinie", "konzernführung"],
        "MEDIUM": ["partnership", "supply chain", "influence", "partnerschaft", "lieferkette", "einflussnahme"],
        "LOW": ["external", "third party", "limited control", "externe stelle", "drittanbieter", "begrenzte kontrolle"]
    },
    "Data_Availability": {
        "HIGH": ["detailed report", "comprehensive", "full access", "detaillierter bericht", "umfassend", "vollständige daten"],
        "MEDIUM": ["partial data", "limited insights", "some statistics", "teilweise daten", "eingeschränkter einblick", "einige statistiken"],
        "LOW": ["no data", "not disclosed", "incomplete information", "keine daten", "nicht veröffentlicht", "unvollständig"]
    },
    "Regulation": {
        "HIGH": ["strict law", "mandatory", "compliance required", "strenges gesetz", "verbindlich", "compliance erforderlich"],
        "MEDIUM": ["recommended", "framework", "some regulation", "empfohlen", "rahmenbedingungen", "teilweise reguliert"],
        "LOW": ["voluntary", "no legal requirement", "unregulated", "freiwillig", "keine gesetzliche pflicht", "unreguliert"]
    }
}

# Function to guess risk parameter levels from text (EN + DE)
def guess_risk_parameters_from_text(text, keywords=None):
    """Guess a level for every risk parameter from keywords found in *text*.

    Args:
        text: Free text (e.g. article title plus body). ``None`` or an empty
            string is treated as text without any keyword.
        keywords: Optional keyword table shaped like ``risk_keywords_combined``
            (parameter -> level -> list of lowercase keywords). Defaults to
            ``risk_keywords_combined``.

    Returns:
        dict mapping each parameter name to the first level (in table order)
        that has a keyword occurring as a whole word/phrase in the text, or
        ``"UNKNOWN"`` when no keyword of any level matches.
    """
    table = risk_keywords_combined if keywords is None else keywords
    # Keywords are stored in lowercase, so the text is lowercased once up front.
    lower_text = (text or "").lower()

    result = {}
    for param, levels in table.items():
        # Levels are tried in table order; the first one with a hit wins.
        result[param] = next(
            (
                level
                for level, level_keywords in levels.items()
                if any(
                    re.search(rf"\b{re.escape(kw)}\b", lower_text)
                    for kw in level_keywords
                )
            ),
            "UNKNOWN",
        )
    return result

# Keywords for NACE classification
# Maps a NACE code to the English/German keywords that indicate it. A text is
# assigned every code for which at least one keyword occurs as a whole word,
# so the same keyword may deliberately appear under several codes.
nace_keywords = {
    "A1": [
        "agriculture", "crop", "palm oil", "rice", "cotton", "cocoa", "coffee", "sugar",
        "landwirtschaft", "ackerbau", "palmenöl", "reis", "baumwolle", "kakao", "kaffee", "zucker",
        "palm", "palmöl", "kinderarbeit", "child labour", "child labor", "zwangsarbeit", "forced labour", "forced labor",
        # The comma after "rassismus" was missing, which fused it with "tobacco"
        # into the single, never-matching keyword "rassismustobacco".
        "arbeitsbedingungen", "poverty", "armut", "racism", "rassismus", "tobacco", "tabak", "reisanbau",
        "fairtrade", "fair trade", "slavery", "sklaverei", "human rights", "menschenrechte", "human cost"
    ],
    "A2": [
        "forestry", "wood", "logging", "timber",
        "forstwirtschaft", "holz", "abholzung", "forst", "holzernte"
    ],
    "A3": [
        "fishing", "aquaculture", "fish", "shrimp",
        "fischerei", "aquakultur", "fisch", "garnelen"
    ],
    "C10": [
        "cocoa", "chocolate", "coffee", "tea", "sugar", "food",
        "kakao", "schokolade", "kaffee", "tee", "zucker", "lebensmittel", "nahrungsmittel",
        "palmöl", "palm oil", "kinderarbeit", "child labour", "child labor",
        "zwangsarbeit", "forced labour", "forced labor"
    ],
    "C12":  [
        "tobacco", "tabak", "cigar", "zigarre", "zigarette", "cigarette"
    ],
    "C13": [
        "textile", "fabric", "yarn", "garment",
        "textilien", "stoff", "garn", "gewebe", "textil",
        "kinderarbeit", "child labour", "child labor", "zwangsarbeit", "forced labour", "forced labor",
        "arbeitsbedingungen", "wages", "löhne", "poverty", "armut", "racism", "rassismus",
        "fairtrade", "fair trade", "slavery", "sklaverei", "human rights", "menschenrechte", "human cost"
    ],
    "C14": [
        "clothing", "apparel", "garment", "t-shirt", "shirt", "jeans",
        "kleidung", "bekleidung", "oberbekleidung", "hemd",
        "kinderarbeit", "child labour", "child labor", "zwangsarbeit", "forced labour", "forced labor",
        "arbeitsbedingungen", "wages", "löhne", "poverty", "armut", "racism", "rassismus",
        "fairtrade", "fair trade", "slavery", "sklaverei", "human rights", "menschenrechte", "human cost"
    ],
    "C15": [
        "leather", "shoe", "footwear",
        "leder", "schuh", "schuhe", "fußbekleidung"
    ],
    "C20": [
        "chemical", "pharmaceutical", "fertilizer", "pesticide", "plastic",
        "chemie", "chemikalien", "dünger", "pestizid", "kunststoff"
    ],
    "C21": [
        "pharma", "pharmaceutical", "medicine",
        "pharmazeutisch", "medizin", "arznei", "medikament"
    ],
    "C22": [
        "plastic product", "plastic",
        "kunststoff", "plastik", "kunststoffprodukt"
    ],
    "C26": [
        "electronic", "semiconductor", "battery",
        "elektronik", "elektronisch", "halbleiter", "batterie"
    ],
    "D35": [
        "energy", "electricity", "power",
        "energie", "strom", "elektrizität"
    ],
    "E38": [
        "waste", "wastes", "recycling", "disposal",
        "abfall", "entsorgung"
    ],
    "E39": [
        "remediation", "cleanup", "waste", "wastes",
        "sanierung", "reinigung", "aufbereitung", "abfall"
    ],
    "H49": [
        "transport", "freight", "truck", "rail", "logistics", "infrastructure",
        "fracht", "lkw", "bahn", "schiene", "logistik", "infrastruktur", "lieferkette"
    ],
    "H50": [
        "shipping", "sea freight", "infrastructure",
        "schifffahrt", "seehandel", "seefracht", "infrastruktur"
    ],
    "H52": [
        "warehouse", "storage", "logistics",
        "lager", "lagerung", "logistik", "lagerhaltung"
    ],
    "G46": [
        "wholesale", "trade",
        "großhandel", "handel"
    ],
    "G47": [
        "retail", "store", "shop",
        "einzelhandel", "laden", "geschäft"
    ]
}


def get_domain_name(url):
    """Extract the domain of *url* as a source/publisher fallback.

    Only a leading ``www.`` is removed; the sequence ``www.`` elsewhere in the
    host name (e.g. ``newswww.example.com``) is left untouched.

    Returns:
        The network location part of the URL, or an empty string when the URL
        has none (for instance when the scheme is missing).
    """
    domain = urlparse(url).netloc
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain

def extract_meta_author(soup):
    """Return author/publisher information taken from the page's meta tags.

    The tags are consulted in this order: ``name="author"``,
    ``property="og:site_name"``, ``name="publisher"``. The stripped content of
    the first tag that has a non-empty ``content`` attribute is returned;
    ``"Unknown"`` if none qualifies.
    """
    candidate_attrs = (
        {"name": "author"},
        {"property": "og:site_name"},
        {"name": "publisher"},
    )
    for attrs in candidate_attrs:
        tag = soup.find("meta", attrs)
        if tag and tag.get("content"):
            return tag["content"].strip()
    return "Unknown"

def extract_publish_date(soup):
    """Return the first 10 characters of the ``article:published_time`` meta tag.

    For ISO timestamps this is the ``YYYY-MM-DD`` part. Returns ``None`` when
    the tag is absent or its ``content`` attribute is empty.
    """
    published = soup.find("meta", {"property": "article:published_time"})
    content = published.get("content") if published else None
    return content[:10] if content else None

def fallback_title(soup):
    """Extract a page title when ``article.title`` is empty.

    Tries the ``og:title`` meta tag first, then the ``<title>`` element.
    Whitespace is stripped from either source, and a blank value counts as
    missing.

    Returns:
        The title text, or ``"Unknown"`` when neither source yields text.
    """
    og_title = soup.find("meta", {"property": "og:title"})
    if og_title and og_title.get("content"):
        og_text = og_title["content"].strip()
        if og_text:
            return og_text

    title_tag = soup.title
    if title_tag is not None:
        # .string is None when <title> is empty or contains nested markup;
        # calling .strip() on it unguarded would raise AttributeError.
        title_text = title_tag.string
        if title_text and title_text.strip():
            return title_text.strip()

    return "Unknown"

# Trustworthiness scoring
def _source_mentions(source, key):
    """Return True if *source* names the organisation *key* (case-insensitive)."""
    if key.isupper():
        # Acronyms such as "UN", "EU" or "WHO" must match as whole words; a
        # plain substring test would also hit "Unknown", "Reuters" or "whole".
        return re.search(rf"\b{re.escape(key)}\b", source, flags=re.IGNORECASE) is not None
    return key.lower() in source.lower()


def _parse_source_date(date_str):
    """Parse a publication date given as date, timestamp or year; None if unparseable."""
    text = str(date_str).strip()
    # text[:10] also covers timestamps such as "2023-05-01 00:00:00" (what
    # str() yields for a pandas Timestamp) and ISO strings with a time part.
    for candidate, fmt in ((text[:10], "%Y-%m-%d"), (text, "%Y")):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            continue
    return None


def score_sources(source_name, author_name, language, date_str, sector_relevant, has_sources, independent):
    """Rate the trustworthiness of a source on a 1-10 scale.

    Seven criteria are scored individually (at most 10 points each) and averaged.

    Args:
        source_name: Publisher / organisation / domain of the source.
        author_name: Author string; academic titles and similar raise the score.
        language: "English" scores 10, "German"/"Deutsch" 7, anything else 5.
        date_str: Publication date as ``YYYY-MM-DD`` (a trailing time part is
            ignored) or as a bare year; unparseable values get a neutral 5.
        sector_relevant: Truthy if the source is relevant for the sector.
        has_sources: Truthy if the source cites its own sources.
        independent: Truthy if the source is independent.

    Returns:
        The average of the seven criteria, rounded to one decimal place.
    """
    score = 0

    # 1. Source and authority of the information
    trusted_high = ["UN", "OECD", "ILO", "EU", "Bundesministerium", "Federal Ministry", "University", "Max Planck",
                    "UNEP", "UNICEF", "WHO", "World Bank", "Weltbank", "BMZ", "BAFA", "GIZ",
                    "Fraunhofer", "Helmholtz", "Amnesty", "Greenpeace", "FAO", "UNDP"]
    trusted_medium = ["NGO", "Fairtrade", "McKinsey", "Oxfam", "Vision",
                      "AidEnvironment", "Foodwatch", "Transparency International", "WWF", "Brot für die Welt",
                      "Verité", "World Vision", "Bridge Michigan"]
    source = str(source_name)
    if any(_source_mentions(source, key) for key in trusted_high):
        score += 10
    elif any(_source_mentions(source, key) for key in trusted_medium):
        score += 7
    else:
        score += 4

    # 2. Actuality: newer publications score higher
    published = _parse_source_date(date_str)
    if published is None:
        score += 5  # Fallback when the date cannot be interpreted
    else:
        age_days = (datetime.today() - published).days
        if age_days < 365:
            score += 10
        elif age_days < 1095:
            score += 8
        elif age_days < 2190:
            score += 6
        else:
            score += 3

    # 3. Author competence
    competent_authors = ["Dr.", "Prof.", "Expert", "Researcher", "PhD", "M.Sc.", "Msc", "Dipl.-Ing.", "Diplom", "Mag.", "Mba"]
    author = str(author_name).lower()
    if any(k.lower() in author for k in competent_authors):
        score += 10
    else:
        score += 6

    # 4. Transparency
    score += 10 if has_sources else 5

    # 5. Language
    lang = str(language).strip().lower()
    if lang == "english":
        score += 10
    elif lang in ("deutsch", "german"):
        score += 7
    else:
        score += 5

    # 6. Sector relevance
    score += 10 if sector_relevant else 5

    # 7. Independency
    score += 10 if independent else 5

    # Final score: average of all 7 criteria
    final_score = round(score / 7, 1)
    return final_score

def score_risk(severity, probability, scope, urgency, nace_relevant, company_influence, data_availability, regulation):
    """Combine eight risk parameters into a single risk score.

    Each parameter contributes 10, 7 or 4 points depending on its level
    (HIGH/MEDIUM/LOW, GLOBAL/REGIONAL/LOCAL for *scope*,
    IMMEDIATE/MEDIUM TERM/LONG TERM for *urgency*). Levels are matched
    case-insensitively and ignoring surrounding whitespace; any other value
    (e.g. "UNKNOWN", an empty string, ``None``) contributes a neutral 5.

    Returns:
        The sum of all points scaled from the 0-80 range to 0-10 and rounded
        to one decimal place.
    """
    # Scoring scales
    scale = {"HIGH": 10, "MEDIUM": 7, "LOW": 4}
    scope_scale = {"GLOBAL": 10, "REGIONAL": 7, "LOCAL": 4}
    urgency_scale = {"IMMEDIATE": 10, "MEDIUM TERM": 7, "LONG TERM": 4}

    def points(table, value):
        # Values typed into spreadsheets often differ in case or carry stray
        # spaces; normalise so that "High" is not silently scored as unknown.
        return table.get(str(value).strip().upper(), 5)

    # Highest possible score: 8 x 10 points = 80
    score = (
        points(scale, severity)
        + points(scale, probability)
        + points(scope_scale, scope)
        + points(urgency_scale, urgency)
        + points(scale, nace_relevant)
        + points(scale, company_influence)
        + points(scale, data_availability)
        + points(scale, regulation)
    )

    # 1-10 scale for final score
    final_score = round((score / 80) * 10, 1)
    return final_score


# Check if entry is already shown in the GUI
def entry_already_in_gui(title, author):
    """Return True if a table row with this title and author already exists.

    The first two cells of every row in the global ``tree`` are compared with
    *title* and *author*. Both sides are compared as strings because the
    Treeview may hand back numeric-looking cell values (e.g. a title "2020")
    as numbers, which would never equal the original string.
    """
    wanted = (str(title), str(author))
    for child in tree.get_children():
        values = tree.item(child)["values"]
        if len(values) >= 2 and (str(values[0]), str(values[1])) == wanted:
            return True
    return False

def scrape_url_and_process(url):
    """Download an article, score it and return one result row.

    Steps: parse the article with newspaper (German language setting), fetch
    the raw HTML again for meta-tag fallbacks, guess the risk parameters from
    the text, let the user confirm them in a dialog, then compute trust score,
    risk score and NACE codes (model prediction merged with keyword matches).

    Returns:
        dict with the keys Title, Author, NACE_Assignment, Trust_Score,
        Risk_Score and Timestamp (in that order), or ``None`` after showing an
        error dialog when anything in the process raises.
    """
    risk_fields = [
        "Severity", "Probability", "Scope", "Urgency",
        "NACE_Relevant", "Company_Influence", "Data_Availability", "Regulation"
    ]

    try:
        # Article content via newspaper3k
        article = Article(url, language='de')
        article.download()
        article.parse()

        # Separate download of the page for the BeautifulSoup-based fallbacks
        page_html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(page_html, 'html.parser')

        # Key data, each with its fallback
        title = article.title or fallback_title(soup)
        if article.authors:
            author = ", ".join(article.authors)
        else:
            author = extract_meta_author(soup)
        text = article.text or ""

        # Keyword heuristics propose a level per risk parameter
        guessed_risk = guess_risk_parameters_from_text(title + " " + text)
        risk_params = {}
        for key in risk_fields:
            risk_params[key] = guessed_risk.get(key, "UNKNOWN")

        source = article.source_url or get_domain_name(url)
        language = "German"

        # Publication date: parsed article date, then meta tag, then a fixed default
        if article.publish_date:
            date_str = article.publish_date.strftime("%Y-%m-%d")
        else:
            date_str = extract_publish_date(soup) or "2020-01-01"

        # Sector relevance, cited sources and independence are assumed True here
        trust = score_sources(source, author, language, date_str, True, True, True)

        # The user may confirm/complete the guesses; Skip keeps the guesses as they are
        manual_result = manual_complete_risk_parameters(risk_params)
        chosen = manual_result if manual_result else risk_params
        levels = [chosen[key] for key in risk_fields]
        risk = score_risk(*levels)

        # NACE assignment, part 1: prediction of the trained model
        X_new = vectorizer.transform([title + " " + text])
        predicted_nace = nace_model.predict(X_new)[0]

        # NACE assignment, part 2: every code with at least one whole-word keyword hit
        combined_text = f"{title} {text}".lower()
        matched_codes = [
            code
            for code, keywords in nace_keywords.items()
            if any(
                re.search(rf"\b{re.escape(kw)}\b", combined_text, flags=re.IGNORECASE)
                for kw in keywords
            )
        ]

        # Union of both results without duplicates
        all_nace_codes = set([predicted_nace] + matched_codes)
        if all_nace_codes:
            nace_codes = ", ".join(sorted(all_nace_codes))
        else:
            nace_codes = "Undefined"

        return {
            "Title": title,
            "Author": author,
            "NACE_Assignment": nace_codes,
            "Trust_Score": trust,
            "Risk_Score": risk,
            "Timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }

    except Exception as e:
        messagebox.showerror("Scraper Error", f"Failed to extract from URL:\n{e}")
        return None


def scrape_manual_url():
    """Open a dialog that asks for an article URL and analyzes it on request.

    Pressing "Analyze" runs scrape_url_and_process() on the entered URL, adds
    the result to the table unless the same title/author is already listed,
    and closes the dialog in every case.
    """
    popup = tk.Toplevel(root)
    popup.title("Enter Article URL")
    popup.geometry("600x150")

    tk.Label(popup, text="Please enter the article URL:", font=("Arial", 12)).pack(pady=10)

    entry = tk.Entry(popup, width=80)
    entry.pack(pady=5)

    def analyze():
        result = scrape_url_and_process(entry.get())
        if result:
            if not entry_already_in_gui(result["Title"], result["Author"]):
                tree.insert("", tk.END, values=list(result.values()))
        popup.destroy()

    tk.Button(popup, text="Analyze", command=analyze, bg="#4CAF50", fg="white").pack(pady=10)

def manual_complete_risk_parameters(risk_dict):
    """Show a modal dialog to confirm or complete the guessed risk parameters.

    Args:
        risk_dict: Mapping of parameter name to the guessed level; "UNKNOWN"
            or a missing key means nothing was detected for that parameter.

    Returns:
        dict mapping every parameter to the selected level (an empty string
        where nothing was selected) when the user presses "Confirm";
        ``None`` when the user presses "Skip" or closes the window.
    """
    # Allowed levels per parameter; the dict order is the display order.
    dropdown_values = {
        "Severity": ["HIGH", "MEDIUM", "LOW"],
        "Probability": ["HIGH", "MEDIUM", "LOW"],
        "Scope": ["GLOBAL", "REGIONAL", "LOCAL"],
        "Urgency": ["IMMEDIATE", "MEDIUM TERM", "LONG TERM"],
        "NACE_Relevant": ["HIGH", "MEDIUM", "LOW"],
        "Company_Influence": ["HIGH", "MEDIUM", "LOW"],
        "Data_Availability": ["HIGH", "MEDIUM", "LOW"],
        "Regulation": ["HIGH", "MEDIUM", "LOW"]
    }

    popup = tk.Toplevel(root)
    popup.title("Manual Risk Parameter Completion")
    popup.geometry("420x420")
    # Default outcome. Without it, closing the dialog through the window
    # manager (neither button pressed) left `popup.result` undefined and the
    # final `return popup.result` raised AttributeError.
    popup.result = None

    label = tk.Label(popup, text="Please confirm or complete the risk parameters:", font=("Arial", 11))
    label.pack(pady=10)

    entries = {}

    for field, choices in dropdown_values.items():
        frame = tk.Frame(popup)
        frame.pack(pady=5, padx=15, fill="x")

        guessed_value = risk_dict.get(field, "")
        detected = bool(guessed_value) and guessed_value != "UNKNOWN"
        if detected:
            label_text = f"{field} (Detected: {guessed_value})"
        else:
            label_text = f"{field} (Not detected please select)"

        tk.Label(frame, text=label_text, font=("Arial", 10), width=30, anchor="w").pack(side="left")

        # Pre-select the detected level; leave the box empty otherwise.
        var = tk.StringVar(value=guessed_value if guessed_value != "UNKNOWN" else "")
        dropdown = ttk.Combobox(frame, textvariable=var, values=choices, state="readonly", width=15)
        dropdown.pack(side="left")

        entries[field] = var

    def confirm():
        popup.result = {field: var.get() for field, var in entries.items()}
        popup.destroy()

    def skip():
        popup.result = None
        popup.destroy()

    btn_frame = tk.Frame(popup)
    btn_frame.pack(pady=15)

    tk.Button(btn_frame, text="Confirm", command=confirm, bg="green", fg="white", width=10).pack(side="left", padx=10)
    tk.Button(btn_frame, text="Skip", command=skip, width=10).pack(side="right", padx=10)

    # Make the dialog modal and block until it has been destroyed.
    popup.grab_set()
    root.wait_window(popup)

    return popup.result

# Analyze input Excel file
_REQUIRED_INPUT_COLUMNS = (
    "Title", "Author", "Source", "Language", "Date",
    "Sector_Relevant", "Has_Sources", "Independent",
    "Severity", "Probability", "Scope", "Urgency",
    "NACE_Relevant", "Company_Influence", "Data_Availability", "Regulation",
)


def _coerce_flag(value):
    """Interpret a spreadsheet cell as a yes/no flag."""
    if isinstance(value, str):
        # bool("False") is True, so text cells must be read by their content.
        return value.strip().lower() in {"true", "1", "yes", "ja"}
    if pd.isna(value):
        # An empty cell must not count as a confirmed "yes".
        return False
    return bool(value)


def _date_to_text(value):
    """Render a date cell as YYYY-MM-DD where possible, else as plain text."""
    if not pd.isna(value) and hasattr(value, "strftime"):
        # Excel date cells arrive as timestamps; str() would append "00:00:00".
        return value.strftime("%Y-%m-%d")
    return str(value)


def analyze_file(path):
    """Score every not-yet-listed row of an input workbook and persist the results.

    For each row the NACE codes (model prediction merged with keyword
    matches on title and author), the trust score and the risk score are
    computed. Rows whose title/author already appear in the GUI table are
    skipped and reported in an info dialog. New results are appended to
    'nace_source_evaluation.xlsx'.

    Args:
        path: Path of the Excel file to analyze.

    Returns:
        DataFrame with the newly created result rows, or ``None`` when the
        file cannot be read, required columns are missing, the existing
        results file cannot be read, or there are no new rows.
    """
    try:
        df_input = pd.read_excel(path)
    except Exception as e:
        messagebox.showerror("Error loading file", str(e))
        return None

    missing = [col for col in _REQUIRED_INPUT_COLUMNS if col not in df_input.columns]
    if missing:
        messagebox.showerror("Error loading file", "Missing required column(s): " + ", ".join(missing))
        return None

    filename = "nace_source_evaluation.xlsx"
    if os.path.exists(filename):
        try:
            df_existing = pd.read_excel(filename)
        except Exception as e:
            # Abort rather than overwrite a results file that cannot be read.
            messagebox.showerror("Error loading file", str(e))
            return None
    else:
        df_existing = pd.DataFrame()

    results = []
    skipped_titles = []

    for _, row in df_input.iterrows():
        title = str(row["Title"])
        author = str(row["Author"])

        if entry_already_in_gui(title, author):
            skipped_titles.append(title)
            continue

        # Combine title and author for better AI prediction (if full text not available)
        X_new = vectorizer.transform([f"{title} {author}"])
        predicted_nace = nace_model.predict(X_new)[0]

        # Keyword-based matching (allows multiple matches)
        combined_text = f"{title} {author}".lower()
        matched_codes = [
            code
            for code, keywords in nace_keywords.items()
            if any(
                re.search(rf"\b{re.escape(kw)}\b", combined_text, flags=re.IGNORECASE)
                for kw in keywords
            )
        ]

        # Combine AI and keyword codes (avoid duplicates)
        all_nace_codes = set([predicted_nace] + matched_codes)
        nace_codes = ", ".join(sorted(all_nace_codes)) if all_nace_codes else "Undefined"

        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

        trust = score_sources(
            str(row["Source"]),
            author,
            str(row["Language"]),
            _date_to_text(row["Date"]),
            _coerce_flag(row["Sector_Relevant"]),
            _coerce_flag(row["Has_Sources"]),
            _coerce_flag(row["Independent"]),
        )

        risk = score_risk(
            str(row["Severity"]),
            str(row["Probability"]),
            str(row["Scope"]),
            str(row["Urgency"]),
            str(row["NACE_Relevant"]),
            str(row["Company_Influence"]),
            str(row["Data_Availability"]),
            str(row["Regulation"]),
        )

        results.append({
            "Title": title,
            "Author": author,
            "NACE_Assignment": nace_codes,
            "Trust_Score": trust,
            "Risk_Score": risk,
            "Timestamp": timestamp
        })

    if skipped_titles:
        print("Skipped entries:", skipped_titles)
        info_text = "The following titles have already been analyzed and were skipped:\n\n" + "\n".join(skipped_titles)
        messagebox.showinfo("Info", info_text)

    if not results:
        return None

    df_output = pd.DataFrame(results)

    if not df_existing.empty:
        df_total = pd.concat([df_existing, df_output], ignore_index=True)
    else:
        df_total = df_output

    df_total.to_excel(filename, index=False)
    return df_output

# Load and process file from GUI
def load_file():
    """Let the user pick an Excel file, analyze it and show the new rows.

    After a successful analysis the rows not yet listed are added to the
    table, the input file is moved into the "processed" folder (a failure to
    move is only printed) and a completion dialog is shown. Nothing happens
    when the file dialog is cancelled or the analysis yields no result.
    """
    path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    if not path:
        return

    df = analyze_file(path)
    if df is None:
        return

    for _, record in df.iterrows():
        if entry_already_in_gui(record["Title"], record["Author"]):
            continue
        tree.insert("", tk.END, values=list(record))

    # Archive the analyzed input file
    processed_folder = "processed"
    os.makedirs(processed_folder, exist_ok=True)
    target_path = os.path.join(processed_folder, os.path.basename(path))

    try:
        shutil.move(path, target_path)
    except Exception as e:
        print(f"Error moving file: {e}")

    messagebox.showinfo("Done", "Analysis completed and saved as 'nace_source_evaluation.xlsx'")

# GUI setup
# Main application window; every other widget and dialog hangs off `root`.
root = tk.Tk()
root.title("NACE Source Analysis Tool")
root.geometry("1450x750")
root.configure(bg="#f0f4f8")

# Top area holding the headline and the action buttons.
# Widgets appear in the order in which they are packed.
frame_top = tk.Frame(root, bg="#f0f4f8")
frame_top.pack(pady=20)

lbl_title = tk.Label(frame_top, text="NACE Classification and Trust Evaluation", font=("Arial", 16, "bold"), bg="#f0f4f8")
lbl_title.pack(pady=5)

# Analyze a whole Excel workbook chosen via file dialog (see load_file).
btn_load = tk.Button(frame_top, text="Load & Analyze Excel File", command=load_file, font=("Arial", 12), bg="#4CAF50", fg="white", padx=10, pady=5)
btn_load.pack()

# Analyze a single web article given by URL (see scrape_manual_url).
btn_scrape_url = tk.Button(frame_top, text="Analyze Web Article", command=scrape_manual_url, font=("Arial", 12), bg="#2196F3", fg="white", padx=10, pady=5)
btn_scrape_url.pack(pady=5)

# --- INPUT ---
# Fields of the manual entry form, in display order.
input_fields = [
    "Title", "Source", "Author", "Language", "Date", "Sector_Relevant",
    "Has_Sources", "Independent", "Severity", "Probability", "Scope",
    "Urgency", "NACE_Relevant", "Company_Influence", "Data_Availability", "Regulation"
]

entry_frame = tk.Frame(root, bg="#f0f4f8")
entry_frame.pack(fill="x", padx=10, pady=10)

# Field name -> input widget (Entry or Combobox); read when an entry is added.
entry_widgets = {}

# Dropdown values for specific fields
dropdown_fields = {
    "Severity": ["HIGH", "MEDIUM", "LOW"],
    "Probability": ["HIGH", "MEDIUM", "LOW"],
    "Scope": ["GLOBAL", "REGIONAL", "LOCAL"],
    "Urgency": ["IMMEDIATE", "MEDIUM TERM", "LONG TERM"],
    "Sector_Relevant": ["True", "False"],
    "Has_Sources": ["True", "False"],
    "Independent": ["True", "False"],
    "NACE_Relevant": ["HIGH", "MEDIUM", "LOW"],
    "Company_Influence": ["HIGH", "MEDIUM", "LOW"],
    "Data_Availability": ["HIGH", "MEDIUM", "LOW"],
    "Regulation": ["HIGH", "MEDIUM", "LOW"]
}

# Split fields into two rows
row1_fields = input_fields[:8]
row2_fields = input_fields[8:]

for row_fields in [row1_fields, row2_fields]:
    row = tk.Frame(entry_frame, bg="#f0f4f8")
    row.pack(pady=5)
    for field in row_fields:
        sub = tk.Frame(row, bg="#f0f4f8")
        sub.pack(side="left", padx=5)
        tk.Label(sub, text=f"{field}:", font=("Arial", 11), bg="#f0f4f8").pack(anchor="w")

        # Decide which choices (if any) the field offers
        if field == "Language":
            options = ["English", "German", "Other"]
        elif field == "Date":
            # Years from the current year down to 2001
            options = [str(year) for year in range(datetime.now().year, 2000, -1)]
        elif field in dropdown_fields:
            options = dropdown_fields[field]
        else:
            options = None

        # Fields with choices get a read-only dropdown, all others a free-text entry
        if options is None:
            widget = tk.Entry(sub, width=16, font=("Arial", 11))
        else:
            var = tk.StringVar()
            widget = ttk.Combobox(sub, textvariable=var, values=options, width=14, state="readonly")
        widget.pack()
        entry_widgets[field] = widget

# Function to open and display the README.md file in a popup window
def show_readme_popup():
    """Show the contents of README.md in a read-only, scrollable popup.

    An error dialog is shown (and no popup opened) when the file cannot be read.
    """
    try:
        with open("README.md", "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        messagebox.showerror("Error", f"Failed to load the README file:\n{e}")
        return

    popup = tk.Toplevel(root)
    popup.title("Instructions / Usage Guide")
    popup.geometry("900x600")

    # Vertical scrollbar. It is packed BEFORE the text widget: the packer
    # hands out space in packing order, so an expanding text widget packed
    # first would leave no room beside it for the scrollbar.
    scrollbar = tk.Scrollbar(popup)
    scrollbar.pack(side="right", fill="y")

    # Scrollable Text widget
    text_widget = tk.Text(popup, wrap="word", font=("Courier New", 10), yscrollcommand=scrollbar.set)
    text_widget.insert("1.0", content)
    text_widget.config(state="disabled")  # Make it read-only
    text_widget.pack(side="left", expand=True, fill="both")

    scrollbar.config(command=text_widget.yview)

# Button to open the usage guide / README
# Placed in the top frame below the two analysis buttons; clicking it calls
# show_readme_popup().
btn_readme = tk.Button(
    frame_top,
    text="Open Instructions (README)",
    command=show_readme_popup,
    font=("Arial", 12),
    bg="#FF9800",
    fg="white",
    padx=10,
    pady=5
)
btn_readme.pack(pady=5)


# Manual Entry Function
def add_manual_entry():
    """Score the values typed into the entry form and append them to an Excel file.

    Title and Source are mandatory. The user picks the target workbook; the
    entry is rejected as a duplicate when that workbook already contains the
    same title and author. The author stored (and used for all duplicate
    checks) is the Author field, or the Source when Author is empty.
    On success the new row is written to the workbook and added to the table.
    """
    values = {k: entry_widgets[k].get().strip() for k in input_fields}

    # Title and Source are mandatory
    if not values["Title"] or not values["Source"]:
        messagebox.showwarning("Input Error", "Please enter both Title and Source.")
        return

    # This is the value that ends up in the "Author" column, so duplicate
    # detection has to compare against it rather than against the Source.
    author = values["Author"] or values["Source"]

    filename = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    if not filename:
        return

    try:
        df_existing = pd.read_excel(filename)
    except Exception as e:
        messagebox.showerror("Error reading file", str(e))
        return

    # A workbook without these columns (e.g. an empty one) cannot contain a duplicate.
    has_key_columns = {"Title", "Author"}.issubset(df_existing.columns)
    if has_key_columns and (
        (df_existing["Title"] == values["Title"]) & (df_existing["Author"] == author)
    ).any():
        messagebox.showinfo("Duplicate", "This entry already exists in the file.")
        return

    # Combine AI and keyword-based NACE classification for manual entry
    X_new = vectorizer.transform([f"{values['Title']} {values['Author']}"])
    predicted_nace = nace_model.predict(X_new)[0]

    # NACE Assignment: keywords are escaped so they are matched literally
    matched_codes = [
        code
        for code, keywords in nace_keywords.items()
        if any(
            re.search(rf"\b{re.escape(kw)}\b", values["Title"], re.IGNORECASE)
            for kw in keywords
        )
    ]

    all_nace_codes = set([predicted_nace] + matched_codes)
    nace_codes = ", ".join(sorted(all_nace_codes)) if all_nace_codes else "Undefined"

    # The form's Date dropdown offers bare years, while the trust scoring
    # expects a full date; anchor a year at January 1st so it can be parsed.
    date_text = values["Date"]
    if re.fullmatch(r"\d{4}", date_text):
        date_text = f"{date_text}-01-01"

    # Trust Score Calculation
    trust = score_sources(
        values["Source"],
        values["Author"],
        values["Language"],
        date_text,
        values["Sector_Relevant"].lower() == "true",
        values["Has_Sources"].lower() == "true",
        values["Independent"].lower() == "true"
    )

    # Risk Score Calculation
    risk = score_risk(
        values["Severity"], values["Probability"], values["Scope"],
        values["Urgency"], values["NACE_Relevant"], values["Company_Influence"],
        values["Data_Availability"], values["Regulation"]
    )

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

    new_row = {
        "Title": values["Title"],
        "Author": author,
        "NACE_Assignment": nace_codes,
        "Trust_Score": trust,
        "Risk_Score": risk,
        "Timestamp": timestamp
    }

    df_updated = pd.concat([df_existing, pd.DataFrame([new_row])], ignore_index=True)

    try:
        df_updated.to_excel(filename, index=False)
    except Exception as e:
        messagebox.showerror("Error writing to file", str(e))
        return

    if not entry_already_in_gui(values["Title"], author):
        tree.insert("", tk.END, values=list(new_row.values()))

    messagebox.showinfo("Success", "Entry added successfully.")

# Button
# Sits below the two rows of input fields and triggers add_manual_entry().
btn_add_manual = tk.Button(entry_frame, text="Add Entry to File", command=add_manual_entry,
font=("Arial", 12), bg="#2196F3", fg="white", padx=10, pady=5)
btn_add_manual.pack(pady=10)

# Legend Section
# Two static labels explaining how to read the score columns of the table.
legend_frame = tk.Frame(root, bg="#f0f4f8", pady=10)
legend_frame.pack(fill="x", padx=10)

# Trust Score Legend
trust_label = tk.Label(
    legend_frame,
    text="Trust Score: 10 = High Trust, 1 = Low Trust",
    font=("Arial", 11),
    bg="#f0f4f8",
    fg="#333"
)
trust_label.pack(anchor="w", padx=5)

# Risk Score Legend
risk_label = tk.Label(
    legend_frame,
    text="Risk Score: 10 = High Risk, 1 = Low Risk",
    font=("Arial", 11),
    bg="#f0f4f8",
    fg="#333"
)
risk_label.pack(anchor="w", padx=5)
# END LEGEND

# Result table. The column order below is also the order of the values in
# every row inserted into `tree`, so the first two cells of a row are Title and
# Author (entry_already_in_gui relies on that).
frame_table = tk.Frame(root)
frame_table.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

columns = ["Title", "Author", "NACE_Assignment", "Trust_Score", "Risk_Score", "Timestamp"]
tree = ttk.Treeview(frame_table, columns=columns, show="headings", height=20)

# Set column widths and alignment
tree.column("Title", width=450, anchor="w")
tree.column("Author", width=200, anchor="w")
tree.column("NACE_Assignment", width=120, anchor="center")
tree.column("Trust_Score", width=60, anchor="center")
tree.column("Risk_Score", width=60, anchor="center")
tree.column("Timestamp", width=120, anchor="center")
# Enable sorting on column headers
def sort_treeview_column(treeview, col, reverse):
    """Sort the rows of *treeview* by column *col* and arm the header for the opposite order.

    Values are compared as numbers when every cell of the column converts to
    float; otherwise they are compared as lowercased text. After sorting, the
    column heading's command is replaced so that the next click sorts in the
    reverse direction.
    """
    # Pair every row's cell value with its item id
    rows = []
    for item_id in treeview.get_children(""):
        rows.append((treeview.set(item_id, col), item_id))

    # Numeric order if possible, text order otherwise
    try:
        rows.sort(key=lambda pair: float(pair[0]), reverse=reverse)
    except ValueError:
        rows.sort(key=lambda pair: pair[0].lower(), reverse=reverse)

    # Re-position the rows according to the sorted sequence
    for position, (_, item_id) in enumerate(rows):
        treeview.move(item_id, '', position)

    # Next click on this header toggles the direction
    def toggle():
        sort_treeview_column(treeview, col, not reverse)

    treeview.heading(col, command=toggle)

# Set headings and make them clickable
# Only the headings are configured here. The per-column widths and alignment
# are set where the Treeview is created; calling tree.column() again in this
# loop with a uniform width/anchor would silently overwrite those settings.
for col in columns:
    tree.heading(col, text=col, command=lambda _col=col: sort_treeview_column(tree, _col, False))

# Fonts for the table header and body
style = ttk.Style()
style.configure("Treeview.Heading", font=("Arial", 11, "bold"))
style.configure("Treeview", font=("Arial", 10))

tree.pack(fill=tk.BOTH, expand=True)

# Watchdog handler
class ExcelHandler(FileSystemEventHandler):
    """Analyze every new .xlsx file that appears in the watched folder."""

    def on_created(self, event):
        """Queue a newly created Excel workbook for analysis.

        Directories, files without an .xlsx extension and Excel's temporary
        "~$" lock files are ignored. The actual work is handed to the Tk event
        loop because analyze_file() touches the table and opens message boxes,
        while this callback runs on the watchdog observer thread.
        """
        if event.is_directory:
            return
        path = event.src_path
        name = os.path.basename(path)
        if not name.lower().endswith(".xlsx") or name.startswith("~$"):
            return
        tree.after(0, self._process_file, path)

    def _process_file(self, path):
        """Analyze *path*, show the new rows and archive the file in "processed"."""
        df = analyze_file(path)
        if df is None:
            return

        update_gui(df)

        processed_folder = "processed"
        os.makedirs(processed_folder, exist_ok=True)
        target_path = os.path.join(processed_folder, os.path.basename(path))
        try:
            shutil.move(path, target_path)
        except Exception as e:
            print(f"Error moving file: {e}")

# Update GUI with new data
def update_gui(df):
    """Append every row of *df* to the table unless its title/author is already shown."""
    for _, record in df.iterrows():
        already_shown = entry_already_in_gui(record["Title"], record["Author"])
        if already_shown:
            continue
        tree.insert("", tk.END, values=list(record))

# Folder monitoring
def monitor_folder(path):
    """Create *path* if necessary and start watching it (non-recursively) for new files.

    New-file events are delivered to an ExcelHandler. The function returns as
    soon as the observer has been started.
    """
    os.makedirs(path, exist_ok=True)
    handler = ExcelHandler()
    watcher = Observer()
    watcher.schedule(handler, path=path, recursive=False)
    watcher.start()

# Start watching the "incoming_sources" folder in the background. The thread is
# a daemon so that it does not keep the process alive once the window is closed.
monitor_thread = threading.Thread(
    target=monitor_folder, 
    args=("incoming_sources",), 
    daemon=True
)
monitor_thread.start()

# Enter the Tk event loop; this call blocks until the main window is closed.
root.mainloop()