In [9]:
import tkinter as tk
from tkinterdnd2 import TkinterDnD, DND_FILES
import os
from datetime import datetime
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import joblib
import numpy as np
import json
import sys
from transformers import AutoTokenizer
import pickle

# Check if the script is running in an interactive environment (e.g., Jupyter)
if getattr(sys, 'frozen', False):  # Running as an executable (frozen app)
    application_path = os.path.dirname(sys.executable)
elif '__file__' in globals():  # Running as a script
    application_path = os.path.dirname(__file__)
else:  # Interactive environment, set to current working directory
    application_path = os.getcwd()

# Adjust the paths to your resources relative to the application path
model_path = os.path.join(application_path, "production_model", "model.h5")
tokenizer_file = os.path.join(application_path, "production_model", "tokenizer.pkl")
label_encoder_file = os.path.join(application_path, "production_model", "labels.pkl")
config_file = os.path.join(application_path, "production_model", "config.json")



# Paths for loading saved components
model_path = "production_model/model.h5"
tokenizer_file = "production_model/tokenizer.pkl"
label_encoder_file = "production_model/labels.pkl"
config_file = "production_model/config.json"

# Global dataframe to store file metadata
columns = [
    "Name", "File extension", "Size (MB)", "Created by", 
    "Last modified by", "Input", "Predicted Directory"
]
data = []

# Load the trained model
if os.path.exists(model_path):
    print("Loading the trained model...")
    model = load_model(model_path)
else:
    print("Model not found. Please train and save the model.")
    exit()

# Define the path to the saved tokenizer
folder_path = "production_model"  # Make sure this is the correct path to the folder

# Define the path to the saved tokenizer
tokenizer_file = os.path.join(folder_path, "tokenizer.pkl")

try:
    with open(tokenizer_file, 'rb') as f:
        tokenizer = pickle.load(f)
    print("Tokenizer loaded successfully.")
except FileNotFoundError:
    print(f"Tokenizer file not found at {tokenizer_file}.")
    exit()
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit()

# Load the LabelEncoder
if os.path.exists(label_encoder_file):
    print("Loading the LabelEncoder...")
    label_encoder = joblib.load(label_encoder_file)
else:
    print("LabelEncoder not found. Please save the LabelEncoder.")
    exit()

# Load training configuration
if os.path.exists(config_file):
    print("Loading the training configuration...")
    with open(config_file, "r") as f:
        config = json.load(f)
        max_sequence_length = config["max_sequence_length"]
else:
    print("Configuration file not found. Please save the configuration.")
    exit()

def extract_file_metadata(file_path, custom_size=0.67, custom_created_by="AlphaBeta", custom_last_modified_by="AlphaBeta"):
    """Extract metadata for a given file, with optional custom values for a test."""
    try:
        # Extract basic file info
        file_name = os.path.basename(file_path)
        file_extension = os.path.splitext(file_name)[1][1:]  # Remove leading dot
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
        created_time = os.path.getctime(file_path)
        created_date = datetime.fromtimestamp(created_time).strftime('%Y-%m-%d')  # Date only

        # Use custom values if provided, otherwise extract from the file
        created_by = custom_created_by if custom_created_by else ""  # Leave empty if not provided
        last_modified_by = custom_last_modified_by if custom_last_modified_by else ""  # Leave empty if not provided
        file_size = custom_size if custom_size else file_size  # Override file size if custom size is provided

        # Construct the input column text
        input_text = (
            f"Name: {file_name}, File extension: {file_extension}, "
            f"Size (MB): {round(file_size, 2)}, Created by: {created_by}, "
            f"Last modified by: {last_modified_by}, Created date: {created_date}"
        )

        print(f"Extracted metadata for {file_name}: {input_text}")
        
        return {
            "Name": file_name,
            "File extension": file_extension,
            "Size (MB)": round(file_size, 2),
            "Created by": created_by,
            "Last modified by": last_modified_by,
            "Input": input_text
        }
    except Exception as e:
        print(f"Error extracting metadata for {file_path}: {e}")
        return {"Error": str(e)}


def predict_directory(metadata):
    """Predict the target directory path using the model."""
    try:
        # Extract the 'Input' feature and preprocess it
        input_text = metadata.get('Input', '')  # Safely get the input, defaulting to empty string if None
        
        if input_text is None or input_text == "":
            return "Error: Input data is missing or invalid"
        
        # Tokenize and pad the input text
        tokenized_text = tokenizer.texts_to_sequences([input_text])
        padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)
        
        # Predict the directory
        predictions = model.predict(padded_text)
        predicted_label = label_encoder.inverse_transform([np.argmax(predictions)])
        return predicted_label[0]
    except Exception as e:
        return f"Error in prediction: {str(e)}"

def handle_drop(event):
    """Handle files dropped onto the window."""
    global data
    file_paths = event.data.strip().splitlines()
    output_text.delete("1.0", tk.END)  # Clear previous text
    
    for file_path in file_paths:
        file_path = file_path.strip('{}')  # Remove any enclosing braces added by TkinterDnD
        metadata = extract_file_metadata(file_path)
        
        if "Error" in metadata:
            output_text.insert(tk.END, f"Error processing {file_path}: {metadata['Error']}\n")
            continue
        
        # Keep "Input" for prediction purposes but remove it from the display
        input_text = metadata["Input"]  # Store "Input" for prediction
        metadata_display = {key: value if value is not None else "" for key, value in metadata.items() if key != "Input"}  # Handle None values
        
        # Predict directory for the file using the "Input" field
        predicted_directory = predict_directory(metadata)
        metadata_display["Predicted Directory"] = predicted_directory if predicted_directory else "Unknown"
        
        data.append(metadata_display)
        
        # Calculate the longest line in the metadata (excluding "Input")
        max_line_length = max(len(f"{key}: {value}") for key, value in metadata_display.items())
        separator = "=" * max_line_length  # Create a separator of the appropriate length
        
        # Display extracted metadata in the desired format (without "Input")
        output_text.insert(tk.END, "Metadata:\n")
        output_text.insert(tk.END, f"{separator}\n")
        for key, value in metadata_display.items():
            output_text.insert(tk.END, f"{key}: {value}\n")
        output_text.insert(tk.END, f"{separator}\n")
    
    # Update the dataframe
    update_dataframe()

def update_dataframe():
    """Update the dataframe with the collected data and display it."""
    df = pd.DataFrame(data, columns=columns)
    print(df)  # Output to the console for verification
    # Optionally save to CSV
    # df.to_csv("file_metadata_with_predictions.csv", index=False)

if __name__ == "__main__":
    # Initialize the TkinterDnD main window
    root = TkinterDnD.Tk()  # Use TkinterDnD for drag-and-drop
    root.title("Drag and Drop Files for Metadata")
    root.geometry("1200x400")

    # Label for instructions
    label = tk.Label(root, text="Drag and Drop Your Files Here", font=("Helvetica", 14))
    label.pack(pady=10)

    # Create a text box to display file metadata
    output_text = tk.Text(root, height=15, wrap=tk.WORD)
    output_text.pack(padx=10, pady=10, fill=tk.BOTH, expand=True)

    # Bind drag-and-drop event to the text box
    root.drop_target_register(DND_FILES)
    root.dnd_bind("<<Drop>>", handle_drop)

    # Start the application
    root.mainloop()




Loading the trained model...
Tokenizer loaded successfully.
Loading the LabelEncoder...
Loading the training configuration...
Extracted metadata for 37_2018_Khosla_Ventures_Seed_L.P._-_Audited_Financial_Statements.pdf: Name: 37_2018_Khosla_Ventures_Seed_L.P._-_Audited_Financial_Statements.pdf, File extension: pdf, Size (MB): 0.67, Created by: AlphaBeta, Last modified by: AlphaBeta, Created date: 2024-11-25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
                                                Name File extension  \
0  37_2018_Khosla_Ventures_Seed_L.P._-_Audited_Fi...            pdf   

   Size (MB) Created by Last modified by  Input  \
0       0.67  AlphaBeta        AlphaBeta    NaN   

                                 Predicted Directory  
0  FundDocumentation > Shared Documents > a. Due ...  
