In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import io
import os
import time
from deep_translator import GoogleTranslator
from gtts import gTTS
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import base64
from io import BytesIO

# Download NLTK resources
try:
    nltk.data.find('corpus/stopwords')
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpus/wordnet')
    nltk.data.find('corpus/omw-1.4')
except LookupError:
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

# Initialize session state
session_state = {
    "preprocessed_data": None,
    "model": None,
    "vectorizer": None,
    "multilabel_binarizer": None,
    "evaluation_metrics": None,
    "confusion_matrix": None,
    "translated_summaries": {},
    "audio_files": {},
    "top_genres": None,
    "has_trained": False,
    "auto_process": False
}

# Define paths to data files
plot_summaries_path = "attached_assets/plot_summaries.txt"
metadata_path = "attached_assets/movie.metadata.tsv"

# Maximum height limit in pixels (below WebP limit of 16383)
MAX_HEIGHT_PIXELS = 15000

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def load_and_preprocess_data(plot_summaries_path, metadata_path):
    with open(plot_summaries_path, 'r', encoding='utf-8') as f:
        plot_summaries_content = f.read()
    
    plot_summaries_data = []
    for line in plot_summaries_content.strip().split('\n'):
        parts = line.split('\t')
        if len(parts) >= 2:
            movie_id = parts[0]
            summary = parts[1]
            plot_summaries_data.append([movie_id, summary])
    
    df_summaries = pd.DataFrame(plot_summaries_data, columns=['movie_id', 'summary'])
    
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata_content = f.read()
    
    metadata_data = []
    for line in metadata_content.strip().split('\n'):
        parts = line.split('\t')
        if len(parts) >= 9:
            movie_id = parts[0]
            genres = parts[8].split()
            genres = [g.strip('{}') for g in genres]
            metadata_data.append([movie_id, genres])
    
    df_metadata = pd.DataFrame(metadata_data, columns=['movie_id', 'genres'])
    
    df = pd.merge(df_summaries, df_metadata, on='movie_id')
    df['processed_summary'] = df['summary'].apply(preprocess_text)
    df = df[df['processed_summary'].str.strip() != '']
    df = df[df['genres'].apply(len) > 0]
    
    unique_genres = set()
    for genre_list in df['genres']:
        unique_genres.update(genre_list)
    
    selected_movies = []
    genre_count = {genre: 0 for genre in unique_genres}
    
    for idx, row in df.iterrows():
        movie_id = row['movie_id']
        genres = row['genres']
        needs_more = False
        for genre in genres:
            if genre_count[genre] < 5:
                needs_more = True
                break
        if needs_more:
            selected_movies.append(movie_id)
            for genre in genres:
                genre_count[genre] += 1
    
    if len(selected_movies) < 500:
        remaining = df[~df['movie_id'].isin(selected_movies)]
        additional = min(500 - len(selected_movies), len(remaining))
        additional_ids = remaining.head(additional)['movie_id'].tolist()
        selected_movies.extend(additional_ids)
    
    df = df[df['movie_id'].isin(selected_movies)]
    df_processed = df[['movie_id', 'summary', 'processed_summary', 'genres']]
    
    X = df_processed['processed_summary']
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df_processed['genres'])
    
    vectorizer = TfidfVectorizer(max_features=5000)
    X_tfidf = vectorizer.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
    
    return df_processed, X_train, X_test, y_train, y_test, vectorizer, mlb

def translate_text(text, target_language):
    try:
        translator = GoogleTranslator(source='auto', target=target_language)
        chunks = [text[i:i+4500] for i in range(0, len(text), 4500)]
        translated_chunks = []
        for chunk in chunks:
            time.sleep(0.5)
            translation = translator.translate(chunk)
            translated_chunks.append(translation)
        return ' '.join(translated_chunks)
    except Exception as e:
        return f"Translation error: {str(e)}"

def get_language_name(language_code):
    language_map = {'ar': 'Arabic', 'ur': 'Urdu', 'ko': 'Korean'}
    return language_map.get(language_code, 'Unknown')

def text_to_speech(text, language_code):
    debug_info = f"Input text: {text[:100]}...\nLanguage code: {language_code}\n"
    try:
        if not text or not isinstance(text, str):
            debug_info += "Error: Input text is empty or invalid.\n"
            raise ValueError("Input text is empty or invalid")
        
        language_map = {'ar': 'ar', 'ur': 'ur', 'ko': 'ko'}
        tts_lang = language_map.get(language_code, 'en')
        debug_info += f"Mapped TTS language: {tts_lang}\n"
        
        audio_bytes = io.BytesIO()
        tts = gTTS(text=text, lang=tts_lang)
        tts.write_to_fp(audio_bytes)
        audio_bytes.seek(0)
        audio_data = audio_bytes.getvalue()
        
        debug_info += f"Audio generated successfully, size: {len(audio_data)} bytes\n"
        return audio_data, debug_info
    except Exception as e:
        debug_info += f"Error generating audio: {str(e)}\n"
        error_audio = io.BytesIO()
        error_message = f"Error generating audio: {str(e)}"
        error_tts = gTTS(text=error_message, lang='en')
        error_tts.write_to_fp(error_audio)
        error_audio.seek(0)
        audio_data = error_audio.getvalue()
        debug_info += f"Error audio generated, size: {len(audio_data)} bytes\n"
        return audio_data, debug_info

def train_genre_model(X_train, X_test, y_train, y_test, model_type='Logistic Regression', hyperparams=None):
    if hyperparams is None:
        hyperparams = {}
    
    num_classes = y_train.shape[1]
    active_classes = [np.sum(y_train[:, col] > 0) > 0 for col in range(num_classes)]
    genre_distribution = {i: np.sum(y_train[:, i]) for i in range(num_classes) if active_classes[i]}
    
    debug_info = f"Training with {num_classes} genre classes, {sum(active_classes)} active.\n"
    debug_info += f"Hyperparameters: {hyperparams}\n"
    debug_info += f"Genre distribution: {genre_distribution}\n"
    
    classifiers = []
    for col in range(num_classes):
        if not active_classes[col]:
            classifiers.append(DummyClassifier(strategy='most_frequent'))
        else:
            if model_type == 'Logistic Regression':
                hyperparams.pop('multi_class', None)
                base_model = LogisticRegression(**hyperparams)
            elif model_type == 'Random Forest':
                base_model = RandomForestClassifier(**hyperparams)
            elif model_type == 'Support Vector Machine':
                base_model = SVC(**hyperparams)
            else:
                raise ValueError(f"Unsupported model type: {model_type}")
            classifiers.append(base_model)
    
    model = MultiOutputClassifier(estimator=None)
    model.estimators_ = classifiers
    
    for idx, estimator in enumerate(model.estimators_):
        estimator.fit(X_train, y_train[:, idx])
    
    y_pred = model.predict(X_test)
    
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_micro': precision_score(y_test, y_pred, average='micro', zero_division=0),
        'recall_micro': recall_score(y_test, y_pred, average='micro', zero_division=0),
        'f1_micro': f1_score(y_test, y_pred, average='micro', zero_division=0),
        'precision_macro': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'f1_macro': f1_score(y_test, y_pred, average='macro', zero_division=0)
    }
    
    y_test_flat = y_test.flatten()
    y_pred_flat = y_pred.flatten()
    conf_matrix = confusion_matrix(y_test_flat, y_pred_flat)
    
    return model, metrics, conf_matrix, debug_info

def predict_genre(text, model, vectorizer, mlb):
    text_features = vectorizer.transform([text])
    genre_predictions = model.predict(text_features)
    predicted_genres = mlb.inverse_transform(genre_predictions)
    return [genre for sublist in predicted_genres for genre in sublist]

def get_top_features(vectorizer, model, genre_idx, n=10):
    try:
        feature_names = vectorizer.get_feature_names_out()
        if isinstance(model.estimators_[genre_idx], DummyClassifier):
            return [("No feature importance available for DummyClassifier", 0)]
        if hasattr(model.estimators_[genre_idx], 'feature_importances_'):
            coef = model.estimators_[genre_idx].feature_importances_
        elif hasattr(model.estimators_[genre_idx], 'coef_'):
            coef = model.estimators_[genre_idx].coef_[0]
        else:
            return [("No feature importance available for this model type", 0)]
        feature_coefs = list(zip(feature_names, coef))
        feature_coefs.sort(key=lambda x: abs(x[1]), reverse=True)
        return feature_coefs[:n]
    except Exception as e:
        return [("Error getting features", 0)]

def display_metrics(metrics):
    return f"""
    Accuracy: {metrics['accuracy']:.4f}
    Precision (micro): {metrics['precision_micro']:.4f}
    Recall (micro): {metrics['recall_micro']:.4f}
    F1 Score (micro): {metrics['f1_micro']:.4f}
    Precision (macro): {metrics['precision_macro']:.4f}
    Recall (macro): {metrics['recall_macro']:.4f}
    F1 Score (macro): {metrics['f1_macro']:.4f}
    """

def plot_confusion_matrix(conf_matrix):
    debug_info = f"Confusion matrix shape: {conf_matrix.shape}\nValues:\n{conf_matrix}\n"
    try:
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax, annot_kws={"size": 12})
        ax.set_title("Confusion Matrix (Multi-label)")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        plt.tight_layout()
        height_in_pixels = fig.get_figheight() * fig.dpi
        debug_info += f"Figure height: {height_in_pixels} pixels\n"
        return fig, debug_info
    except Exception as e:
        debug_info += f"Error plotting confusion matrix: {str(e)}\n"
        return None, debug_info

def plot_genre_distribution(genres):
    genre_counts = pd.Series(genres).value_counts()
    debug_info = f"Genre counts:\n{genre_counts.to_dict()}\n"
    try:
        num_items = len(genre_counts)
        base_height = max(6, num_items * 0.3)
        max_height = min(base_height, MAX_HEIGHT_PIXELS / 100)  # Convert to inches (assuming 100 dpi)
        fig, ax = plt.subplots(figsize=(12, max_height))
        sns.barplot(x=genre_counts.values, y=genre_counts.index, ax=ax)
        ax.set_title("Genre Distribution")
        ax.set_xlabel("Count")
        ax.set_ylabel("Genre")
        ax.tick_params(axis='y', labelsize=10 if num_items <= 50 else 8)
        plt.tight_layout()
        height_in_pixels = fig.get_figheight() * fig.dpi
        debug_info += f"Number of genres: {num_items}\nFigure height: {height_in_pixels} pixels\n"
        if height_in_pixels > MAX_HEIGHT_PIXELS:
            debug_info += "Warning: Figure height exceeds WebP limit, capped to maximum allowed.\n"
        return fig, debug_info
    except Exception as e:
        debug_info += f"Error plotting genre distribution: {str(e)}\n"
        return None, debug_info

def auto_process_data():
    if os.path.exists(plot_summaries_path) and os.path.exists(metadata_path) and not session_state["has_trained"] and not session_state["auto_process"]:
        session_state["auto_process"] = True
        df_processed, X_train, X_test, y_train, y_test, vectorizer, mlb = load_and_preprocess_data(
            plot_summaries_path, metadata_path
        )
        session_state["preprocessed_data"] = {
            "df_processed": df_processed,
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }
        session_state["vectorizer"] = vectorizer
        session_state["multilabel_binarizer"] = mlb
        model, evaluation_metrics, conf_matrix, debug_info = train_genre_model(
            X_train, X_test, y_train, y_test, "Logistic Regression", {"C": 1.0, "max_iter": 100, "solver": "liblinear"}
        )
        session_state["model"] = model
        session_state["evaluation_metrics"] = evaluation_metrics
        session_state["confusion_matrix"] = conf_matrix
        session_state["has_trained"] = True
        session_state["top_genres"] = mlb.classes_
        return f"Data processing and model training completed successfully!\n{debug_info}"
    return "Data already processed or files not found."

def welcome_page():
    welcome_text = """
    ## Overview
    Welcome to Filmception - an AI-powered system for processing movie summaries, predicting movie genres, 
    and converting movie summaries into audio formats in multiple languages.
    
    ### Features:
    - Automatically preprocess and clean movie summaries from the CMU Movie Summary dataset
    - Translate movie summaries into multiple languages (Arabic, Urdu, Korean)
    - Convert translated summaries to audio
    - Predict movie genres based on summaries using machine learning
    
    ### How to use:
    1. **Model Training**: Train a machine learning model to predict movie genres
    2. **Explore Features**: Analyze the model features and performance
    3. **Use the App**: Input your own movie summary and explore the features
    4. **Project Report**: View detailed information about the project
    
    The data is automatically processed when you start the application!
    """
    if session_state["preprocessed_data"] is not None:
        df = session_state["preprocessed_data"]["df_processed"]
        all_genres = []
        for genres in df['genres']:
            all_genres.extend(genres)
        metrics = f"""
        Total Movies: {len(df)}
        Unique Genres: {len(set(all_genres))}
        Avg. Genres per Movie: {round(len(all_genres) / len(df), 2)}
        """
        fig, plot_debug = plot_genre_distribution(all_genres)
        status = "✅ Data processing and model training completed successfully!" if session_state["has_trained"] else "Data processing completed. Train a model in the 'Model Training' tab."
        return welcome_text, metrics, fig, status, plot_debug
    return welcome_text, "Data processing in progress...", None, "Processing data...", "No plot data yet."

def train_model_page(c_value, max_iter, solver):
    if session_state["preprocessed_data"] is None:
        return "Please complete data preprocessing first.", None, None, None, "No plot data yet."
    
    hyperparams = {"C": c_value, "max_iter": int(max_iter), "solver": solver}
    X_train = session_state["preprocessed_data"]["X_train"]
    X_test = session_state["preprocessed_data"]["X_test"]
    y_train = session_state["preprocessed_data"]["y_train"]
    y_test = session_state["preprocessed_data"]["y_test"]
    mlb = session_state["multilabel_binarizer"]
    
    model, evaluation_metrics, conf_matrix, debug_info = train_genre_model(
        X_train, X_test, y_train, y_test, "Logistic Regression", hyperparams
    )
    
    session_state["model"] = model
    session_state["evaluation_metrics"] = evaluation_metrics
    session_state["confusion_matrix"] = conf_matrix
    session_state["has_trained"] = True
    session_state["top_genres"] = mlb.classes_
    
    fig, plot_debug = plot_confusion_matrix(conf_matrix)
    return "Model training completed!", display_metrics(evaluation_metrics), fig, debug_info, plot_debug

def explore_features_page(selected_genre):
    if not session_state["has_trained"]:
        return "Please train a model first.", None, None, "No plot data yet."
    
    vectorizer = session_state["vectorizer"]
    model = session_state["model"]
    top_genres = session_state["top_genres"]
    
    debug_info = f"Top genres: {top_genres}\nSelected genre: {selected_genre}"
    
    if top_genres is None or len(top_genres) == 0:
        return f"No genres available. Please train a model first.\n{debug_info}", None, None, "No plot data yet."
    if selected_genre is None or selected_genre not in top_genres:
        return f"Invalid or no genre selected. Please select a valid genre.\n{debug_info}", None, None, "No plot data yet."
    
    genre_idx = np.where(top_genres == selected_genre)[0][0]
    top_words = get_top_features(vectorizer, model, genre_idx, n=20)
    
    if len(top_words) == 1 and top_words[0][0].startswith("No feature"):
        return f"{top_words[0][0]}\n{debug_info}", None, None, "No plot data due to feature extraction error."
    
    top_words_sorted = sorted(top_words, key=lambda x: x[1])
    words = [word for word, coef in top_words_sorted]
    coefs = [coef for word, coef in top_words_sorted]
    
    plot_debug = f"Top words: {words}\nCoefficients: {coefs}\n"
    try:
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.barplot(x=coefs, y=words, hue=words, palette="viridis", ax=ax, legend=False)
        ax.set_title(f"Top 20 Words Associated with {selected_genre}")
        ax.set_xlabel("Coefficient Value")
        ax.tick_params(axis='y', labelsize=8)
        plt.tight_layout()
        height_in_pixels = fig.get_figheight() * fig.dpi
        plot_debug += f"Figure height: {height_in_pixels} pixels\n"
    except Exception as e:
        plot_debug += f"Error plotting top words: {str(e)}\n"
        return f"Error plotting top words.\n{debug_info}", None, None, plot_debug
    
    df = session_state["preprocessed_data"]["df_processed"]
    sample_idx = np.random.randint(0, len(df))
    sample = df.iloc[sample_idx]
    preprocessed_text = preprocess_text(sample['summary'])
    predicted_genres = predict_genre(
        preprocessed_text, model, vectorizer, session_state["multilabel_binarizer"]
    )
    
    sample_text = f"""
    **Movie Summary:** {sample['summary']}
    **Actual Genres:** {', '.join(sample['genres'])}
    **Predicted Genres:** {', '.join(predicted_genres)}
    """
    
    return f"Top words for {selected_genre}\n{debug_info}", fig, sample_text, plot_debug

def use_app_page(summary, action, target_language):
    if not summary:
        return "Please enter a movie summary.", None, None, None, "No plot data due to empty summary."
    
    preprocessed_input = preprocess_text(summary)
    
    if action == "Predict Genre":
        if not session_state["has_trained"]:
            return "Please train a model first.", None, None, None, "No plot data due to untrained model."
        predicted_genres = predict_genre(
            preprocessed_input, session_state["model"], session_state["vectorizer"], session_state["multilabel_binarizer"]
        )
        plot_debug = f"Predicted genres: {predicted_genres}\n"
        try:
            if not predicted_genres:
                plot_debug += "No genres predicted.\n"
                fig, ax = plt.subplots(figsize=(8, 4))
                ax.text(0.5, 0.5, "No genres predicted", ha='center', va='center')
                ax.set_title("Predicted Movie Genres")
                ax.axis('off')
            else:
                num_items = len(predicted_genres)
                base_height = max(4, num_items * 0.5)
                max_height = min(base_height, MAX_HEIGHT_PIXELS / 100)  # Convert to inches (assuming 100 dpi)
                fig, ax = plt.subplots(figsize=(8, max_height))
                ax.barh(range(num_items), [1] * num_items, height=0.5, color='skyblue')
                ax.set_yticks(range(num_items))
                ax.set_yticklabels(predicted_genres, fontsize=10)
                ax.set_title("Predicted Movie Genres")
                ax.set_xlabel("Confidence")
                ax.set_xlim(0, 1.5)
                plt.tight_layout()
            height_in_pixels = fig.get_figheight() * fig.dpi
            plot_debug += f"Number of genres: {num_items}\nFigure height: {height_in_pixels} pixels\n"
            if height_in_pixels > MAX_HEIGHT_PIXELS:
                plot_debug += "Warning: Figure height exceeds WebP limit, capped to maximum allowed.\n"
        except Exception as e:
            plot_debug += f"Error plotting predicted genres: {str(e)}\n"
            return "Error plotting predicted genres.", None, None, None, plot_debug
        return f"Predicted Genres: {', '.join(predicted_genres)}", fig, None, "No audio for genre prediction.", plot_debug
    
    elif action == "Translate & Audio":
        language_codes = {"Arabic": "ar", "Urdu": "ur", "Korean": "ko"}
        language_code = language_codes[target_language]
        
        debug_info = f"Translating to {target_language} (code: {language_code})\n"
        
        if summary not in session_state["translated_summaries"] or language_code not in session_state["translated_summaries"][summary]:
            translated_text = translate_text(summary, language_code)
            session_state["translated_summaries"].setdefault(summary, {})[language_code] = translated_text
        else:
            translated_text = session_state["translated_summaries"][summary][language_code]
        
        debug_info += f"Translated text: {translated_text[:100]}...\n"
        
        if "Translation error" in translated_text:
            return translated_text, None, None, debug_info, "No plot for translation action."
        
        # Clear audio cache to force fresh generation
        key = f"{summary}_{language_code}"
        session_state["audio_files"].pop(key, None)
        
        audio_bytes, audio_debug = text_to_speech(translated_text, language_code)
        session_state["audio_files"][key] = audio_bytes
        debug_info += audio_debug
        
        return f"Translation ({target_language}): {translated_text}", None, audio_bytes, debug_info, "No plot for translation action."

def project_report_page():
    report_text = """
    ## Filmception Project Report
    
    ### Project Overview
    Filmception is an AI-powered tool that combines natural language processing, machine learning, and audio generation 
    to provide a comprehensive solution for movie summary analysis. It automatically processes movie data from the 
    CMU Movie Summary Corpus, extracts patterns and insights from movie plots, and offers multilingual support 
    for translations and audio generation.
    
    ### Core Components
    1. **Data Processing Pipeline**
       - Automatic loading of CMU Movie Summary dataset files
       - Text preprocessing with efficient tokenization and stop-word removal
       - Multi-label genre encoding for machine learning compatibility
    2. **Machine Learning Model**
       - Genre prediction using multi-label classification
       - Support for multiple model types (Logistic Regression, Random Forest, SVM)
       - Feature importance analysis to understand genre-specific keywords
    3. **Multilingual Support**
       - Translation to multiple languages (Arabic, Urdu, Korean)
       - Text-to-speech conversion for accessibility
       - Audio file generation and download
    4. **User Interface**
       - Interactive web application built with Gradio
       - Dynamic visualization of model metrics and features
       - Seamless user experience with automatic data processing
    """
    if session_state["preprocessed_data"] is not None:
        df = session_state["preprocessed_data"]["df_processed"]
        all_genres = []
        for genres in df['genres']:
            all_genres.extend(genres)
        fig, plot_debug = plot_genre_distribution(all_genres)
        metrics = f"""
        Total Movies: {len(df)}
        Unique Genres: {len(set(all_genres))}
        Avg. Genres per Movie: {round(len(all_genres) / len(df), 2)}
        """
        return report_text, fig, metrics, plot_debug
    return report_text, None, "No data available.", "No plot data yet."

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 Filmception")
    gr.Markdown("AI-powered Multilingual movie summary translator and genre classifier")
    
    with gr.Tabs():
        with gr.TabItem("Welcome"):
            welcome_text = gr.Markdown()
            metrics_output = gr.Textbox(label="Dataset Statistics")
            status_output = gr.Textbox(label="Status")
            genre_plot = gr.Plot()
            plot_debug_welcome = gr.Textbox(label="Plot Debug Info")
            
            gr.Button("Load Data").click(
                fn=auto_process_data,
                outputs=status_output
            ).then(
                fn=welcome_page,
                outputs=[welcome_text, metrics_output, genre_plot, status_output, plot_debug_welcome]
            )
        
        with gr.TabItem("Model Training"):
            c_value = gr.Slider(0.1, 10.0, value=1.0, step=0.1, label="C (Regularization parameter)")
            max_iter = gr.Slider(100, 1000, value=100, step=50, label="Maximum Iterations")
            solver = gr.Dropdown(["liblinear", "saga"], label="Solver", value="liblinear")
            train_button = gr.Button("Train Model")
            train_status = gr.Textbox(label="Training Status")
            metrics_output = gr.Textbox(label="Evaluation Metrics")
            conf_matrix_plot = gr.Plot()
            debug_output = gr.Textbox(label="Debug Info")
            plot_debug_train = gr.Textbox(label="Plot Debug Info")
            
            train_button.click(
                fn=train_model_page,
                inputs=[c_value, max_iter, solver],
                outputs=[train_status, metrics_output, conf_matrix_plot, debug_output, plot_debug_train]
            )
        
        with gr.TabItem("Explore Features"):
            genre_selector = gr.Dropdown(choices=[], label="Select Genre", value=None)
            explore_status = gr.Textbox(label="Feature Analysis")
            feature_plot = gr.Plot()
            sample_output = gr.Markdown(label="Sample Prediction")
            plot_debug_explore = gr.Textbox(label="Plot Debug Info")
            
            def update_genre_dropdown():
                if session_state["top_genres"] is not None and len(session_state["top_genres"]) > 0:
                    return gr.update(choices=list(session_state["top_genres"]), value=session_state["top_genres"][0])
                return gr.update(choices=[], value=None)
            
            gr.Button("Refresh Genres").click(
                fn=update_genre_dropdown,
                outputs=genre_selector
            ).then(
                fn=explore_features_page,
                inputs=genre_selector,
                outputs=[explore_status, feature_plot, sample_output, plot_debug_explore]
            )
        
        with gr.TabItem("Use the App"):
            summary_input = gr.Textbox(lines=5, label="Enter a Movie Summary")
            action_selector = gr.Dropdown(["Predict Genre", "Translate & Audio"], label="Action")
            language_selector = gr.Dropdown(["Arabic", "Urdu", "Korean"], label="Target Language")
            app_button = gr.Button("Process")
            app_status = gr.Textbox(label="Result")
            app_plot = gr.Plot()
            audio_output = gr.Audio(label="Audio Playback")
            audio_debug = gr.Textbox(label="Audio Debug Info")
            plot_debug_app = gr.Textbox(label="Plot Debug Info")
            
            app_button.click(
                fn=use_app_page,
                inputs=[summary_input, action_selector, language_selector],
                outputs=[app_status, app_plot, audio_output, audio_debug, plot_debug_app]
            )
        
        with gr.TabItem("Project Report"):
            report_text = gr.Markdown()
            report_plot = gr.Plot()
            report_metrics = gr.Textbox(label="Dataset Statistics")
            plot_debug_report = gr.Textbox(label="Plot Debug Info")
            
            gr.Button("Load Report").click(
                fn=project_report_page,
                outputs=[report_text, report_plot, report_metrics, plot_debug_report]
            )

demo.launch()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hassa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hassa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hassa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hassa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


