In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile
import os

# Assuming your zip file is in 'MyDrive/train_empyema.zip'
# Replace with the correct path if it's different
zip_path = '/content/drive/MyDrive/MovieSummaries.zip'
extract_path = '/content/MovieSummaries' # specify the destination path

# Check if the zip file exists
if os.path.exists(zip_path):
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
    print(f"Successfully unzipped '{zip_path}' to '{extract_path}'")
else:
  print(f"Error: Zip file not found at '{zip_path}'")

Successfully unzipped '/content/drive/MyDrive/MovieSummaries.zip' to '/content/MovieSummaries'


In [3]:
import pandas as pd

# Replace with your actual file path
file_path = '/content/MovieSummaries/MovieSummaries/plot_summaries.txt'

# Read the plot summaries file (tab-separated)
df = pd.read_csv(file_path, sep='\t', header=None, names=['movie_id', 'summary'])

# Preview first 5 rows
df.head()


Unnamed: 0,movie_id,summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [4]:
import re
import string

def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply the cleaning function
df['clean_summary'] = df['summary'].astype(str).apply(clean_text)

# Preview cleaned summaries
df[['summary', 'clean_summary']].head()

Unnamed: 0,summary,clean_summary
0,"Shlykov, a hard-working taxi driver and Lyosha...",shlykov a hardworking taxi driver and lyosha a...
1,The nation of Panem consists of a wealthy Capi...,the nation of panem consists of a wealthy capi...
2,Poovalli Induchoodan is sentenced for six yea...,poovalli induchoodan is sentenced for six year...
3,"The Lemon Drop Kid , a New York City swindler,...",the lemon drop kid a new york city swindler is...
4,Seventh-day Adventist Church pastor Michael Ch...,seventhday adventist church pastor michael cha...


In [5]:
# Use spaCy's pipe for faster batch processing
import spacy

# Load NLP model
nlp = spacy.load("en_core_web_sm")

texts = df['clean_summary'].astype(str).tolist()
processed = []

for doc in nlp.pipe(texts, batch_size=50):
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    processed.append(' '.join(tokens))

# Assign to DataFrame
df['processed_summary'] = processed

# Preview results
df[['clean_summary', 'processed_summary']].head()

Unnamed: 0,clean_summary,processed_summary
0,shlykov a hardworking taxi driver and lyosha a...,shlykov hardworke taxi driver lyosha saxophoni...
1,the nation of panem consists of a wealthy capi...,nation panem consist wealthy capitol poor dist...
2,poovalli induchoodan is sentenced for six year...,poovalli induchoodan sentence year prison life...
3,the lemon drop kid a new york city swindler is...,lemon drop kid new york city swindler illegall...
4,seventhday adventist church pastor michael cha...,seventhday adventist church pastor michael cha...


In [6]:
# Save cleaned + lemmatized summaries to drive
df[['movie_id', 'processed_summary']].to_csv('/content/MovieSummaries/MovieSummaries/processed_summaries.csv', index=False)


In [7]:
import pandas as pd

# Path to your metadata file on Drive
metadata_path = '/content/MovieSummaries/MovieSummaries/movie.metadata.tsv'

# Load file (tab-separated, no headers)
metadata = pd.read_csv(metadata_path, sep='\t', header=None)

# Preview shape and some values
metadata.shape, metadata.head()


((81741, 9),
           0           1                                                  2  \
 0    975900   /m/03vyhn                                     Ghosts of Mars   
 1   3196793   /m/08yl5d  Getting Away with Murder: The JonBenét Ramsey ...   
 2  28463795  /m/0crgdbh                                        Brun bitter   
 3   9363483  /m/0285_cd                                   White Of The Eye   
 4    261236   /m/01mrr1                                  A Woman in Flames   
 
             3           4      5                                   6  \
 0  2001-08-24  14010832.0   98.0  {"/m/02h40lc": "English Language"}   
 1  2000-02-16         NaN   95.0  {"/m/02h40lc": "English Language"}   
 2        1988         NaN   83.0  {"/m/05f_3": "Norwegian Language"}   
 3        1987         NaN  110.0  {"/m/02h40lc": "English Language"}   
 4        1983         NaN  106.0   {"/m/04306rv": "German Language"}   
 
                                            7  \
 0  {"/m/09c7w0": "Uni

In [8]:
# Column 0 = movie_id, Column 8 = genres (in JSON string format)
metadata = metadata[[0, 8]]
metadata.columns = ['movie_id', 'genres']

In [9]:
import ast

# Convert JSON string to dictionary safely
metadata['genres'] = metadata['genres'].apply(lambda x: ast.literal_eval(str(x)) if pd.notna(x) else {})

# Extract genre values and join as comma-separated string
def extract_genres(genre_dict):
    return ', '.join(list(genre_dict.values()))

metadata['genre_list'] = metadata['genres'].apply(extract_genres)

# Preview extracted genres
metadata[['movie_id', 'genre_list']].head()

Unnamed: 0,movie_id,genre_list
0,975900,"Thriller, Science Fiction, Horror, Adventure, ..."
1,3196793,"Mystery, Biographical film, Drama, Crime Drama"
2,28463795,"Crime Fiction, Drama"
3,9363483,"Thriller, Erotic thriller, Psychological thriller"
4,261236,Drama


In [10]:
summaries = pd.read_csv('/content/MovieSummaries/MovieSummaries/processed_summaries.csv')


In [11]:
# Make sure movie_id types match
summaries['movie_id'] = summaries['movie_id'].astype(str)
metadata['movie_id'] = metadata['movie_id'].astype(str)

# Merge
merged_df = pd.merge(summaries, metadata[['movie_id', 'genre_list']], on='movie_id')

# Preview result
merged_df.head()

Unnamed: 0,movie_id,processed_summary,genre_list
0,23890098,shlykov hardworke taxi driver lyosha saxophoni...,"Drama, World cinema"
1,31186339,nation panem consist wealthy capitol poor dist...,"Action/Adventure, Science Fiction, Action, Drama"
2,20663735,poovalli induchoodan sentence year prison life...,"Musical, Action, Drama, Bollywood"
3,2231378,lemon drop kid new york city swindler illegall...,"Screwball comedy, Comedy"
4,595909,seventhday adventist church pastor michael cha...,"Crime Fiction, Drama, Docudrama, World cinema,..."


In [12]:
merged_df.to_csv('/content/MovieSummaries/MovieSummaries/final_dataset.csv', index=False)


In [13]:
from transformers import MarianMTModel, MarianTokenizer

# MarianMT models for translation
lang_models = {
    'arabic': 'Helsinki-NLP/opus-mt-en-ar',
    'urdu': 'Helsinki-NLP/opus-mt-en-ur',
    'french': 'Helsinki-NLP/opus-mt-en-fr'
}

# gTTS language codes
gtts_map = {
    'arabic': 'ar',
    'urdu': 'ur',
    'french': 'fr'
}


# Load model/tokenizer for a given language
def load_model(lang):
    model_name = lang_models[lang]
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

In [14]:
import torch

def translate(text, tokenizer, model):
    batch = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
    gen = model.generate(**batch)
    translated = tokenizer.batch_decode(gen, skip_special_tokens=True)
    return translated[0]

In [15]:
# Load data
df = pd.read_csv('/content/MovieSummaries/MovieSummaries/final_dataset.csv')

# Take only first 50 rows
df = df.head(50)

# Arabic translation
tokenizer_ar, model_ar = load_model('arabic')

df['summary_arabic'] = df['processed_summary'].apply(lambda x: translate(x, tokenizer_ar, model_ar))
df[['processed_summary', 'summary_arabic']].head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Unnamed: 0,processed_summary,summary_arabic
0,shlykov hardworke taxi driver lyosha saxophoni...,سائق تاكسي سائق تاكسي سائقة تاكسي سائقة تاكسي ...
1,nation panem consist wealthy capitol poor dist...,(ب) بَعْدَ ذلكَ، بَعْدَ ذلكَ بَعْدَ ذلكَ، بَعْ...
2,poovalli induchoodan sentence year prison life...,- - - - - - - - - - - - - - - - - - - - - - - ...
3,lemon drop kid new york city swindler illegall...,(ب) ((((((((((((((((((((((((((((((((((((((((((...
4,seventhday adventist church pastor michael cha...,-الموسم السادس عشر -الحلقة الثالثة - الحلقة ال...


In [16]:
# Urdu translation
tokenizer_ur, model_ur = load_model('urdu')
df['summary_urdu'] = df['processed_summary'].apply(lambda x: translate(x, tokenizer_ur, model_ur))
df[['processed_summary', 'summary_urdu']].head()

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

Unnamed: 0,processed_summary,summary_urdu
0,shlykov hardworke taxi driver lyosha saxophoni...,تعصّب کے باوجود تعصب سے تعلق رکھنے والے لوگوں ...
1,nation panem consist wealthy capitol poor dist...,اِس کی وجہ یہ ہے کہ اِس کی وجہ سے اُنہیں اِس ب...
2,poovalli induchoodan sentence year prison life...,اِس کے علاوہ اِس بات کا بھی دعویٰ کرتے ہیں کہ ...
3,lemon drop kid new york city swindler illegall...,اِس کی وجہ یہ ہے کہ اُس کا کہنا ہے کہ اُس کا ک...
4,seventhday adventist church pastor michael cha...,ساتویں سال کے دوران چرچ کی جماعت میں نو بچوں ک...


In [17]:
# french translation
tokenizer_ko, model_ko = load_model('french')
df['summary_french'] = df['processed_summary'].apply(lambda x: translate(x, tokenizer_ko, model_ko))
df[['processed_summary', 'summary_french']].head()

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

Unnamed: 0,processed_summary,summary_french
0,shlykov hardworke taxi driver lyosha saxophoni...,shlykov travail dur chauffeur de taxi lyosha s...
1,nation panem consist wealthy capitol poor dist...,l'homme l'homme l'homme l'homme l'homme l'homm...
2,poovalli induchoodan sentence year prison life...,Il s'agit d'un homme mort mort mort mort mort ...
3,lemon drop kid new york city swindler illegall...,J'ai besoin d'argent pour le ménage j'ai besoi...
4,seventhday adventist church pastor michael cha...,7ème jour adventiste église pasteur michael ch...


In [18]:
df.to_csv('/content/MovieSummaries/MovieSummaries/translated_summaries.csv', index=False)


In [19]:
import os

# Create folders to store audio files
base_path = '/content/MovieSummaries/MovieSummaries/audio'
os.makedirs(base_path + '/arabic', exist_ok=True)
os.makedirs(base_path + '/urdu', exist_ok=True)
os.makedirs(base_path + '/french', exist_ok=True)


In [20]:
!pip install gtts
from gtts import gTTS

def text_to_audio(text, lang_code, output_path):
    tts = gTTS(text=text, lang=lang_code)
    tts.save(output_path)


Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gtts
  Attempting uninstall: click
    Found existing installation: click 8.2.0
    Uninstalling click-8.2.0:
      Successfully uninstalled click-8.2.0
Successfully installed click-8.1.8 gtts-2.5.4


In [21]:
# Load the translated summaries
df = pd.read_csv('/content/MovieSummaries/MovieSummaries/translated_summaries.csv')

# Only do it for first 50 (already limited earlier)
for idx, row in df.iterrows():
    movie_id = row['movie_id']

    # Arabic
    ar_text = row['summary_arabic']
    text_to_audio(ar_text, 'ar', f'{base_path}/arabic/{movie_id}_ar.mp3')

    # Urdu
    ur_text = row['summary_urdu']
    text_to_audio(ur_text, 'ur', f'{base_path}/urdu/{movie_id}_ur.mp3')

    # french
    ko_text = row['summary_french']
    text_to_audio(ko_text, 'fr', f'{base_path}/french/{movie_id}_ko.mp3')


In [22]:
import pandas as pd

# Load the final dataset with processed summaries + genres
df = pd.read_csv('/content/MovieSummaries/MovieSummaries/final_dataset.csv')

# Preview
df.head()


Unnamed: 0,movie_id,processed_summary,genre_list
0,23890098,shlykov hardworke taxi driver lyosha saxophoni...,"Drama, World cinema"
1,31186339,nation panem consist wealthy capitol poor dist...,"Action/Adventure, Science Fiction, Action, Drama"
2,20663735,poovalli induchoodan sentence year prison life...,"Musical, Action, Drama, Bollywood"
3,2231378,lemon drop kid new york city swindler illegall...,"Screwball comedy, Comedy"
4,595909,seventhday adventist church pastor michael cha...,"Crime Fiction, Drama, Docudrama, World cinema,..."


In [23]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Load the final dataset with processed summaries + genres
df = pd.read_csv('/content/MovieSummaries/MovieSummaries/final_dataset.csv')

# Preview
df.head()

# Split genre string into list
# Add a check to ensure the value is a string before splitting
df['genre_list'] = df['genre_list'].apply(lambda x: x.split(', ') if isinstance(x, str) and pd.notna(x) else [])

# Binarize genres (multi-hot encoding)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genre_list'])

# Save the genre classes for later use
genre_classes = mlb.classes_

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF to convert text into numeric features
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['processed_summary'])

In [28]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Multi-label classifier setup
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)



In [30]:
# Save the trained models and objects
import joblib
import os # Import os if not already imported

save_path = '/content/MovieSummaries/MovieSummaries/'
os.makedirs(save_path, exist_ok=True) # Ensure the directory exists

joblib.dump(model, save_path + 'genre_model.pkl')
joblib.dump(tfidf, save_path + 'tfidf.pkl')
joblib.dump(mlb, save_path + 'label_binarizer.pkl')

print(f"Models and objects saved to {save_path}")

Models and objects saved to /content/MovieSummaries/MovieSummaries/


In [31]:
from sklearn.metrics import accuracy_score, classification_report

# Predict
y_pred = model.predict(X_test)

# Accuracy (not too meaningful in multi-label, but still)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed report
print(classification_report(y_test, y_pred, target_names=genre_classes))

Accuracy: 0.07167397227816609
                                          precision    recall  f1-score   support

                               Absurdism       0.00      0.00      0.00        12
                            Acid western       0.00      0.00      0.00         1
                                  Action       0.64      0.27      0.38      1164
                           Action Comedy       0.00      0.00      0.00        25
                        Action Thrillers       0.00      0.00      0.00        83
                        Action/Adventure       0.57      0.15      0.24       695
                         Addiction Drama       0.00      0.00      0.00         7
                                   Adult       0.00      0.00      0.00        27
                               Adventure       0.69      0.17      0.27       660
                        Adventure Comedy       0.00      0.00      0.00        22
                  Airplanes and airports       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
# Save the trained models and objects
import joblib
import os

save_path = '/content/MovieSummaries/MovieSummaries/'
os.makedirs(save_path, exist_ok=True) # Ensure the directory exists

joblib.dump(model, save_path + 'genre_model.pkl')
joblib.dump(tfidf, save_path + 'tfidf.pkl')
joblib.dump(mlb, save_path + 'label_binarizer.pkl')

print(f"Models and objects saved to {save_path}")

Models and objects saved to /content/MovieSummaries/MovieSummaries/


In [33]:
# 🔧 Step 1: Install required packages
!pip install gradio transformers sentencepiece sacremoses gTTS joblib --quiet
!python -m spacy download en_core_web_sm

# 🎬 Step 2: GUI Version of Filmception
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
from gtts import gTTS
import joblib
import spacy
import re
import string
import os

# Load spaCy
nlp = spacy.load("en_core_web_sm")

# Load saved model files from Google Drive - These should now exist
model = joblib.load('/content/MovieSummaries/MovieSummaries/genre_model.pkl')
tfidf = joblib.load('/content/MovieSummaries/MovieSummaries/tfidf.pkl')
mlb = joblib.load('/content/MovieSummaries/MovieSummaries/label_binarizer.pkl')

# Translation model names
lang_models = {
    'arabic': 'Helsinki-NLP/opus-mt-en-ar',
    'urdu': 'Helsinki-NLP/opus-mt-en-ur',
    'french': 'Helsinki-NLP/opus-mt-en-fr'
}

# gTTS language codes
gtts_lang = {
    'arabic': 'ar',
    'urdu': 'ur',
    'french': 'fr'
}

# Cleaning and preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

def translate(text, lang):
    model_name = lang_models[lang]
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    # Load model_trans inside the function to avoid loading all models at startup
    # potentially causing memory issues, though it might increase latency on first call
    model_trans = MarianMTModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model_trans.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def text_to_audio(text, lang_code='en'):
    tts = gTTS(text=text, lang=lang_code)
    # Use a unique or temporary path if multiple users or instances might run
    path = '/content/translated_audio.mp3'
    tts.save(path)
    return path

# GUI function
def filmception_gui(summary, lang_choice):
    if not summary:
        return "Please enter a summary.", None, ""
    cleaned = clean_text(summary)
    processed = preprocess(cleaned)

    # Only translate if a language is selected
    translated_text = ""
    audio_output_path = None
    if lang_choice:
        translated_text = translate(processed, lang_choice)
        audio_output_path = text_to_audio(translated_text, gtts_lang[lang_choice])
    else:
        translated_text = "No language selected for translation."


    X_input = tfidf.transform([processed])
    prediction = model.predict(X_input)
    genres = mlb.inverse_transform(prediction)[0] # .inverse_transform returns a list of lists, take the first element
    genre_str = ", ".join(genres) if genres else "No genre detected."

    return translated_text, audio_output_path, genre_str


# Gradio interface
demo = gr.Interface(
    fn=filmception_gui,
    inputs=[
        gr.Textbox(label="🎬 Enter Movie Summary"),
        gr.Dropdown(choices=['arabic', 'urdu', 'french'], label="🌐 Select Language") # Make this optional or have a default if genre prediction is independent
    ],
    outputs=[
        gr.Textbox(label="📝 Translated Summary"),
        gr.Audio(label="🔊 Translated Audio", type="filepath"),
        gr.Textbox(label="🎯 Predicted Genres")
    ],
    title="🎥 Filmception - Translate, Listen & Predict Genre",
    description="Paste a movie summary. Choose a language. Get translation, audio & genre prediction."
)


demo.launch()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.1/323.1 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m128.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-a

