Install necessary libraries

In [1]:
!pip install transformers torch



##### Step 2.2: Tokenize the Audio Text Files

In [2]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
import re
import csv

import nltk
nltk.download('stopwords')

def read_text_file(file_path, encoding='utf-8'):
    with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
        return file.read()

def tokenize_text(text):
    tokenizer = RegexpTokenizer(r"\w+")
    stop_words = stopwords.words("english")
    tokens = tokenizer.tokenize(text)
    tokens = [i.lower().strip("_") for i in tokens]
    tokens = [i for i in tokens if i not in stop_words]
    tokens = [i for i in tokens if i != '']
    return tokens

def sanitize_filename(filename):
    base_name = os.path.splitext(filename)[0]
    sanitized = re.sub(r'[^\w\s]', '', base_name)
    sanitized = re.sub(r'\s+', '_', sanitized)
    return sanitized + '.txt'

def process_directory(input_dir, output_dir, csv_output_file):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    data = []
    with open(csv_output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['filename', 'text', 'genre'])  # Include genre column in CSV
        
        for filename in os.listdir(input_dir):
            if filename.endswith('.txt'):
                file_path = os.path.join(input_dir, filename)
                text = read_text_file(file_path)
                tokens = tokenize_text(text)
                sanitized_filename = sanitize_filename(filename)
                output_file = os.path.join(output_dir, sanitized_filename)
                with open(output_file, 'w', encoding='utf-8', errors='ignore') as file:
                    file.write("\n".join(tokens))
                genre = "unknown"  # Placeholder; will replace with actual genre extraction logic
                data.append({'filename': sanitized_filename, 'text': " ".join(tokens), 'genre': genre})
                csvwriter.writerow([sanitized_filename, " ".join(tokens), genre])
                print(f'Processed {filename} -> {sanitized_filename}')
    
    return pd.DataFrame(data)


input_dir = '/kaggle/input/dataset1/input/input'
output_dir = '/kaggle/working/tokenize'
csv_output_file = '/kaggle/working/tokenized_texts.csv'
data_df = process_directory(input_dir, output_dir, csv_output_file)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processed Three_Reasons_La_haine_M2Wl4li8TFo_1.txt -> Three_Reasons_La_haine_M2Wl4li8TFo_1.txt
Processed IL_POSTINO_TRAILER_0or7hSz7gc.txt -> IL_POSTINO_TRAILER_0or7hSz7gc.txt
Processed A_Midwinters_Tale_In_the_Bleak_Midwinter_Trailer_vBS92ui_h74.txt -> A_Midwinters_Tale_In_the_Bleak_Midwinter_Trailer_vBS92ui_h74.txt
Processed Home_For_The_Holidays_Trailer_1995_ThH8WocPRM0.txt -> Home_For_The_Holidays_Trailer_1995_ThH8WocPRM0.txt
Processed Copycat_1995_trailer_lsmXhM4yfU0.txt -> Copycat_1995_trailer_lsmXhM4yfU0.txt
Processed Hackers_Trailer_Ql1uLyuWra8.txt -> Hackers_Trailer_Ql1uLyuWra8.txt
Processed Batman_Forever_1995_Official_Theatrical_Teaser_on_HD_b_KllgySsPo.txt -> Batman_Forever_1995_Official_Theatrical_Teaser_on_HD_b_KllgySsPo.txt
Processed Casino_Official_Trailer_1_1995_HD_EJXDMwGWhoA.txt -> Casino_Official_Trailer_1_1995_HD_EJXDMwGWhoA.txt
Processed How_

##### Step 2.3: Assign Labels Using BERT Embedding, K-Means, and Genre Keywords

In [8]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Get BERT embeddings for each text
embeddings = []
valid_indices = []

for idx, text in enumerate(data_df['text']):
    try:
        embedding = get_bert_embeddings(text)
        embeddings.append(embedding)
        valid_indices.append(idx)
    except Exception as e:
        print(f"Error processing text at index {idx}: {e}")

# Create a new DataFrame with only valid entries
valid_data_df = data_df.iloc[valid_indices].reset_index(drop=True)

# Stack embeddings into a feature matrix
X = np.vstack(embeddings)

# Use KMeans clustering to suggest genres
num_clusters = 20  # Number of genres
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
valid_data_df['cluster'] = kmeans.fit_predict(X)

# Define genre keywords
genre_keywords = {
    'Drama': ['life', 'relationship', 'family', 'emotional'],
    'Comedy': ['funny', 'humor', 'laugh', 'joke'],
    'Thriller': ['suspense', 'mystery', 'crime', 'investigation'],
    'Action': ['fight', 'battle', 'hero', 'adventure'],
    'Romance': ['love', 'romantic', 'affair', 'relationship'],
    'Sci-Fi': ['future', 'space', 'alien', 'technology'],
    'Horror': ['scary', 'fear', 'haunted', 'ghost'],
    'Mystery': ['mystery', 'detective', 'crime', 'investigation'],
    'Fantasy': ['magic', 'wizard', 'dragon', 'kingdom'],
    'Adventure': ['explore', 'journey', 'quest', 'adventure'],
    'Animation': ['animated', 'cartoon', 'character', 'kid'],
    'Family': ['family', 'kid', 'children', 'parent'],
    'Biography': ['life', 'story', 'biography', 'person'],
    'History': ['historical', 'war', 'past', 'history'],
    'Sport': ['sport', 'game', 'competition', 'team'],
    'War': ['war', 'battle', 'military', 'soldier'],
    'Western': ['cowboy', 'western', 'ranch', 'desert'],
    'Crime': ['crime', 'criminal', 'police', 'detective'],
    'Music': ['music', 'band', 'song', 'concert', 'band', 'perform']
}

# Manually inspect a sample from each cluster and assign genres
sample_per_cluster = 5
for cluster in range(num_clusters):
    cluster_texts = valid_data_df[valid_data_df['cluster'] == cluster]['text']
    sample_size = min(sample_per_cluster, len(cluster_texts))
    if sample_size > 0:
        sample_texts = cluster_texts.sample(n=sample_size)
        print(f"Cluster {cluster} Sample Texts:\n", sample_texts)

# Assign genres based on keywords
def assign_genre_based_on_keywords(text, genre_keywords):
    for genre, keywords in genre_keywords.items():
        if any(keyword in text.lower() for keyword in keywords):
            return genre
    return 'Unknown'

valid_data_df['genre'] = valid_data_df['text'].apply(lambda text: assign_genre_based_on_keywords(text, genre_keywords))

# Display the DataFrames
print("DataFrame with clusters and assigned genres:")
print(valid_data_df[['filename', 'text', 'cluster', 'genre']].head())

# Save DataFrame to file
valid_data_df.to_csv('/kaggle/working/processed_data.csv', index=False)

Cluster 0 Sample Texts:
 45     world overlooked long forgotten settled comman...
50     take part call last night came end long journe...
128    visiting bar walked saw face face mean every s...
83     perhaps hitler best known victim anne frank re...
Name: text, dtype: object
Cluster 1 Sample Texts:
 31    person looking quite well known killed 14 peop...
63    air water homes oh god cannot seen cannot hear...
61    live age information sitting perfect beach wor...
25    headed east 72nd towards park new york cop nos...
42    deepest jungles africa two tribes fight savage...
Name: text, dtype: object
Cluster 2 Sample Texts:
 4      turns killer suffering death another human det...
103    certifiable mckay certifiable detective christ...
91     brilliant wearing flu proof perfect crime miam...
127    year 2021 longer safe transmit information pho...
123    like living things see wear blinders sometimes...
Name: text, dtype: object
Cluster 3 Sample Texts:
 94     kids best enemies grow

#### Step 3.1: Partioning Methodology

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the tokenized data
data_df = pd.read_csv('/kaggle/working/processed_data.csv')

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the text data
X = tfidf_vectorizer.fit_transform(data_df['text'])
y = data_df['genre']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data splitting completed:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

Data splitting completed:
Training set: 112 samples
Testing set: 28 samples


#### Step 3.2: Balancing Methodology

In [10]:
from sklearn.utils import resample

# Combine X_train and y_train into a DataFrame for balancing
train_df = pd.DataFrame(X_train.toarray())
train_df['genre'] = y_train.values

# Upsample minority classes
majority_class = train_df['genre'].value_counts().idxmax()
majority_df = train_df[train_df['genre'] == majority_class]

balanced_data = []

for genre in train_df['genre'].unique():
    genre_df = train_df[train_df['genre'] == genre]
    if genre != majority_class:
        upsampled_df = resample(genre_df, replace=True, n_samples=len(majority_df), random_state=42)
        balanced_data.append(upsampled_df)
    else:
        balanced_data.append(genre_df)

balanced_train_df = pd.concat(balanced_data)

# Separate features and labels
X_train_balanced = balanced_train_df.drop('genre', axis=1)
y_train_balanced = balanced_train_df['genre']

# Check balanced distribution
print("Balanced training set genre distribution:")
print(y_train_balanced.value_counts())

Balanced training set genre distribution:
genre
Romance      50
Biography    50
Drama        50
Action       50
Horror       50
Adventure    50
Thriller     50
Unknown      50
Comedy       50
Mystery      50
Crime        50
Sci-Fi       50
Animation    50
Sport        50
History      50
Name: count, dtype: int64


#### Step 4: Model Training ( SVM Classifier)

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Convert training data to dense format
X_train_balanced_dense = X_train_balanced.values

# Hyperparameter tuning
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_balanced_dense, y_train_balanced)

# Best parameters found
best_params = grid_search.best_params_

# Train the SVM classifier with best parameters
svm_classifier = SVC(C=best_params['C'], kernel=best_params['kernel'], random_state=42)
svm_classifier.fit(X_train_balanced_dense, y_train_balanced)

print("Model training completed.")

Model training completed.


#### Step 5: Performance metric calculation (SVM Classifier)

In [12]:
from sklearn.metrics import classification_report, accuracy_score

# Convert test data to dense format
X_test_dense = X_test.toarray()

# Predict on the testing set
y_pred = svm_classifier.predict(X_test_dense)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")
print("Classification report:")
print(report)

# Save the model and vectorizer
import joblib
joblib.dump(svm_classifier, '/kaggle/working/svm_classifier.pkl')
joblib.dump(tfidf_vectorizer, '/kaggle/working/tfidf_vectorizer.pkl')

Model accuracy: 0.46
Classification report:
              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         2
   Animation       0.00      0.00      0.00         1
   Biography       0.00      0.00      0.00         1
      Comedy       0.00      0.00      0.00         2
       Crime       1.00      1.00      1.00         1
       Drama       0.44      1.00      0.62        12
     History       0.00      0.00      0.00         1
     Romance       0.00      0.00      0.00         3
      Sci-Fi       0.00      0.00      0.00         1
       Sport       0.00      0.00      0.00         1
    Thriller       0.00      0.00      0.00         1
     Unknown       0.00      0.00      0.00         2

    accuracy                           0.46        28
   macro avg       0.12      0.17      0.13        28
weighted avg       0.23      0.46      0.30        28



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['/kaggle/working/tfidf_vectorizer.pkl']

#### Step 4, 5: Model Training and performance metric calculation (Random Forest Classifier)

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

# Train the Random Forest classifier with best parameters
rf_classifier = RandomForestClassifier(**grid_search.best_params_, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")
print("Classification report:")
print(report)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'max_depth': 30, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}


  warn(


Model accuracy: 0.54
Classification report:
              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         2
   Animation       0.00      0.00      0.00         1
   Biography       0.00      0.00      0.00         1
      Comedy       0.00      0.00      0.00         2
       Crime       1.00      1.00      1.00         1
       Drama       0.52      1.00      0.69        12
     History       0.00      0.00      0.00         1
     Romance       0.00      0.00      0.00         3
      Sci-Fi       0.00      0.00      0.00         1
       Sport       0.00      0.00      0.00         1
    Thriller       0.00      0.00      0.00         1
     Unknown       0.50      1.00      0.67         2

    accuracy                           0.54        28
   macro avg       0.17      0.25      0.20        28
weighted avg       0.30      0.54      0.38        28



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
