In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Function to balance classes using oversampling
def balance_classes(data, major_class, minority_classes, oversampler):
    # Separate features and target variable
    X = data['Description']
    y = data['Genre']

    # Initialize oversampler
    oversampler = RandomOverSampler(sampling_strategy='minority')

    # Upsample the minority classes
    upsampled_data, upsampled_labels = oversampler.fit_resample(X.values.reshape(-1, 1), y)

    # Convert the oversampled data and labels back to a DataFrame
    upsampled_df = pd.DataFrame(upsampled_data, columns=['Description'])
    upsampled_df['Genre'] = upsampled_labels

    return upsampled_df

# Read the data
data = pd.read_csv('/content/train_data.txt', sep=':::', header=None, names=['Movie', 'Title', 'Genre', 'Description'])

print("\nMissing values:\n", data.isnull().sum())

print("\nClass Distribution:\n", data['Genre'].value_counts())

# Check for empty documents
empty_documents = data['Description'].apply(lambda x: len(str(x).split()) == 0)
print("\nNumber of empty documents:", empty_documents.sum())

# Drop empty documents
data = data[~empty_documents]

# Update the oversampling function call
upsampled_data = balance_classes(data, 'drama', ['documentary'], RandomOverSampler())

# Separate features and target variable in the upsampled data
X_train_upsampled = upsampled_data['Description']
y_train_upsampled = upsampled_data['Genre']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_upsampled, y_train_upsampled, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vectorized)

# Evaluate the model
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


  data = pd.read_csv('/content/train_data.txt', sep=':::', header=None, names=['Movie', 'Title', 'Genre', 'Description'])



Missing values:
 Movie          0
Title          0
Genre          0
Description    0
dtype: int64

Class Distribution:
  drama           13613
 documentary     13096
 comedy           7447
 short            5073
 horror           2204
 thriller         1591
 action           1315
 western          1032
 reality-tv        884
 family            784
 adventure         775
 music             731
 romance           672
 sci-fi            647
 adult             590
 crime             505
 animation         498
 sport             432
 talk-show         391
 fantasy           323
 mystery           319
 musical           277
 biography         265
 history           243
 game-show         194
 news              181
 war               132
Name: Genre, dtype: int64

Number of empty documents: 0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
                precision    recall  f1-score   support

      action        0.59      0.07      0.12       240
       adult        0.89      0.07      0.13       110
   adventure        0.67      0.04      0.07       165
   animation        0.00      0.00      0.00        93
   biography        0.00      0.00      0.00        54
      comedy        0.51      0.42      0.46      1482
       crime        0.00      0.00      0.00       119
 documentary        0.57      0.86      0.69      2600
       drama        0.45      0.84      0.59      2649
      family        0.00      0.00      0.00       157
     fantasy        0.00      0.00      0.00        67
   game-show        1.00      0.20      0.33        35
     history        0.00      0.00      0.00        47
      horror        0.68      0.31      0.42       459
       music        0.70      0.05      0.09       147
     musical        0.00      0.00      0.00        39
     mystery        0.00      0.00     