In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [None]:
dataset_path = "/content/drive/MyDrive/Genre Classification Dataset"

# Load train and test data
train_df = pd.read_csv(os.path.join(dataset_path, 'train_data.txt'), sep=' ::: ', engine='python', names=['id', 'title', 'genre', 'description'])
test_df = pd.read_csv(os.path.join(dataset_path, 'test_data.txt'), sep=' ::: ', engine='python', names=['id', 'title', 'description'])
test_solution_df = pd.read_csv(os.path.join(dataset_path, 'test_data_solution.txt'), sep=' ::: ', engine='python', names=['id', 'title', 'genre', 'description'])

print(" Train shape:", train_df.shape)
print(" Test shape:", test_df.shape)
print(" Train sample:")
print(train_df.head())

 Train shape: (54214, 4)
 Test shape: (54200, 3)
 Train sample:
   id                             title     genre  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         description  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  


In [None]:
train_df = train_df.dropna(subset=['genre'])

# Encode genre labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['genre'])

# Save label encoder if needed
import joblib
joblib.dump(label_encoder, "label_encoder.pkl")

# Features: movie descriptions
X_train_text = train_df['description']
X_test_text = test_df['description']

print("Labels encoded:", set(y_train))

Labels encoded: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26)}


In [None]:
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

print("TF-IDF shapes - Train:", X_train_tfidf.shape, "Test:", X_test_tfidf.shape)

TF-IDF shapes - Train: (54214, 10000) Test: (54200, 10000)


In [None]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on train (just as quick check)
y_pred_train_nb = nb_model.predict(X_train_tfidf)
print("NB Train Accuracy:", accuracy_score(y_train, y_pred_train_nb))

NB Train Accuracy: 0.538975172464677


In [None]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

y_pred_train_lr = lr_model.predict(X_train_tfidf)
print("LR Train Accuracy:", accuracy_score(y_train, y_pred_train_lr))


LR Train Accuracy: 0.6883830744826059


In [None]:
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

y_pred_train_svm = svm_model.predict(X_train_tfidf)
print("SVM Train Accuracy:", accuracy_score(y_train, y_pred_train_svm))

SVM Train Accuracy: 0.8482126388017855


In [None]:

y_test_pred_nb = nb_model.predict(X_test_tfidf)
y_test_pred_lr = lr_model.predict(X_test_tfidf)
y_test_pred_svm = svm_model.predict(X_test_tfidf)

genres_nb = label_encoder.inverse_transform(y_test_pred_nb)
genres_lr = label_encoder.inverse_transform(y_test_pred_lr)
genres_svm = label_encoder.inverse_transform(y_test_pred_svm)

print("NB sample predictions:", genres_nb[:5])
print("LR sample predictions:", genres_lr[:5])
print("SVM sample predictions:", genres_svm[:5])


NB sample predictions: ['drama' 'drama' 'documentary' 'drama' 'drama']
LR sample predictions: ['drama' 'drama' 'documentary' 'drama' 'drama']
SVM sample predictions: ['drama' 'drama' 'documentary' 'drama' 'drama']


In [None]:

pd.DataFrame({'id': test_df['id'], 'predicted_genre_nb': genres_nb}) \
  .to_csv(os.path.join(dataset_path, 'predictions_nb.csv'), index=False)

pd.DataFrame({'id': test_df['id'], 'predicted_genre_lr': genres_lr}) \
  .to_csv(os.path.join(dataset_path, 'predictions_lr.csv'), index=False)

pd.DataFrame({'id': test_df['id'], 'predicted_genre_svm': genres_svm}) \
  .to_csv(os.path.join(dataset_path, 'predictions_svm.csv'), index=False)

print(" All predictions and models saved to:", dataset_path)


 All predictions and models saved to: /content/drive/MyDrive/Genre Classification Dataset
