In [None]:
# Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Make Google Drive visible to this Colab runtime; the dataset paths used
# later in the notebook all live under this mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the train, test, and test solution data.
#
# The raw files separate fields with ':::' surrounded by padding spaces.
# A multi-character `sep` is interpreted as a regular expression by the python
# parsing engine, so the pattern below consumes that padding together with the
# delimiter. Splitting on a bare ':::' instead leaves the spaces inside every
# field, which turns the genre labels into values such as ' drama '.
DATA_DIR = '/content/drive/MyDrive/CodSoft/Genre Classification Dataset'
FIELD_SEP = r'\s*:::\s*'

# Labelled training rows: Id, Title, Genre, Description
train_data = pd.read_csv(DATA_DIR + '/train_data.txt',
                         sep=FIELD_SEP, names=['Id', 'Title', 'Genre', 'Description'], engine='python')
# Unlabelled test rows: Id, Title, Description (no Genre column)
test_data = pd.read_csv(DATA_DIR + '/test_data.txt',
                        sep=FIELD_SEP, names=['Id', 'Title', 'Description'], engine='python')
# Ground-truth genres for the test rows, same layout as the training file
test_solution = pd.read_csv(DATA_DIR + '/test_data_solution.txt',
                            sep=FIELD_SEP, names=['Id', 'Title', 'Genre', 'Description'], engine='python')


In [None]:
# Write each parsed dataframe back to Drive as a comma-separated file
# (row index omitted) so later runs can load them without custom parsing.
for frame, csv_path in (
    (train_data, '/content/drive/MyDrive/CodSoft/Genre Classification Dataset/train_data.csv'),
    (test_data, '/content/drive/MyDrive/CodSoft/Genre Classification Dataset/test_data.csv'),
    (test_solution, '/content/drive/MyDrive/CodSoft/Genre Classification Dataset/test_data_solution.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Sanity check: show the column index of every dataframe to confirm the
# names passed to read_csv were applied as intended.
for label, frame in (
    ("Train Data Columns:", train_data),
    ("Test Data Columns:", test_data),
    ("Test Solution Data Columns:", test_solution),
):
    print(label, frame.columns)

Train Data Columns: Index(['Id', 'Title', 'Genre', 'Description'], dtype='object')
Test Data Columns: Index(['Id', 'Title', 'Description'], dtype='object')
Test Solution Data Columns: Index(['Id', 'Title', 'Genre', 'Description'], dtype='object')


In [None]:
# Hold out 20% of the labelled rows as a validation set.
# The fixed random_state makes the split repeatable across runs.
descriptions = train_data['Description']
genres = train_data['Genre']
X_train, X_val, y_train, y_val = train_test_split(
    descriptions,
    genres,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Assemble the classifier: raw description text is converted to TF-IDF
# features, which are then fed to a multinomial Naive Bayes model.
tfidf_step = TfidfVectorizer()
naive_bayes_step = MultinomialNB()
model = make_pipeline(tfidf_step, naive_bayes_step)

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the training split only,
# so the validation rows stay unseen until evaluation.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict a genre for every held-out validation description
y_val_pred = model.predict(X=X_val)

In [None]:
# Evaluate the model on the validation set: overall accuracy, per-genre
# precision/recall/F1, and the confusion matrix.
#
# Several genres are never predicted by the model, which makes their precision
# undefined. zero_division=0 reports those scores as 0.0 (the same value shown
# by default) without emitting an undefined-metric warning for each of them.
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred, zero_division=0))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

Validation Accuracy: 0.4457253527621507


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Classification Report:
                precision    recall  f1-score   support

      action        0.00      0.00      0.00       263
       adult        0.00      0.00      0.00       112
   adventure        0.00      0.00      0.00       139
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        61
      comedy        0.66      0.03      0.06      1443
       crime        0.00      0.00      0.00       107
 documentary        0.54      0.90      0.67      2659
       drama        0.38      0.89      0.53      2697
      family        0.00      0.00      0.00       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.00      0.00      0.00        40
     history        0.00      0.00      0.00        45
      horror        0.00      0.00      0.00       431
       music        0.00      0.00      0.00       144
     musical        0.00      0.00      0.00        50
     mystery        0.00     

In [None]:
# Run the fitted pipeline over the descriptions of the unlabelled test file
test_descriptions = test_data['Description']
y_test_pred = model.predict(test_descriptions)

In [None]:
# Evaluate the model on the test set against the genres in the solution file.
# NOTE(review): this compares rows positionally, so it assumes test_solution
# lists the same rows in the same order as test_data — TODO confirm.
#
# Several genres are never predicted by the model, which makes their precision
# undefined. zero_division=0 reports those scores as 0.0 (the same value shown
# by default) without emitting an undefined-metric warning for each of them.
print("Test Accuracy:", accuracy_score(test_solution['Genre'], y_test_pred))
print("Test Classification Report:\n", classification_report(test_solution['Genre'], y_test_pred, zero_division=0))
print("Test Confusion Matrix:\n", confusion_matrix(test_solution['Genre'], y_test_pred))

Test Accuracy: 0.4418819188191882


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Classification Report:
                precision    recall  f1-score   support

      action        0.00      0.00      0.00      1314
       adult        0.00      0.00      0.00       590
   adventure        0.00      0.00      0.00       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.62      0.03      0.05      7446
       crime        0.00      0.00      0.00       505
 documentary        0.54      0.89      0.67     13096
       drama        0.38      0.88      0.53     13612
      family        0.00      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.00      0.00      0.00       193
     history        0.00      0.00      0.00       243
      horror        0.00      0.00      0.00      2204
       music        0.00      0.00      0.00       731
     musical        0.00      0.00      0.00       276
     mystery        0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Quick look at the predicted test labels; long arrays are abbreviated with
# '...' in the printed output, so only the first and last entries are shown.
print(y_test_pred)

[' drama ' ' drama ' ' documentary ' ... ' drama ' ' drama '
 ' documentary ']


In [None]:
# Attach the predicted genre to each test row and persist the result to Drive
# (row index omitted) for further analysis.
predictions_path = '/content/drive/MyDrive/CodSoft/Genre Classification Dataset/test_data_predictions.csv'

test_data['Predicted_Genre'] = y_test_pred
test_data.to_csv(predictions_path, index=False)

print("Predictions have been saved to CSV.")

Predictions have been saved to CSV.
