# **MOVIE GENRE CLASSIFICATION MODEL**

# Kaggle and Dataset Installation

In [1]:
! pip install -q kaggle

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"harsiniab","key":"56af923de82edd603e3be257f327f183"}'}

In [3]:
!mkdir ~/.kaggle

In [4]:
!cp kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle datasets list

ref                                             title                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------------------  ---------------------------------  -----  -------------------  -------------  ---------  ---------------  
syedanwarafridi/vehicle-sales-data              Vehicle Sales Data                  19MB  2024-02-21 20:16:17           7054        120  1.0              
tarunrm09/climate-change-indicators             Climate change Indicators           34KB  2024-02-22 08:53:54           3481         86  1.0              
nbroad/gemma-rewrite-nbroad                     gemma-rewrite-nbroad                 8MB  2024-03-03 04:52:39            283         44  1.0              
nelgiriyewithana/emotions                       Emotions                            16MB  2024-02-05 16:01:39           5386        155  1.0              
nelgiriyewithana/apple-quality                  Apple Quality         

In [7]:
!kaggle datasets download -d hijest/genre-classification-dataset-imdb

Downloading genre-classification-dataset-imdb.zip to /content
 60% 25.0M/41.7M [00:00<00:00, 64.3MB/s]
100% 41.7M/41.7M [00:00<00:00, 86.8MB/s]


In [8]:
! unzip "genre-classification-dataset-imdb.zip"

Archive:  genre-classification-dataset-imdb.zip
  inflating: Genre Classification Dataset/description.txt  
  inflating: Genre Classification Dataset/test_data.txt  
  inflating: Genre Classification Dataset/test_data_solution.txt  
  inflating: Genre Classification Dataset/train_data.txt  


# Importing libraries

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

In [10]:
# Load NLTK resources
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Data loading

In [11]:
# Load data
train_path = "/content/Genre Classification Dataset/train_data.txt"
test_path = "/content/Genre Classification Dataset/test_data.txt"

In [12]:
#Load train and test data into dataframes
train_data = pd.read_csv(train_path, sep=":::", names=["TITLE", "GENRE", "DESCRIPTION"], engine="python")
test_data = pd.read_csv(test_path, sep=":::", names=["TITLE", "GENRE", "DESCRIPTION"], engine="python")

# Text Cleaning

In [13]:
# Define function for text cleaning
stemmer = LancasterStemmer()
stop_words = set(stopwords.words("english"))


In [14]:
def cleaning_data(text):
    text = text.lower()
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'.pic\S+', '', text)
    text = re.sub(r'[^a-zA-Z+]', ' ', text)
    text = "".join([i for i in text if i not in string.punctuation])
    words = nltk.word_tokenize(text)
    text = " ".join([i for i in words if i not in stop_words and len(i) > 2])
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [15]:
# Apply text cleaning to both train and test dataset
train_data["TextCleaning"] = train_data["DESCRIPTION"].apply(cleaning_data)
test_data["TextCleaning"] = test_data["DESCRIPTION"].apply(cleaning_data)


# Model Building and Training

In [16]:
# Vectorize the text data
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train_data["TextCleaning"])
X_test = vectorizer.transform(test_data["TextCleaning"])


In [17]:
# Define labels
y_train = train_data["GENRE"]


In [18]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [19]:
# Initialize and train Naive Bayes model
nbmodel = MultinomialNB()
nbmodel.fit(X_train, y_train)


In [20]:
# Predict on validation set
y_pred = nbmodel.predict(X_val)

In [21]:
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.4460942543576501


In [24]:
# Function to predict genre from a given description
def predict_genre(description):
    cleaned_text = cleaning_data(description)
    vectorized_text = vectorizer.transform([cleaned_text])
    predicted_genre = nbmodel.predict(vectorized_text)
    return predicted_genre[0]

In [25]:
# Test the function with a sample description
sample_description = "A group of friends embark on a journey to find a lost treasure."
predicted_genre = predict_genre(sample_description)
print("Predicted Genre:", predicted_genre)

Predicted Genre:  drama 
