# Genre Classification using BERT
 - Genre Classification Dataset IMDB
Dataset - https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb

# Converting data into dataframe

In [71]:
import numpy as np
import pandas as pd

file_path_train = '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt'
file_path_test = '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt'

# Read the data from the file and split it into lines
with open(file_path_train, 'r', encoding='utf-8') as file:
    lines_train = file.readlines()
with open(file_path_test, 'r', encoding='utf-8') as file:
    lines_test = file.readlines()

# Initialize lists to store data
movie_names_train = []
genres_train = []
reviews_train = []
movie_names_test = []
genres_test = []
reviews_test = []

# Process each line and extract relevant information
for line in lines_train:
    parts = line.split(' ::: ')
    movie_names_train.append(parts[1])
    genres_train.append(parts[2])
    reviews_train.append(parts[3].strip())
for line in lines_test:
    parts = line.split(' ::: ')
    movie_names_test.append(parts[1])
    genres_test.append(parts[2])
    reviews_test.append(parts[3].strip())

# Create a DataFrame
df1 = pd.DataFrame({
    'Movie Name': movie_names_train,
    'Genre': genres_train,
    'Review': reviews_train
})
df2 = pd.DataFrame({
    'Movie Name': movie_names_test,
    'Genre': genres_test,
    'Review': reviews_test
})

In [72]:
df1.head()

Unnamed: 0,Movie Name,Genre,Review
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


# Checking for null values and duplicates

In [73]:
null_values1 = df1.isnull().sum()
null_values2 = df2.isnull().sum()
print(null_values1)
print(null_values2)

Movie Name    0
Genre         0
Review        0
dtype: int64
Movie Name    0
Genre         0
Review        0
dtype: int64


In [74]:
duplicate_rows1 = df1.duplicated().sum()
duplicate_rows2 = df2.duplicated().sum()
print(duplicate_rows1)
print(duplicate_rows2)

0
0


# Text Preprocessing
remove numbers, lowercasing, remove html tags, remove urls, remove punctuation, chat word treatment, removing stop words, handling emojis, tokenization, lemmatization

In [75]:
import nltk
import re
import html
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Chat word treatment (customize as needed)
    chat_words_map = {'gr8': 'great', 'u': 'you', 'r': 'are', 'lol': 'laughing out loud'}
    text = ' '.join(chat_words_map.get(word, word) for word in text.split())
    
    words = word_tokenize(text)
    words = [chat_words_map.get(word, word) for word in words]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Join the words back into a string
    text = ' '.join(words)

    return text


In [81]:
df1['Review'] = df1['Review'].apply(preprocess_text)
df2['Review'] = df2['Review'].apply(preprocess_text)

  text = BeautifulSoup(text, 'html.parser').get_text()


In [83]:
df1['Review'].fillna('',inplace=True)
df2['Review'].fillna('',inplace=True)

In [84]:
df1['Genre'].value_counts().index

Index(['drama', 'documentary', 'comedy', 'short', 'horror', 'thriller',
       'action', 'western', 'reality-tv', 'family', 'adventure', 'music',
       'romance', 'sci-fi', 'adult', 'crime', 'animation', 'sport',
       'talk-show', 'fantasy', 'mystery', 'musical', 'biography', 'history',
       'game-show', 'news', 'war'],
      dtype='object', name='Genre')

# Replacing "genre" with integer values
drama -> 1
action -> 2 and so on

In [85]:
i = 1;
mp = {}
for genres in df1['Genre'].value_counts().index:
    mp[genres] = i;
    i = i+1;
def func(text):
    return mp[text]
df1['Genre'] = df1['Genre'].apply(func)
df2['Genre'] = df2['Genre'].apply(func)

In [86]:
df1.head()

Unnamed: 0,Movie Name,Genre,Review
0,Oscar et la dame rose (2009),1,listening conversation doctor parents yearold ...
1,Cupid (1997),6,brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",15,bus empties students field trip museum natural...
3,The Secret Sin (1915),1,help unemployed father make ends meet edith tw...
4,The Unrecovered (2007),1,films title refers unrecovered bodies ground z...


In [87]:
max(df1['Genre'])

27

In [88]:
x_train = df1['Review']
y_train = df1['Genre']
x_test = df2['Review']
y_test = df2['Genre']

In [89]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, num_classes=28)
y_test = to_categorical(y_test, num_classes=28)

Installing tensorflow text

In [90]:
!pip install tensorflow-text



In [91]:
import tensorflow_hub as hub
import tensorflow_text as text

# Links to pretrained BERT models and their associated preprocessing components
**bert_preprocess** : This is the link to a BERT preprocessing module. It's designed to handle text input and prepare it for consumption by a BERT model.

**bert_encoder ***:  This is the link to the actual BERT model. It consists of pre-trained weights and architecture for a BERT model with a specific configuration. The model is trained on English text (hence "bert_en_uncased").

In [92]:
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [93]:
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

# Brief Revision of BERT before we go ahead
input text -> bert_preprocess -> output-> bert_encoder(model) -> final_output(vector)(input) -> Neural Network Layers

input text -> BERT -> 768 dimension vector -> Neural Network Layers

In [94]:
input_text = ['My name is Rahul','I love programming']
final_output = bert_encoder(bert_preprocess(input_text))
final_output.keys()

dict_keys(['sequence_output', 'default', 'pooled_output', 'encoder_outputs'])

In [95]:
final_output['pooled_output']

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.9196574 , -0.61891806, -0.89399797, ..., -0.8988503 ,
        -0.80019486,  0.92012835],
       [-0.8454432 , -0.3008423 , -0.1001676 , ...,  0.14011891,
        -0.6209183 ,  0.8614851 ]], dtype=float32)>

(2 , 768), we have two input_texts and each is expressed using (1 x 768) dimension

We can even see cosine similarity

In [96]:
input_text = ['Cat','fevicol','phone','computer']
final_o= bert_encoder(bert_preprocess(input_text))['pooled_output']

In [97]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([final_o[0]],[final_o[1]])

array([[0.93107873]], dtype=float32)

In [98]:
cosine_similarity([final_o[2]],[final_o[3]])

array([[0.9635618]], dtype=float32)

# Coming back to original code:
Instead of Sequential Model we will use Functional Model

In [99]:
import tensorflow as tf

In [100]:
# BERT Layers

text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

print(outputs['pooled_output'].shape)

# Neural Network Layers
first = tf.keras.layers.Dropout(0.1,name='dropout')(outputs['pooled_output'])
second = tf.keras.layers.Dense(28,activation='softmax',name='output')(first)

# Construct the final model
model = tf.keras.Model(inputs=[text_input], outputs=[second])

(None, 768)


In [103]:
metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=metrics)

In [104]:
model.fit(x_train,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b677e664520>

In order to improve Precision you can apply Improvements : 
1. Balancing the dataset, 
2. applying lemmatization, 
3. increase number of layers in Neural Network, 
4. increase number of epochs

In [105]:
model.evaluate(x_test,y_test)



[1.4628050327301025, 0.971804141998291, 0.710128903388977, 0.35571956634521484]

In [106]:
y_pred = model.predict(x_test)
y_pred



array([[1.3002587e-09, 2.3652449e-01, 1.3048794e-02, ..., 1.7465820e-05,
        4.6980629e-05, 9.0049209e-05],
       [1.2543445e-09, 5.2806872e-01, 1.6188630e-01, ..., 1.2361156e-04,
        1.5781524e-04, 1.1256406e-03],
       [4.6588233e-10, 2.1422847e-01, 6.7257762e-01, ..., 4.6716439e-05,
        2.2795009e-03, 1.0213321e-03],
       ...,
       [2.2257744e-10, 5.4606032e-02, 8.9485468e-03, ..., 7.6968056e-05,
        2.9863711e-04, 9.3365852e-05],
       [3.9886334e-09, 9.4711795e-02, 1.0615982e-01, ..., 7.9094392e-04,
        4.2500962e-03, 1.7319227e-04],
       [4.4762072e-10, 1.6834521e-01, 5.1772457e-01, ..., 1.0304673e-05,
        3.9264723e-04, 2.5974287e-04]], dtype=float32)

In [107]:
y_pred_copy = y_pred

In [108]:
for i in range(0,len(y_pred_copy)):
    maximum_value = max(y_pred_copy[i]);
    for j in range(0,len(y_pred_copy[i])):
        if y_pred_copy[i][j]!=maximum_value:
            y_pred_copy[i][j] = 0;
        else:
            y_pred_copy[i][j] = 1;

In [110]:
y_pred_copy.shape

(54200, 28)

In [111]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print(f'Accuracy: {accuracy}')

Accuracy: 0.548929889298893


In [121]:
def return_genre(value1):
    for key,value in mp.items():
        if value==value1:
            return key;

# Prediction 1

In [126]:
text = """The goal of this documentary is to inform people about the harsh and very real realities of fat shaming and fat hatred - to expose how fat hatred permeates our popular culture, spreading the message that fat is bad and in turn forwarding the idea that being cruel, unkind or downright unjust to a fat person is acceptable behavior. And then, to inspire change."""
text = preprocess_text(text)
text = [text]
pred1 = model.predict(text)
max_val = max(pred1[0])
for i in range(0,len(pred1[0])):
    if pred1[0][i] == max_val:
        print(return_genre(i))
        break;
# Expected -> documentary

documentary


# Prediction 2

In [127]:
text = """'THE BEAR' is a romantic comedy based on an ANTON CHEKHOV play of the same name.GREG comes to collect a debt owed to him by the widow POP OVA'S late husband.They argue , duel with pistols , and fall in love.All of this is witnessed by POP OVA'S SERVANT LUKA"""
text = preprocess_text(text)
text = [text]
pred1 = model.predict(text)
max_val = max(pred1[0])
for i in range(0,len(pred1[0])):
    if pred1[0][i] == max_val:
        print(return_genre(i))
        break;
# Expected -> comedy

comedy
