# Text Classification using BiLSTM - Genre Classification Dataset IMDB
Dataset - https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb

# Converting data into dataframe

In [1]:
import numpy as np
import pandas as pd

file_path_train = 'genre_classification_train.txt'
file_path_test = 'genre_classification_test.txt'

# Read the data from the file and split it into lines
with open(file_path_train, 'r', encoding='utf-8') as file:
    lines_train = file.readlines()
with open(file_path_test, 'r', encoding='utf-8') as file:
    lines_test = file.readlines()

# Initialize lists to store data
movie_names_train = []
genres_train = []
reviews_train = []
movie_names_test = []
genres_test = []
reviews_test = []

# Process each line and extract relevant information
for line in lines_train:
    parts = line.split(' ::: ')
    movie_names_train.append(parts[1])
    genres_train.append(parts[2])
    reviews_train.append(parts[3].strip())
for line in lines_test:
    parts = line.split(' ::: ')
    movie_names_test.append(parts[1])
    genres_test.append(parts[2])
    reviews_test.append(parts[3].strip())

# Create a DataFrame
df1 = pd.DataFrame({
    'Movie Name': movie_names_train,
    'Genre': genres_train,
    'Review': reviews_train
})
df2 = pd.DataFrame({
    'Movie Name': movie_names_test,
    'Genre': genres_test,
    'Review': reviews_test
})

In [2]:
df1.head()

Unnamed: 0,Movie Name,Genre,Review
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


# Checking for null values and duplicates

In [3]:
null_values1 = df1.isnull().sum()
null_values2 = df2.isnull().sum()
print(null_values1)
print(null_values2)

Movie Name    0
Genre         0
Review        0
dtype: int64
Movie Name    0
Genre         0
Review        0
dtype: int64


In [4]:
duplicate_rows1 = df1.duplicated().sum()
duplicate_rows2 = df2.duplicated().sum()
print(duplicate_rows1)
print(duplicate_rows2)

0
0


# Text Preprocessing
remove numbers, lowercasing, remove html tags, remove urls, remove punctuation, chat word treatment, removing stop words, handling emojis, tokenization, lemmatization

In [5]:
import nltk
import re
import html
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Chat word treatment (customize as needed)
    chat_words_map = {'gr8': 'great', 'u': 'you', 'r': 'are', 'lol': 'laughing out loud'}
    text = ' '.join(chat_words_map.get(word, word) for word in text.split())

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the words back into a string
    text = ' '.join(words)

    return text


In [7]:
df1['Review'] = df1['Review'].apply(preprocess_text)
df2['Review'] = df2['Review'].apply(preprocess_text)

  text = BeautifulSoup(text, 'html.parser').get_text()


In [8]:
df1['Review'].fillna('',inplace=True)
df2['Review'].fillna('',inplace=True)

In [9]:
df1['Genre'].value_counts().index

Index(['drama', 'documentary', 'comedy', 'short', 'horror', 'thriller',
       'action', 'western', 'reality-tv', 'family', 'adventure', 'music',
       'romance', 'sci-fi', 'adult', 'crime', 'animation', 'sport',
       'talk-show', 'fantasy', 'mystery', 'musical', 'biography', 'history',
       'game-show', 'news', 'war'],
      dtype='object')

# Replacing "genre" with integer values
drama -> 1
action -> 2 and so on

In [10]:
i = 1;
mp = {}
for genres in df1['Genre'].value_counts().index:
    mp[genres] = i;
    i = i+1;
def func(text):
    return mp[text]
df1['Genre'] = df1['Genre'].apply(func)
df2['Genre'] = df2['Genre'].apply(func)

In [11]:
df1.head()

Unnamed: 0,Movie Name,Genre,Review
0,Oscar et la dame rose (2009),1,listening conversation doctor parent yearold o...
1,Cupid (1997),6,brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",15,bus empty student field trip museum natural hi...
3,The Secret Sin (1915),1,help unemployed father make end meet edith twi...
4,The Unrecovered (2007),1,film title refers unrecovered body ground zero...


# Train test split

In [12]:
x_train = df1['Review'].tolist()
y_train = df1['Genre'].tolist()
x_test = df2['Review'].tolist()
y_test = df2['Genre'].tolist()

# Converting textual data into vectors

Find total number of unique genres

In [13]:
unique_genre = 0;
unique_genre = df1['Genre'].value_counts().index.size
unique_genre

27

In [14]:
# The Tokenizer is configured to keep the top 20,000 most frequent words in the vocabulary.
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=20000)

In [15]:
tokenizer.fit_on_texts(x_train)

In [16]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [17]:
from keras.utils import to_categorical

In [18]:
y_train = to_categorical(y_train, unique_genre+1)
y_test = to_categorical(y_test, unique_genre+1)

In [19]:
y_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [20]:
# Finding maxlen
maxlen = 0;
for i in range(len(x_train)):
    maxlen = max(maxlen,len(x_train[i]))
print(maxlen)

1054


Post Padding

In [21]:
from keras.utils import pad_sequences
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen+100)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen+100)

In [22]:
x_train

array([[ 4676,  1092,   286, ...,     0,     0,     0],
       [   64,   119,    91, ...,     0,     0,     0],
       [ 1443,  1815,   115, ...,     0,     0,     0],
       ...,
       [ 3768,   233,   681, ...,     0,     0,     0],
       [ 2075, 12143,    58, ...,     0,     0,     0],
       [ 2296,    20,  1002, ...,     0,     0,     0]], dtype=int32)

In [23]:
y_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [24]:
x_train.shape

(54214, 1154)

In [25]:
y_train.shape

(54214, 28)

# Model Building

In [26]:
from keras import Sequential
from keras.layers import Dense,Embedding,LSTM,Bidirectional

In [31]:
from keras.layers import Dropout
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=10, input_length=x_train.shape[1])) # every word will be represented using 10 numbers
model.add(Bidirectional(LSTM(128,return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64,return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32,return_sequences=False)))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1],activation='softmax'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1154, 10)          200000    
                                                                 
 bidirectional_5 (Bidirecti  (None, 1154, 256)         142336    
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 1154, 256)         0         
                                                                 
 bidirectional_6 (Bidirecti  (None, 1154, 128)         164352    
 onal)                                                           
                                                                 
 dropout_4 (Dropout)         (None, 1154, 128)         0         
                                                                 
 bidirectional_7 (Bidirecti  (None, 64)               

In [32]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [39]:
from keras.callbacks import EarlyStopping

batch_size = 30
early_stop = EarlyStopping(monitor = 'val_loss',
                           patience = 5)
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=30,
    callbacks = [early_stop],
    validation_data=(x_test, y_test))

In [40]:
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.show()

In [35]:
def doPrediction(text):
  text = preprocess_text(text)
  tokenizer.fit_on_texts(text)
  text = tokenizer.texts_to_sequences(text)
  text = pad_sequences(text, padding='post', maxlen=maxlen+100)
  predictions = model.predict(text)
  predicted_class_index = predictions.argmax(axis=-1)
  val = predicted_class_index[0]
  for key,value in mp.items():
    if value==val:
      return key;

# Prediction

In [41]:
# Prediction 1
text = "Zahra's biggest dream is to gain education and become a midwife. But since her father Reza is already looking forward to marry her off as soon as she reaches the puberty, and believes that going to school is a shame for the girl of her age, she would have to struggle a lot to make her dream come true. Neither she nor her father expects the shame to fall on the family, after Zahra is being sexually assaulted on her way from school and then forced by the village elders into marriage with the culprit, as per customary law. Her decision to elope will put Reza in debts for years and push his elder son, Zaker, to reach for extreme means, resulting in the tragedy they could never expect. The story - set up in a remote village of Afghanistan - is a call for action to put the end to discrimination against girls and respect their rights, including right to education, reasoning that education of girls turns out to be the best investment of the community."
doPrediction(text)
# Expected -> drama

drama


In [42]:
# Prediction 2
text = "An old actress accepts a challenging role to interpret an unknown composer, who died in oblivion, whose radical work could have revolutionized the music. In an immersion trip to the hometown of her character, the actress meets a mysterious man, 35 years younger."
doPrediction(text)
# Expected -> romance

action


In [43]:
# Prediction 3
text = "Lenny Parker, a mysterious stranger becomes the landlord of a local pub. A ruthless gang who work for local businessman Ray Gleeson decide they want to take over the pub, spurred on by the fact that Lenny's barmaid Terri is the ex of Ray's psychotic son Dominic Gleeson. Dominic is out for revenge after being neglected by Terri, Lenny is the only obstacle standing in his way as Terri and her daughter Peewee find themselves drawn to Lenny. When Dominic and the gang go too far and attack Terri, Lenny is forced to face his darkest fears and go back to his old ways to protect his pub and newfound family - but how far will he go?"
doPrediction(text)
# Expected -> thriller

thriller
