In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from tensorflow.keras.layers import Dense, LSTM, ReLU, Flatten, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#reading data
file = open("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt", encoding='utf-8', errors='ignore').read().split('\n')

In [4]:
#size of data
len(file)

54215

# Data Preprocessing

In [5]:
#initialize the stemmer and lemmentizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [6]:
def split_data(texts):
    genre = []
    desc = []
    for text in texts:
        splited = text.split(":::")
        genre.append(splited[-2])
        desc.append(splited[-1])
    return genre, desc

In [7]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,!"(\)\/[\]]', ' ', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain   

In [8]:
#preprocessing  text 
def remove_stop_words(data):
  corpus = []
  for i in range(0, len(data)):
    #split the sentence
    plain = full_form(data[i])
    sentence = plain.split()
    
    #check and stem the word
    review_processed = [stemmer.stem(word) for word in sentence if not word in stopwords.words('english')]
    #rebuild the sentence
    review_joint = ' '.join(review_processed)
    #add the sentence into list
    corpus.append(review_joint)
  return corpus

In [9]:
genre, desc = split_data(file[0:5000])

In [10]:
processed_desc = remove_stop_words(desc)

In [11]:
#sample length for getting overall idea for max length of sentence
length = [len(processed_desc[i].split()) for i in range(40)]
length

[54,
 15,
 56,
 109,
 58,
 170,
 33,
 31,
 42,
 43,
 37,
 18,
 73,
 29,
 47,
 26,
 33,
 48,
 206,
 28,
 35,
 19,
 88,
 46,
 28,
 30,
 56,
 28,
 34,
 74,
 43,
 63,
 65,
 18,
 57,
 41,
 63,
 41,
 24,
 57]

In [12]:
#filtering the genre
def filter_text(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,!"(\)\/[\]]', '', text)
    return plain

In [13]:
filtered_genre = []
for gnre in genre:
    filtered_genre.append(filter_text(gnre))

In [14]:
len(filtered_genre), len(genre)

(5000, 5000)

# Vectorization

In [15]:
#vocabulary size
voc_size = 10000

In [16]:
#convert into one hot vector
onehot_text = [one_hot(word, voc_size) for word in processed_desc]

In [17]:
#unique genre count
unique_count_genre = len(set(genre))
unique_count_genre

27

In [18]:
#y train data
onehot_genre = [one_hot(char,27) for char in filtered_genre]

In [19]:
for i in range(40):
    print(f'{i} - {onehot_genre[i]} --> {filtered_genre[i]}')

0 - [6] -->  drama 
1 - [19] -->  thriller 
2 - [8] -->  adult 
3 - [6] -->  drama 
4 - [6] -->  drama 
5 - [18] -->  documentary 
6 - [6] -->  comedy 
7 - [5] -->  crime 
8 - [1] -->  realitytv 
9 - [23] -->  horror 
10 - [18] -->  documentary 
11 - [6] -->  drama 
12 - [18] -->  documentary 
13 - [19] -->  thriller 
14 - [6] -->  drama 
15 - [6] -->  drama 
16 - [6] -->  comedy 
17 - [18] -->  documentary 
18 - [25] -->  sport 
19 - [13] -->  animation 
20 - [6] -->  drama 
21 - [6] -->  comedy 
22 - [6] -->  comedy 
23 - [6] -->  drama 
24 - [12] -->  action 
25 - [16] -->  fantasy 
26 - [22] -->  short 
27 - [11] -->  scifi 
28 - [19] -->  thriller 
29 - [18] -->  documentary 
30 - [23] -->  horror 
31 - [18] -->  documentary 
32 - [12] -->  action 
33 - [18] -->  documentary 
34 - [17] -->  music 
35 - [6] -->  comedy 
36 - [6] -->  drama 
37 - [6] -->  drama 
38 - [6] -->  comedy 
39 - [6] -->  comedy 


# Embedding

In [20]:
#sentence length
sent_length = 250

In [21]:
#embedding
embedd_docs = pad_sequences(onehot_text, padding='post', maxlen=sent_length)

In [22]:
embedd_docs.shape[1]

250

In [23]:
#shape of data
len(embedd_docs), len(onehot_genre)

(5000, 5000)

In [24]:
#convert into numpy array
X_data = np.array(embedd_docs)
y_data = np.array(onehot_genre)

In [25]:
X_data.shape, y_data.shape

((5000, 250), (5000, 1))

In [26]:
#split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

In [27]:
#make y train array like categorical [000000100000]
y_train = to_categorical(y_train, num_classes=27)
y_test = to_categorical(y_test, num_classes=27)

In [28]:
#shape of train and test data
print(f'training: {X_train.shape} - {y_train.shape}')
print(f'testing: {X_test.shape} - {y_test.shape}')

training: (3500, 250) - (3500, 27)
testing: (1500, 250) - (1500, 27)


# Model

In [29]:
#embed vector that represents each token by 40 featured vector
embed_vect = 50
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(250,1)))
#model.add(LSTM(512, return_sequences=True))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.2))
model.add(Dense(27, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 250, 512)          1052672   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 512)         0         
 ropout1D)                                                       
                                                                 
 lstm_1 (LSTM)               (None, 100)               245200    
                                                                 
 dense (Dense)               (None, 27)                2727      
                                                                 
Total params: 1,300,599
Trainable params: 1,300,599
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f04f41dc210>