In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from tensorflow.keras.layers import Dense, LSTM, ReLU, Flatten, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#reading data
file = open("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt", encoding='utf-8', errors='ignore').read().split('\n')

In [4]:
#size of data
len(file)

54215

# Creating Dataset

In [5]:
df = pd.DataFrame()

In [6]:
def split_data(texts):
    genre = []
    desc = []
    for text in texts:
        splited = text.split(":::")
        genre.append(splited[-2])
        desc.append(splited[-1])
    return genre, desc

In [7]:
genre, desc = split_data(file[0:10000])

In [8]:
#size of data
len(genre), len(desc)

(10000, 10000)

In [9]:
#creating dataframe for data
df['genre'] = genre
df['desc'] = desc

In [10]:
df.head()

Unnamed: 0,genre,desc
0,drama,Listening in to a conversation between his do...
1,thriller,A brother and sister with a past incestuous r...
2,adult,As the bus empties the students for their fie...
3,drama,To help their unemployed father make ends mee...
4,drama,The film's title refers not only to the un-re...


In [11]:
df['desc'][0]

' Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.'

# Data Preprocessing

In [12]:
#initialize the stemmer and lemmentizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [13]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,!"(\)\/[\]]', ' ', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain   

In [14]:
#preprocessing  text 
def remove_stop_words(data):
  corpus = []
  for i in range(0, len(data)):
    #split the sentence
    plain = full_form(data[i])
    sentence = plain.split()
    
    #check and stem the word
    review_processed = [stemmer.stem(word) for word in sentence if not word in stopwords.words('english')]
    #rebuild the sentence
    review_joint = ' '.join(review_processed)
    #add the sentence into list
    corpus.append(review_joint)
  return corpus

In [15]:
processed_desc = remove_stop_words(df['desc'])

In [16]:
#sample length for getting overall idea for max length of sentence
length = [len(processed_desc[i].split()) for i in range(40)]
length

[54,
 15,
 56,
 109,
 58,
 170,
 33,
 31,
 42,
 43,
 37,
 18,
 73,
 29,
 47,
 26,
 33,
 48,
 206,
 28,
 35,
 19,
 88,
 46,
 28,
 30,
 56,
 28,
 34,
 74,
 43,
 63,
 65,
 18,
 57,
 41,
 63,
 41,
 24,
 57]

In [17]:
#filtering the genre
def filter_text(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,@#$%&*!"(\)\/[\]]', '', text)
    return plain

In [18]:
filtered_genre = []
for gnre in df['genre']:
    filtered_genre.append(filter_text(gnre))

In [19]:
len(filtered_genre), len(genre)

(10000, 10000)

In [20]:
df['preprocessed_desc'] = processed_desc
df['preprocessed_genre'] = filtered_genre

In [21]:
df.shape

(10000, 4)

# Vectorization

In [22]:
#vocabulary size
voc_size = 10000

In [23]:
#convert into one hot vector
onehot_text = [one_hot(word, voc_size) for word in df['preprocessed_desc']]

In [24]:
print(len(onehot_text[0]))
print(len(df['preprocessed_desc'][0].split()))

54
54


In [25]:
#unique values of genre
len(df['preprocessed_genre'].unique())

27

In [26]:
#y train data
onehot_genre = [one_hot(char,27) for char in df['preprocessed_genre']]

In [27]:
for i in range(40):
    print(f'{i} - {onehot_genre[i]} --> {filtered_genre[i]}')

0 - [11] -->  drama 
1 - [17] -->  thriller 
2 - [17] -->  adult 
3 - [11] -->  drama 
4 - [11] -->  drama 
5 - [16] -->  documentary 
6 - [23] -->  comedy 
7 - [7] -->  crime 
8 - [3] -->  realitytv 
9 - [7] -->  horror 
10 - [16] -->  documentary 
11 - [11] -->  drama 
12 - [16] -->  documentary 
13 - [17] -->  thriller 
14 - [11] -->  drama 
15 - [11] -->  drama 
16 - [23] -->  comedy 
17 - [16] -->  documentary 
18 - [23] -->  sport 
19 - [1] -->  animation 
20 - [11] -->  drama 
21 - [23] -->  comedy 
22 - [23] -->  comedy 
23 - [11] -->  drama 
24 - [3] -->  action 
25 - [24] -->  fantasy 
26 - [1] -->  short 
27 - [19] -->  scifi 
28 - [17] -->  thriller 
29 - [16] -->  documentary 
30 - [7] -->  horror 
31 - [16] -->  documentary 
32 - [3] -->  action 
33 - [16] -->  documentary 
34 - [15] -->  music 
35 - [23] -->  comedy 
36 - [11] -->  drama 
37 - [11] -->  drama 
38 - [23] -->  comedy 
39 - [23] -->  comedy 


# Embedding

In [28]:
#sentence length
sent_length = 250

In [29]:
#embedding
embedd_docs = pad_sequences(onehot_text, padding='pre', maxlen=sent_length)

In [30]:
embedd_docs.shape[1]

250

In [31]:
#shape of data
len(embedd_docs), len(onehot_genre)

(10000, 10000)

In [32]:
#convert into numpy array
X_data = np.array(embedd_docs)
y_data = np.array(onehot_genre)

In [33]:
X_data.shape, y_data.shape

((10000, 250), (10000, 1))

In [34]:
#split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3)

In [35]:
#make y train array like categorical [000000100000]
y_train = to_categorical(y_train, num_classes=27)
y_test = to_categorical(y_test, num_classes=27)

In [36]:
#shape of train and test data
print(f'training: {X_train.shape} - {y_train.shape}')
print(f'testing: {X_test.shape} - {y_test.shape}')

training: (7000, 250) - (7000, 27)
testing: (3000, 250) - (3000, 27)


# Model

In [37]:
#embed vector that represents each token by 40 featured vector
embed_vect = 200
model = Sequential()
model.add(Embedding(voc_size, embed_vect, input_length=sent_length))
model.add(LSTM(512, return_sequences=True, input_shape=(250,1)))
#model.add(LSTM(512, return_sequences=True))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.2))
model.add(Dense(27, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 200)          2000000   
                                                                 
 lstm (LSTM)                 (None, 250, 512)          1460224   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 512)         0         
 ropout1D)                                                       
                                                                 
 lstm_1 (LSTM)               (None, 100)               245200    
                                                                 
 dense (Dense)               (None, 27)                2727      
                                                                 
Total params: 3,708,151
Trainable params: 3,708,151
Non-trainable params: 0
______________________________________________

In [38]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f153c288d90>

In [39]:
model.evaluate(X_test, y_test)



[3.4058990478515625, 0.3316666781902313]