In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, LSTM, ReLU, Flatten, Embedding, SpatialDropout1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#reading data
file = open("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt", encoding='utf-8', errors='ignore').read().split('\n')

In [4]:
#size of data
len(file)

54215

In [5]:
len(file[0].split(':::'))

4

# Creating Dataset

In [6]:
df = pd.DataFrame()

In [7]:
def split_data(texts):
    genre = []
    desc = []
    for text in texts:
        if len(text.split(':::')) == 4:
            splited = text.split(":::")
            genre.append(splited[-2])
            desc.append(splited[-1])
    return genre, desc

In [8]:
genre, desc = split_data(file)

In [9]:
#size of data
len(genre), len(desc)

(54214, 54214)

In [10]:
#creating dataframe for data
df['genre'] = genre
df['desc'] = desc

In [11]:
df.head()

Unnamed: 0,genre,desc
0,drama,Listening in to a conversation between his do...
1,thriller,A brother and sister with a past incestuous r...
2,adult,As the bus empties the students for their fie...
3,drama,To help their unemployed father make ends mee...
4,drama,The film's title refers not only to the un-re...


In [12]:
df['desc'][0]

' Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.'

# Data Preprocessing

In [13]:
#initialize the stemmer and lemmentizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [14]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,!"(\)\/[\]]', ' ', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain   

In [15]:
#preprocessing  text 
def remove_stop_words(data):
  corpus = []
  for i in range(0, len(data)):
    #split the sentence
    plain = full_form(data[i])
    sentence = plain.split()
    
    #check and stem the word
    review_processed = [stemmer.stem(word) for word in sentence if not word in stopwords.words('english')]
    #rebuild the sentence
    review_joint = ' '.join(review_processed)
    #add the sentence into list
    corpus.append(review_joint)
  return corpus

In [16]:
processed_desc = remove_stop_words(df['desc'])

In [17]:
#sample length for getting overall idea for max length of sentence
length = [len(processed_desc[i].split()) for i in range(40)]
length

[54,
 15,
 56,
 109,
 58,
 170,
 33,
 31,
 42,
 43,
 37,
 18,
 73,
 29,
 47,
 26,
 33,
 48,
 206,
 28,
 35,
 19,
 88,
 46,
 28,
 30,
 56,
 28,
 34,
 74,
 43,
 63,
 65,
 18,
 57,
 41,
 63,
 41,
 24,
 57]

In [18]:
#filtering the genre
def filter_text(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,@#$%&*!"(\)\/[\]]', '', text)
    return plain

In [19]:
filtered_genre = []
for gnre in df['genre']:
    filtered_genre.append(filter_text(gnre))

In [20]:
len(filtered_genre), len(genre)

(54214, 54214)

In [21]:
df['preprocessed_desc'] = processed_desc
df['preprocessed_genre'] = filtered_genre

In [22]:
df.shape

(54214, 4)

# Embedding

In [23]:
#sentence length
sent_length = 250
#vocabulary size
voc_size = 10000

In [24]:
#tokenize
tokenizer = Tokenizer(num_words=voc_size, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['desc'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 138537 unique tokens.


In [25]:
X = tokenizer.texts_to_sequences(df['desc'].values)
X = pad_sequences(X, maxlen=sent_length)
print('Shape of data tensor:', X.shape)    


Shape of data tensor: (54214, 250)


In [26]:
Y = pd.get_dummies(df['genre']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (54214, 27)


In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(43371, 250) (43371, 27)
(10843, 250) (10843, 27)


# Model

In [28]:
#embed vector that represents each token by 40 featured vector
embed_vect = 200
model = Sequential()
model.add(Embedding(voc_size, embed_vect, input_length=sent_length))
#model.add(LSTM(512, return_sequences=True))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.2))
model.add(Dense(27, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 200)          2000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 200)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               120400    
                                                                 
 dense (Dense)               (None, 27)                2727      
                                                                 
Total params: 2,123,127
Trainable params: 2,123,127
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=3, batch_size=32)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7936344f50>

# Evaluate

In [30]:
model.evaluate(X_test, Y_test)



[1.496853232383728, 0.5627593994140625]

In [31]:
df['desc'][0], df['genre'][0]

(' Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.',
 ' drama ')

In [32]:
new_complaint = df['desc'][0] 
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=sent_length)
pred = model.predict(padded)
labels = df['genre'][0]
print(pred)

[[0.04116823 0.05446128 0.01961859 ... 0.04585551 0.04626844 0.04835559]
 [0.0253008  0.04637397 0.02434156 ... 0.03832643 0.03027985 0.01775897]
 [0.04600786 0.05198115 0.02848066 ... 0.0372168  0.03151401 0.0488551 ]
 ...
 [0.02902771 0.03451932 0.02422602 ... 0.03589349 0.03206141 0.01806073]
 [0.0543987  0.05176242 0.02624742 ... 0.05546601 0.04964005 0.06691119]
 [0.04116824 0.05446128 0.01961859 ... 0.04585551 0.04626844 0.04835559]]


In [33]:
len(pred[0])

27