In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, LSTM, ReLU, Flatten, Embedding, SpatialDropout1D, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#reading data
file = open("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt", encoding='utf-8', errors='ignore').read().split('\n')

In [4]:
#size of data
len(file)

54215

In [5]:
len(file[0].split(':::'))

4

# Creating Dataset

In [6]:
df = pd.DataFrame()

In [7]:
def split_data(texts):
    genre = []
    desc = []
    for text in texts:
        if len(text.split(':::')) == 4:
            splited = text.split(":::")
            genre.append(splited[-2])
            desc.append(splited[-1])
    return genre, desc

In [8]:
genre, desc = split_data(file)

In [9]:
#size of data
len(genre), len(desc)

(54214, 54214)

In [10]:
#creating dataframe for data
df['genre'] = genre
df['desc'] = desc

In [11]:
df.head()

Unnamed: 0,genre,desc
0,drama,Listening in to a conversation between his do...
1,thriller,A brother and sister with a past incestuous r...
2,adult,As the bus empties the students for their fie...
3,drama,To help their unemployed father make ends mee...
4,drama,The film's title refers not only to the un-re...


In [12]:
df['desc'][0]

' Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.'

# Data Preprocessing

In [13]:
#initialize the stemmer and lemmentizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [14]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,!"(\)\/[\]]', ' ', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain   

In [15]:
#preprocessing  text 
def remove_stop_words(data):
  corpus = []
  for i in range(0, len(data)):
    #split the sentence
    plain = full_form(data[i])
    sentence = plain.split()
    
    #check and stem the word
    review_processed = [stemmer.stem(word) for word in sentence if not word in stopwords.words('english')]
    #rebuild the sentence
    review_joint = ' '.join(review_processed)
    #add the sentence into list
    corpus.append(review_joint)
  return corpus

In [16]:
processed_desc = remove_stop_words(df['desc'])

In [17]:
#sample length for getting overall idea for max length of sentence
length = [len(processed_desc[i].split()) for i in range(40)]
length

[54,
 15,
 56,
 109,
 58,
 170,
 33,
 31,
 42,
 43,
 37,
 18,
 73,
 29,
 47,
 26,
 33,
 48,
 206,
 28,
 35,
 19,
 88,
 46,
 28,
 30,
 56,
 28,
 34,
 74,
 43,
 63,
 65,
 18,
 57,
 41,
 63,
 41,
 24,
 57]

In [18]:
#filtering the genre
def filter_text(text):
    text = text.lower()
    plain = re.sub(r'[-<>?\.,@#$%&*!"(\)\/[\]]', '', text)
    return plain

In [19]:
filtered_genre = []
for gnre in df['genre']:
    filtered_genre.append(filter_text(gnre))

In [20]:
len(filtered_genre), len(genre)

(54214, 54214)

In [21]:
df['preprocessed_desc'] = processed_desc
df['preprocessed_genre'] = filtered_genre

In [22]:
# Add a new column with the length of each sentence
df['length'] = df['preprocessed_desc'].apply(lambda x: len(x.split()))

In [23]:
# Sort the DataFrame based on the length column
df = df.sort_values('length')

In [24]:
# Reset the index
df = df.reset_index()

In [25]:
df.head(10)

Unnamed: 0,index,genre,desc,preprocessed_desc,preprocessed_genre,length
0,4754,drama,Details of the plot are being kept under wraps.,detail plot kept wrap,drama,4
1,52224,comedy,A buddy film set in the world of fashion.,buddi film set world fashion,comedy,5
2,12361,documentary,The story of the Irish National Baseball Team.,stori irish nation basebal team,documentary,5
3,40382,drama,The journey of a couple during their first ex...,journey coupl first experi swinger' parti,drama,6
4,30635,comedy,A journalist finds he can see ghosts after be...,journalist find see ghost struck lightn,comedy,6
5,48605,short,Documental sobre canciones de Enric Barbat.,document sobr cancion de enric barbat,short,6
6,23669,short,Documental rodado sobre fotografías fijas de ...,document rodado sobr fotografía fija de venecia,short,7
7,30687,documentary,A portrait of three transsexuals living in th...,portrait three transsexu live islam republ iran,documentary,7
8,47519,short,Documental sobre la Estación de Chamartín (Ma...,document sobr la estación de chamartín madrid,short,7
9,33551,fantasy,A woman realizes that her fate is connected t...,woman realiz fate connect two men soulmat,fantasy,7


In [26]:
df['label'], _ = pd.factorize(df['preprocessed_genre'])

In [27]:
df.shape

(54214, 7)

In [28]:
len(df['label'].unique())

27

In [29]:
df['preprocessed_genre'].unique(), len(df['preprocessed_genre'].unique())

(array([' drama ', ' comedy ', ' documentary ', ' short ', ' fantasy ',
        ' music ', ' musical ', ' adult ', ' biography ', ' scifi ',
        ' action ', ' thriller ', ' animation ', ' western ', ' crime ',
        ' romance ', ' sport ', ' history ', ' horror ', ' talkshow ',
        ' realitytv ', ' adventure ', ' family ', ' mystery ',
        ' gameshow ', ' war ', ' news '], dtype=object),
 27)

# Embedding

In [30]:
#sentence length
sent_length = 250
#vocabulary size
voc_size = 10000

In [31]:
#convert into one hot vector
onehot_text = [one_hot(word, voc_size) for word in df['preprocessed_desc']]

In [32]:
len(onehot_text[0]), len(df['preprocessed_desc'][0].split())   

(4, 4)

In [33]:
#embedding
embedd_docs = pad_sequences(onehot_text, padding='pre', maxlen=sent_length)

In [34]:
embedd_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [35]:
y = df['label'].values

In [36]:
#change y labels into categorical value [1] -> [1 0 0 0 0 0 0...0]
y_label = to_categorical(y)

In [37]:
#shape of data
len(embedd_docs), len(y_label)

(54214, 54214)

In [38]:
#convert into numpy array
X_data = np.array(embedd_docs)
y_data = np.array(y_label)

In [39]:
X_data.shape, y_data.shape

((54214, 250), (54214, 27))

In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X_data,y_data, test_size = 0.2)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(43371, 250) (43371, 27)
(10843, 250) (10843, 27)


In [41]:
print(Y_train[0])
print(len(Y_train[0]))

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
27


# Model

In [42]:
# Creating model
embedding_vector_features=300
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(1024, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dense(27,activation='sigmoid'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 300)          3000000   
                                                                 
 lstm (LSTM)                 (None, 250, 1024)         5427200   
                                                                 
 dropout (Dropout)           (None, 250, 1024)         0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               1311744   
                                                                 
 dense (Dense)               (None, 27)                6939      
                                                                 
Total params: 9,745,883
Trainable params: 9,745,883
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f98183ca7d0>

# Evaluate

In [44]:
model.evaluate(X_test, Y_test)



[2.1787800788879395, 0.4723784923553467]

In [45]:
sample = X_test[0].reshape(1, 250)
sample.shape

(1, 250)

In [46]:
re = (model.predict(sample) > 0.98).astype("int32")



In [47]:
re

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]], dtype=int32)

In [48]:
Y_test[0]

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

# save model

In [49]:
model.save('movie-genre-classification-v2.h5')