<h3><b>Text Classification using TensorFlow and Gensim</b></h3>

IMPORT REQUIRED PACKAGES

In [4]:
import numpy as np # linear algebra
import pandas as pan # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, Dense, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.utils import pad_sequences

LOAD DATASET

In [5]:
# Load the training split and the test split from their CSV files.
train_csv_path = "sample_data/train.csv"
test_csv_path = "sample_data/test.csv"
data_df = pan.read_csv(train_csv_path)
test_df = pan.read_csv(test_csv_path)

In [6]:
# Preview the first rows of the training frame.
data_df.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [7]:
# Summarise the training frame: column dtypes and non-null counts.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54000 entries, 0 to 53999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54000 non-null  int64 
 1   movie_name  54000 non-null  object
 2   synopsis    54000 non-null  object
 3   genre       54000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


DATA PREPROCESSING

In [8]:
# Preprocessing: discard the "id" column from both frames.
# errors="ignore" keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column has already been removed.
data_df = data_df.drop(columns=["id"], errors="ignore")
test_df = test_df.drop(columns=["id"], errors="ignore")

# Merge the movie_name and synopsis columns into one "All text" column,
# formatted as "<movie_name> : <synopsis>". Plain pandas column arithmetic
# replaces np.vectorize over a lambda. Note that a missing value in either
# column now yields NaN for that row instead of raising a TypeError.
data_df["All text"] = data_df["movie_name"] + " : " + data_df["synopsis"]
test_df["All text"] = test_df["movie_name"] + " : " + test_df["synopsis"]



In [13]:
# Fetch the NLTK resources used by the next cell: the stopword corpus and the
# "punkt" tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
# Remove stopwords and punctuation tokens, then re-join the surviving tokens.
stop_words = nltk.corpus.stopwords.words("english")
stop_words.extend([":", "!", ".", "?", ";", ",", "-"])
# Membership tests against a frozenset are O(1) per token; the list above is
# kept unchanged for any other code that expects it.
stop_word_set = frozenset(stop_words)


def tokenize_and_strip_stops(text, case_sensitive=True):
    """Tokenize ``text`` with NLTK and drop stopword and punctuation tokens.

    Args:
        text: Raw string to clean.
        case_sensitive: When True (the default, matching the original
            behaviour) a token is dropped only if it matches an entry of
            ``stop_words`` exactly, so capitalised forms such as "A" or "The"
            are kept. Pass False to compare on the lower-cased token and
            remove those as well.

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    if case_sensitive:
        kept = [tok for tok in tokens if tok not in stop_word_set]
    else:
        kept = [tok for tok in tokens if tok.lower() not in stop_word_set]
    return " ".join(kept)


# Series.map applies the cleaner row by row without np.vectorize.
data_df["All text"] = data_df["All text"].map(tokenize_and_strip_stops)
test_df["All text"] = test_df["All text"].map(tokenize_and_strip_stops)

# The source columns are now folded into "All text"; errors="ignore" keeps the
# cell re-runnable once they have been removed.
data_df = data_df.drop(columns=["movie_name", "synopsis"], errors="ignore")
test_df = test_df.drop(columns=["movie_name", "synopsis"], errors="ignore")

data_df.head(5)

Unnamed: 0,genre,All text
0,fantasy,Super Me A young scriptwriter starts bringing ...
1,horror,Entity Project A director friends renting haun...
2,family,Behavioral Family Therapy Serious Psychiatric ...
3,scifi,Blood Glacier Scientists working Austrian Alps...
4,action,Apat na anino Buy Day Four Men Widely Apart Li...


CREATE WORD2VEC EMBEDDING

In [16]:
# Model input (cleaned text) and target (genre).
features = data_df["All text"]
label = data_df["genre"]

# gensim expects every document as a list of tokens; the cleaned text is
# space-joined, so a whitespace split recovers the tokens.
features_for_gensim = list(map(str.split, features))

# Train 100-dimensional Word2Vec vectors (sg=1) on the training documents.
w2v_settings = dict(vector_size=100, window=5, min_count=2, workers=4, sg=1)
word_2_vec_learner = gensim.models.Word2Vec(sentences=features_for_gensim, **w2v_settings)

In [None]:
# Inspect the first two tokenised documents passed to Word2Vec.
features_for_gensim[:2]

In [None]:
# Optional sanity check, left disabled: nearest neighbours of "joy" in the
# learned embedding space.
#love_syn = word_2_vec_learner.wv.most_similar("joy")
#love_syn #Beautiful!

In [None]:
# Exploratory: list the attributes and methods of the trained Word2Vec object.
dir(word_2_vec_learner)

In [17]:
# Keep a handle on the model's word vectors (its `wv` attribute) for the
# lookups in the following cells.
vectors = word_2_vec_learner.wv

In [None]:
# Exploratory: list the attributes and methods of the word-vector object.
dir(vectors)

In [24]:
# Confirm the vector size by measuring an actual vector. The first vocabulary
# entry is used instead of a hard-coded token such as 'A', which raises
# KeyError whenever that token is absent from the trained vocabulary.
len(vectors[vectors.index_to_key[0]])

100

In [20]:
# Copy the model's index-to-token list into a plain Python list and peek at
# the first five entries.
vocabulary = [*vectors.index_to_key]
vocabulary[:5]

['A', 'The', "'s", 'young', 'man']

In [32]:
# Embedding matrix of shape (len(vocabulary), vector_size): row k holds the
# Word2Vec vector of vocabulary[k].
vocab_size = len(vocabulary)
embed_mat = np.zeros((vocab_size, vectors.vector_size))

for row in range(vocab_size):
    embed_mat[row, :] = vectors[vocabulary[row]]

In [40]:
# Invert the vocabulary list into a token -> row-index lookup.
word_to_index = {w: i for i, w in enumerate(vocabulary)}

# Convert each document into vocabulary indices. Tokens that are not in the
# vocabulary (Word2Vec was trained with min_count = 2) are skipped: falling
# back to index 0 would silently turn every unknown token into vocabulary[0],
# which is a real word with its own embedding.
features = [
    [word_to_index[token] for token in entry if token in word_to_index]
    for entry in features_for_gensim
]

In [36]:
# Confirm the conversion: show the first two documents as index sequences.
features[:2]

[[1164,
  504,
  0,
  3,
  19372,
  226,
  2532,
  1992,
  6311,
  36,
  612,
  1137,
  2560,
  672,
  23331,
  253,
  269],
 [9339, 517, 0, 762, 25, 23330, 221, 71, 1065, 1071, 134, 78, 566, 77, 818]]

DATA PREPARATION FOR ML

In [41]:
# Lookup table that assigns every distinct genre an integer code, numbered in
# the order label.unique() returns them.
genre_names = label.unique()
mapper = dict(zip(genre_names, range(len(genre_names))))

In [42]:
# Hold out 20% of the samples for validation, then bring every index sequence
# to exactly 65 entries (zeros appended at the end) so all inputs share one
# shape.
feature_train, feature_test, label_train, label_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
pad_options = dict(padding="post", value=0, maxlen=65)
feature_train_dataset = pad_sequences(feature_train, **pad_options)
feature_test_dataset = pad_sequences(feature_test, **pad_options)

# Encode the labels: genre name -> integer code -> one-hot row of width 10.
train_codes = label_train.map(mapper)
test_codes = label_test.map(mapper)
label_train_dataset = tf.keras.utils.to_categorical(train_codes, 10)
label_test_dataset = tf.keras.utils.to_categorical(test_codes, 10)

BUILDING TENSORFLOW MODEL

In [50]:
# Model: frozen Word2Vec embedding -> Conv1D -> max-pooling -> dense
# classifier head with a 10-way softmax output.
embedding_layer = Embedding(
    input_dim=len(vocabulary),
    output_dim=vectors.vector_size,
    weights=[embed_mat],  # initialise from the pre-trained Word2Vec matrix
    input_length=65,
    trainable=False,  # keep the pre-trained vectors fixed during training
)
text_learner = Sequential([
    embedding_layer,
    tf.keras.layers.Conv1D(65, 5, activation="relu", padding="same"),
    tf.keras.layers.MaxPooling1D(pool_size=3, padding="same"),
    Flatten(),
    Dense(units=256, activation="relu"),
    Dense(units=512, activation="relu"),
    Dense(units=10, activation="softmax"),
])
text_learner.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=["accuracy"],
)
text_learner.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 65, 100)           4236600   
                                                                 
 conv1d (Conv1D)             (None, 65, 65)            32565     
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, 22, 65)            0         
 g1D)                                                            
                                                                 
 flatten_7 (Flatten)         (None, 1430)              0         
                                                                 
 dense_7 (Dense)             (None, 256)               366336    
                                                                 
 dense_8 (Dense)             (None, 512)               131584    
                                                      

In [51]:
# Early stopping with a patience of 5 epochs; no monitor is passed, so the
# callback's default monitored quantity applies.
callb = EarlyStopping(patience=5)

# Train for at most 40 epochs, validating on the held-out split each epoch.
fit_options = dict(
    validation_data=(feature_test_dataset, label_test_dataset),
    epochs=40,
    callbacks=[callb],
    batch_size=64,
    validation_batch_size=512,
)
text_learner.fit(x=feature_train_dataset, y=label_train_dataset, **fit_options)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7c785d5c51b0>

There is an increase in both the training accuracy and the validation accuracy. The following would likely lead to further improvement:
<ul>
<li>Hyperparameter tuning</li>
<li>Trying more layers</li>
<li>Training for more epochs</li>
</ul>