In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import string
import numpy as np
#nltk.download('stopwords')
#nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')

In [2]:
filename = "IMDB Dataset.csv"
df = pd.read_csv(filename)

In [3]:
len(df)

50000

In [4]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


# Data Cleaning

1. Remove html tags
2. Check if stop words are necessary if not remove them
3. remove punctuations

In [5]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()
    return clean_text

df["review"] = df["review"].apply(remove_html_tags)
df.head()



Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# check if we need to remove stopwords or not
#let's check with some commmon words which would be hugely significant in determining the sentiments of reviewrs.
sentiment_words = ["good","bad","worst","lovely","joyful","happy"]
words_present = [word for word in sentiment_words if word in stopwords]
print(words_present)


[]


Looks like we can remove them safely them

In [7]:
#removing punctuation
def remove_punctuations(s):
    s = "".join([i for i in s if i not in string.punctuation])
    return s

df["review"] = df["review"].apply(remove_punctuations)

#Tokenizing into word tokens
df["review"] = df["review"].apply(word_tokenize) 

In [9]:
#removing stop words now
def remove_stopwords(s):
    s = " ".join(each for each in s if each not in stopwords)
    return s
df["review"] = df["review"].apply(remove_stopwords) 

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,1
1,A wonderful little production The filming tech...,1
2,I thought wonderful way spend time hot summer ...,1
3,Basically theres family little boy Jake thinks...,0
4,Petter Matteis Love Time Money visually stunni...,1


In [11]:
#creating numerical representation of labels
def Convert_to_bin(text, remove_digits=True):
  if text=='positive': 
      text= 1   
  else: 
      text=0
  return text

df["sentiment"] = df["sentiment"].apply(Convert_to_bin)

In [13]:
#split data into into train and test sets 
from sklearn.model_selection import train_test_split
X=df['review'].values
Y=df['sentiment'].values
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.3)

Y_train = np.asarray(Y_train).astype('float32').reshape((-1,1))

# Count Vectorizer

In [None]:
### Extracting features using Count Vectorizer

#
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
count_features = count_vectorizer.fit_transform(X_train)

feature_names = count_vectorizer.get_feature_names_out()

x_train =count_features
x_test = count_vectorizer.transform(X_test)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
model = Sequential()
model.add(Dense(16, input_dim=x_train.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='Adam',metrics=['accuracy'])

# Train the model
# Convert your sparse input data to a correctly ordered sparse tensor


model.fit(X_train_sparse, Y_train, batch_size=32, epochs=10, validation_data=(x_test, Y_test))

# Word Embedding

## Traning your own Embedding

In [14]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train) #Fitting a tokenizer on the corpus by considering 10,000 as size of vocavulary but keeping first 10,000 words in a sorted words according to their frequency

In [15]:
#Tokenizing each sentence with the words numerical representation by considering only first 10000 words.
x_train = tokenizer.texts_to_sequences(X_train)
x_test  = tokenizer.texts_to_sequences(X_test)

In [16]:
#We are padding all sentences to a length of max length 100.
vocab = len(tokenizer.word_index) + 1
from tensorflow.keras.preprocessing.sequence import pad_sequences
maxlen = 100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D
from tensorflow.keras.optimizers import Adam
emb_dim=100
model= Sequential()
model.add(Embedding(input_dim=vocab, output_dim=emb_dim, input_length=maxlen))
model.add(MaxPool1D())
model.add(Dense(16,activation="relu"))
model.add(Dense(16,activation="relu"))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          17745600  
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 100)          0         
 )                                                               
                                                                 
 dense (Dense)               (None, 50, 16)            1616      
                                                                 
 dense_1 (Dense)             (None, 50, 16)            272       
                                                                 
 dense_2 (Dense)             (None, 50, 1)             17        
                                                                 
Total params: 17,747,505
Trainable params: 17,747,505
Non-trainable params: 0
____________________________________________

In [None]:
history = model.fit(x_train, Y_train,epochs=35,verbose=True,batch_size=16)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35

In [None]:
#https://towardsdatascience.com/a-guide-to-text-classification-and-sentiment-analysis-2ab021796317 