# NLP Lecture -06: Text Classification
- **Data set Link: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews**

In [1]:
# Importing library
import numpy as np  
import pandas as pd 
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


## i.) Using inbuilt APIs
- Link for NLP API: **https://nlpcloud.com/home/playground/**
- No need to build any model
- Require costs as per uses

## ii) Using Machine Learning
### *Text Vectorization:*
- Bag of words
- n-grams
- TF-IDF  
- Word2Vec

### *Algorithms:*
- Naive Bayes Classifier
- Random Forest Classification
- Support Vector Machine

In [2]:
# Reading dataset
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Checking shape and size
df.shape, df.size

((50000, 2), 100000)

### Pre-Processing

In [4]:
# Making copy of dataset for 1st 10000 reviews
df1 = df.copy()[:10000]
df1.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Checking sample review
df1.review[1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
df1.shape, df1.size
df1['sentiment'].value_counts().reset_index()

Unnamed: 0,sentiment,count
0,positive,5028
1,negative,4972


In [7]:
# Checking null and duplicates
df1.isnull().sum()           # Checking null values
df1.duplicated().sum()       # Checking duplicates
df1.drop_duplicates(inplace=True)    # Dropping duplicates

In [8]:
# 1.Removing HTML tags
import re
def remove_html_tags(text):
    p = re.compile(r'<.*?>')
    return p.sub('', text)

# Applying on dataset
df1['review'] = df1['review'].apply(lambda x: remove_html_tags(x))
df1.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# 2.Changing to lower case
df1['review'] = df1['review'].str.lower()
df1.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [10]:
# 3.Removing Punctuation
spchar = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
def remove_spchar2(text):
    return text.translate(str.maketrans('', '', spchar))

# Applying on dataset
df1['review'] = df1['review'].apply(lambda x: remove_spchar2(x))
df1.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [11]:
# 4.Removing Stop Words
from nltk.corpus import stopwords
sw_list = set(stopwords.words('english'))

df1['review'] = df1['review'].apply(lambda x: ' '.join([item for item in str(x).split() if item.lower() not in sw_list]) if pd.notna(x) else x)
# df1['review'] = df1['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x: " ".join(x))
df1.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [None]:
# 5.Tokenization/Stemming/Lemmatization 


In [12]:
# Creating a copy of pre-processed dataset for further use if required
df2 = df1.copy()
df2.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [13]:
# Separating x and y
x = df1.iloc[:,0:1]
y = df1['sentiment']

# Label encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

In [14]:
# Train/Test/Split
from sklearn.model_selection import train_test_split

x_train,x_test, y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=5)

In [15]:
x_train.shape, x_test.shape

((7986, 1), (1997, 1))

## Using BOW

In [20]:
# Applying BOW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

x_tr_bow = cv.fit_transform(x_train['review']).toarray()
x_te_bow = cv.transform(x_test['review']).toarray()

In [21]:
x_tr_bow.shape, x_te_bow.shape

((7986, 72610), (1997, 72610))

In [22]:
# Applying Gaussian Naive Bayes algorithm
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_tr_bow, y_train)

In [23]:
y_pred1 = gnb.predict(x_te_bow)     

In [25]:
# Evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix

print('Accuracy score:',accuracy_score(y_test, y_pred1))
confusion_matrix(y_test, y_pred1)

Accuracy score: 0.6329494241362043


array([[705, 277],
       [456, 559]])

In [26]:
# Applying Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_tr_bow, y_train)

In [27]:
y_pred2 = rf.predict(x_te_bow)

In [29]:
# Evaluation metrics

print('Accuracy score:',accuracy_score(y_test, y_pred2))
confusion_matrix(y_test, y_pred2)

Accuracy score: 0.8492739108662994


array([[839, 143],
       [158, 857]])

In [31]:
# Applying BOW with maximum features limit
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000)

x_tr_bow = cv.fit_transform(x_train['review']).toarray()
x_te_bow = cv.transform(x_test['review']).toarray()

In [32]:
x_tr_bow.shape, x_te_bow.shape

((7986, 3000), (1997, 3000))

In [33]:
# Applying Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_tr_bow, y_train)

In [34]:
y_pred3 = rf.predict(x_te_bow)

In [36]:
# Evaluation metrics

print('Accuracy score:',accuracy_score(y_test, y_pred3))
confusion_matrix(y_test, y_pred3)

Accuracy score: 0.8432648973460191


array([[842, 140],
       [173, 842]])

## Using ngrams

In [37]:
# Applying BOW and ngrams with maximum features limits
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2), max_features=3000)        # We can do hyper-parameter tunning to get best accuracy score

x_tr_bow = cv.fit_transform(x_train['review']).toarray()
x_te_bow = cv.transform(x_test['review']).toarray()

In [38]:
# Applying Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_tr_bow, y_train)

In [39]:
y_pred4 = rf.predict(x_te_bow)

In [40]:
# Evaluation metrics

print('Accuracy score:',accuracy_score(y_test, y_pred4))
confusion_matrix(y_test, y_pred4)

Accuracy score: 0.8382573860791187


array([[841, 141],
       [182, 833]])

## Using TF-IDF

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

x_tr_tfidf = tfidf.fit_transform(x_train['review']).toarray()
x_te_tfidf = tfidf.transform(x_test['review']).toarray()

In [42]:
# Applying Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_tr_tfidf, y_train)

In [43]:
y_pred5 = rf.predict(x_te_tfidf)

In [44]:
# Evaluation metrics

print('Accuracy score:',accuracy_score(y_test, y_pred5))
confusion_matrix(y_test, y_pred5)

Accuracy score: 0.8512769153730596


array([[845, 137],
       [160, 855]])

## Using Word2Vec
**Note:**
- Working on same dataset
- Either we can use pre-trained Word2Vec model or we can build our own for our dataset
- For pre-trained Word2Vec atleast 80% words should match with our data
- For building own model data should be sufficient
- All the pre-processing steps will be same as above
- For training on our own data to build model **see; NLP_lec_05: Word2Vec**

In [45]:
df2 = df1.copy()
df2.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [46]:
# Importing library
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [47]:
# Tokenization
story = []
for doc in df2['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [48]:
# Model building
model = gensim.models.Word2Vec(
        window = 10,
        min_count = 2)

model.build_vocab(story)

In [49]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(5472614, 5901370)

In [50]:
len(model.wv.index_to_key)

35180

In [51]:
# Defining function
def document_vector(doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [52]:
# Checking 1st review in vector with its dimension
document_vector(df2['review'].values[0])
len(document_vector(df2['review'].values[0]))

100

In [53]:
from tqdm import tqdm
import time
start_time = time.time()

X = []
for doc in tqdm(df2['review'].values):
    X.append(document_vector(doc))

X = np.array(X)

end_time = time.time()
runtime = end_time - start_time

print(f"Runtime: {runtime} seconds")
print(f"Shape of X: {X.shape}")

100%|██████████| 9983/9983 [03:21<00:00, 49.46it/s]

Runtime: 201.88207817077637 seconds
Shape of X: (9983, 100)





In [54]:
# Label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(df2['sentiment'])

In [55]:
# Train/Test/Split
from sklearn.model_selection import train_test_split

X_train,X_test, Y_train,Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

In [56]:
X_train.shape, X_test.shape

((7986, 100), (1997, 100))

In [57]:
# Applying Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, Y_train)

In [58]:
Y_pred = rf.predict(X_test)

In [59]:
# Evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix

print('Accuracy score:',accuracy_score(Y_test, Y_pred))
confusion_matrix(Y_test, Y_pred)

Accuracy score: 0.800200300450676


array([[771, 211],
       [188, 827]])

# NOTE:
- Although the accuracy is not good because we haven't used all the data
- But it is fine to have accuracy of 80%+ with only 10% of the data
- We can also use pre-trained model of the Google
- Do hyper-parameter tunning to get better results

## Practical advice:
- Use Ensemble techniques
- Use Heuristic features(self made features)
- Start with Machine learning then go to Deep learning
- Make sure not working with Imbalanced dataset
- Solve as many project as you can