<a href="https://colab.research.google.com/github/IT21174230/ML-Journey/blob/main/CNN_for_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

dataset='/content/drive/MyDrive/AmazonReview.csv'
data=pd.read_csv(dataset)

data.head()

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,1
1,This case takes so long to ship and it's not e...,1
2,Good for not droids. Not good for iPhones. You...,1
3,The cable was not compatible between my macboo...,1
4,The case is nice but did not have a glow light...,1


In [2]:
data.dropna(inplace=True)
data = data.dropna(subset=['Review'])

In [3]:

data = data.sample(frac = 1)

In [4]:
reviews=[]

import re

for i in data['Review']:
  j=re.sub(r'[^a-zA-Z0-9\s]', '', i)
  reviews.append(j.lower())

print(reviews[10:15])

['this is not worth it  your kindle doesnt glare until you put this product on than you cant read  save your money', 'came apart on a hot daylong trip  otherwise it was good while it lasted', 'has held up nice does not fit as well as i think it should it is so slim you cant fit the cord into it but i might be the only person that wants both to fit into the case', 'at half the price of the original case i wasnt expecting much  no one was more surprised when the signature leather folio case arrived  comparing it to the original highdollar cover the signature version was more attractive and had the added bonus of an inside pocket  the supple material makes using the kindle just that much more enjoyable  purchased as just an extra cover it  immediately became the primary  a really good bargain', 'i thought thought it said would fit a simple touch nook but when i received it it wasnt the correct size we made the sticker fit by cutting it to size but book case was too big it fits my nook but

In [5]:
from keras.preprocessing.text import Tokenizer

tokenizer=Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews)

review_embed=tokenizer.texts_to_sequences(reviews)


In [6]:
reviews_lens=[]
for i in review_embed:
  reviews_lens.append(len(i))

maxl=max(reviews_lens)
print(maxl)

2741


In [7]:
from keras.preprocessing.sequence import pad_sequences

X=pad_sequences(review_embed, padding='post', maxlen=maxl)

In [8]:
y=data['Sentiment']

from keras.utils import to_categorical

y=y-1
y= to_categorical(y, 5)
print(y)

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.25, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)



(18749, 2741) (18749, 5) (6250, 2741) (6250, 5)


In [10]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


def get_model():

  model=Sequential([
      Embedding(input_dim=5000, output_dim=100, input_length=maxl),
      Conv1D(16, 5, activation='relu'),
      GlobalMaxPooling1D(),
      Dense(32, activation='relu'),
      Dropout(0.4),
      Dense(5, activation='softmax')
  ])

  model.compile(
        Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

  early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

  return model, early_stopping


In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
import os

folds=5
cv_acc=[]
trained_models=[]
kf=StratifiedKFold(n_splits=folds, shuffle=False)

for i,(train_index, val_index) in enumerate(kf.split(X_train, y_train.argmax(1))):
  print(train_index)
  X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
  y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

  model, early_stop=get_model()

  model.fit(X_train_fold, y_train_fold, batch_size=32, epochs=10, validation_data=(X_val_fold, y_val_fold), callbacks=[early_stop])
  y_val_pred = model.predict(X_val_fold)
  accuracy = accuracy_score(np.argmax(y_val_fold, axis=1), np.argmax(y_val_pred, axis=1))
  cv_acc.append(accuracy)

  model_filename = f'model_fold_{i+1}.h5'
  model.save(model_filename)
  trained_models.append(model_filename)


[ 3712  3715  3723 ... 18746 18747 18748]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


  saving_api.save_model(


[    0     1     2 ... 18746 18747 18748]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


  saving_api.save_model(


[    0     1     2 ... 18746 18747 18748]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
[    0     1     2 ... 18746 18747 18748]


  saving_api.save_model(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


  saving_api.save_model(


[    0     1     2 ... 15078 15083 15094]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


  saving_api.save_model(


In [14]:
print(cv_acc)

[0.4776, 0.49546666666666667, 0.4856, 0.4650666666666667, 0.48893038143504935]


In [15]:
from keras.models import load_model

print(f'average validation accuracy score:{np.mean(cv_acc)}')

most_acc=trained_models[np.argmax(cv_acc)]
most_acc_model=load_model(most_acc)

average validation accuracy score:0.4825327429536766


In [16]:


y_pred=most_acc_model.predict(X_test)

test_labels=np.argmax(y_test, axis=1) + 1
pred_labels=np.argmax(y_pred, axis=1) + 1

print(test_labels)
print(pred_labels)

accuracy=accuracy_score(test_labels, pred_labels)

print(f'testing accuracy percentage = {accuracy}')


[5 2 3 ... 1 1 1]
[5 1 3 ... 1 1 3]
testing accuracy percentage = 0.47872
