<a href="https://colab.research.google.com/github/Halix267/Text_classification-with-neutral-networks/blob/master/Amazon_Review_with_Neural_networks(CNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM; the review dataset and the GloVe
# vectors used by later cells are read from paths under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Fetch the pretrained GloVe 6B word vectors (trained on about 6 billion tokens)
# and extract them into the Drive project folder, because the embedding-matrix
# cell reads '/content/gdrive/My Drive/Amazon review/glove.6B.100d.txt'.
#
# Previously wget saved the archive to the working directory while unzip read a
# different file ('glove.6B.zip.1') and extracted into the working directory,
# so a fresh "Restart & Run All" never produced the file the notebook loads.
#
#   wget  -nc : do not download again when the archive is already in Drive
#   wget  -P  : directory to save the archive into
#   unzip -n  : never overwrite files that were already extracted
#   unzip -d  : directory to extract into
!wget -nc -P '/content/gdrive/My Drive/Amazon review' http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n '/content/gdrive/My Drive/Amazon review/glove.6B.zip' -d '/content/gdrive/My Drive/Amazon review'

In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
# Load the Amazon Instant Video reviews; lines=True tells pandas the file holds
# one JSON object per line.
REVIEWS_PATH = '/content/gdrive/My Drive/Amazon review/Amazon_Instant_Video_5.json'
dataset = pd.read_json(REVIEWS_PATH, lines=True)

# Keep only the two columns the model needs: the review headline and the star rating.
Final_data = dataset.reindex(columns=['summary', 'overall'])

In [None]:
# Build the model inputs and targets (the train/test split happens in a later cell).
data = Final_data['summary'].values      # Data: the review headline text

# Map the star rating to a sentiment class id:
#   overall < 3  -> 0 (negative)
#   overall == 3 -> 1 (neutral; also the fallback for any other value, e.g. NaN)
#   overall > 3  -> 2 (positive)
# np.select evaluates both conditions on the whole column at once, replacing the
# row-by-row iterrows() loop and the intermediate string labels.
rating = Final_data['overall']
Final_data['sentiment'] = np.select([rating > 3, rating < 3], [2, 0], default=1)

# NOTE: the 'overall' column is deliberately kept. The earlier
# `Final_data.drop('overall', axis=1)` threw away its return value, so it never
# removed the column anyway; keeping it also lets this cell be re-run safely.

label = Final_data['sentiment'].values    # Targets: class ids 0 / 1 / 2




In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Hold out 10% of the reviews for validation; the fixed seed keeps the split reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(data,label,test_size=0.10,random_state=42)

# Learn the vocabulary from the training text only. num_words=1000 restricts the
# encoded output to the most frequent words (indices below 1000).
tokenize=Tokenizer(num_words=1000)
tokenize.fit_on_texts(X_train)

# Encode every review as the ordered list of its word indices.
# This used to call texts_to_matrix, which returns a (n_samples, num_words)
# bag-of-words matrix; pad_sequences then cut that down to its last `max_len`
# columns, so the Embedding layer received 0/1 flags instead of word indices
# and the network could not learn anything beyond the majority class.
X_train=tokenize.texts_to_sequences(X_train)
X_test=tokenize.texts_to_sequences(X_test)

# Number of rows needed in the embedding matrix (+1 because index 0 is reserved).
vocab_size=len(tokenize.word_index) + 1

# Equalise the length of every review: shorter ones are zero-padded at the end,
# longer ones are truncated to max_len tokens.
max_len=10

X_train=pad_sequences(X_train,padding='post',maxlen=max_len)
X_test=pad_sequences(X_test,padding='post',maxlen=max_len)

# One-hot encode the three sentiment classes for categorical cross-entropy.
Y_train=tf.one_hot(Y_train,depth=3)
Y_test=tf.one_hot(Y_test,depth=3)

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a GloVe-style text file.

    Each non-blank line of the file is expected to hold a word followed by its
    vector components, separated by whitespace.

    Args:
        filepath: Path to the UTF-8 encoded embedding text file.
        word_index: Mapping from word to its integer row index.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A float array of shape (len(word_index) + 1, embedding_dim). Row
        ``word_index[w]`` holds the first ``embedding_dim`` components of the
        vector for ``w``; rows for words missing from the file (and row 0)
        stay all-zero.

    Raises:
        ValueError: If a matched word has fewer than ``embedding_dim`` components.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(filepath, encoding='utf8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines (e.g. a trailing newline)

            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue  # skip parsing vectors we are not going to store

            if len(vector) < embedding_dim:
                raise ValueError(
                    "vector for %r has %d components, expected at least %d"
                    % (word, len(vector), embedding_dim))

            # Slice before converting so only the kept components are parsed.
            embedding_matrix[word_index[word]] = np.array(
                vector[:embedding_dim], dtype=np.float32)

    return embedding_matrix

# Use the full width of the pretrained vectors: the file loaded below is the
# 100-dimensional GloVe set, and embedding_dim=10 kept only the first 10 of
# those 100 components, discarding most of the pretrained representation.
embedding_dim=100
embedding_matrix = create_embedding_matrix(
     '/content/gdrive/My Drive/Amazon review/glove.6B.100d.txt',
     tokenize.word_index, embedding_dim)

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Flatten,MaxPooling1D,Conv1D,Dropout

#model

# 1D-CNN sentiment classifier built layer by layer on top of frozen pretrained embeddings.
model = Sequential()

# Look up each word index in the pretrained matrix; trainable=False keeps it fixed.
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))
model.add(Dropout(0.5))

# First convolution + pooling stage over windows of 3 consecutive tokens.
model.add(Conv1D(16, kernel_size=3, activation='relu', padding='valid'))
model.add(MaxPooling1D())

# Second convolution + pooling stage.
model.add(Conv1D(16, kernel_size=3, activation='relu', padding='valid'))
model.add(MaxPooling1D())

# Dense classification head: one softmax output per sentiment class.
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 10, 10)            113190    
_________________________________________________________________
dropout_16 (Dropout)         (None, 10, 10)            0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 8, 16)             496       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 4, 16)             0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 2, 16)             784       
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 1, 16)             0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 16)               

In [None]:
# Train for 50 epochs in mini-batches of 10, evaluating on the held-out split
# after every epoch; verbose=2 prints one summary line per epoch.
fit_options = dict(
    validation_data=(X_test, Y_test),
    batch_size=10,
    epochs=50,
    verbose=2,
)
history = model.fit(X_train, Y_train, **fit_options)

Epoch 1/50
3342/3342 - 15s - loss: 0.6711 - accuracy: 0.7887 - val_loss: 0.6374 - val_accuracy: 0.8002
Epoch 2/50
3342/3342 - 15s - loss: 0.6656 - accuracy: 0.7891 - val_loss: 0.6406 - val_accuracy: 0.8002
Epoch 3/50
3342/3342 - 14s - loss: 0.6648 - accuracy: 0.7890 - val_loss: 0.6384 - val_accuracy: 0.8002
Epoch 4/50
3342/3342 - 14s - loss: 0.6643 - accuracy: 0.7891 - val_loss: 0.6369 - val_accuracy: 0.8002
Epoch 5/50
3342/3342 - 14s - loss: 0.6644 - accuracy: 0.7891 - val_loss: 0.6374 - val_accuracy: 0.8002
Epoch 6/50
3342/3342 - 14s - loss: 0.6635 - accuracy: 0.7891 - val_loss: 0.6388 - val_accuracy: 0.8002
Epoch 7/50
3342/3342 - 14s - loss: 0.6635 - accuracy: 0.7890 - val_loss: 0.6389 - val_accuracy: 0.8002
Epoch 8/50
3342/3342 - 14s - loss: 0.6631 - accuracy: 0.7891 - val_loss: 0.6391 - val_accuracy: 0.8002
Epoch 9/50
3342/3342 - 14s - loss: 0.6629 - accuracy: 0.7890 - val_loss: 0.6373 - val_accuracy: 0.8002
Epoch 10/50
3342/3342 - 14s - loss: 0.6630 - accuracy: 0.7891 - val_loss:

In [None]:
import matplotlib.pyplot as plt


def _plot_history_curves(train_key, val_key, title, y_label):
    """Draw one figure with the training and validation curves of a logged metric.

    Reads the per-epoch values from the global ``history`` object returned by
    ``model.fit`` and shows the figure immediately.
    """
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Training', 'Validation'], loc='upper right')
    plt.show()


# Loss curve first, then the accuracy curve, each in its own figure.
_plot_history_curves('loss', 'val_loss', 'Loss vs. epochs', 'Loss')
_plot_history_curves('accuracy', 'val_accuracy', 'Accuracy vs. epochs', 'Accuracy')
