In [2]:
! pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


# Downloading Dataset
<h3>Dataset: <a href='https://www.kaggle.com/datasets/bittlingmayer/amazonreviews'>bittlingmayer/amazonreviews on Kaggle</a></h3>

In [1]:
import opendatasets as od
# Fetch the Kaggle dataset; opendatasets skips the download when the files
# are already present locally.
dataset_url = 'https://www.kaggle.com/datasets/bittlingmayer/amazonreviews'
od.download(dataset_url)

Skipping, found downloaded files in "./amazonreviews" (use force=True to force download)


# Importing required libraries

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM , Dense , Embedding
import bz2
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# NLTK resources used further down: the stop-word list, the tokenizer models
# and WordNet for the lemmatizer. Recent NLTK releases load 'punkt_tab' in
# word_tokenize, older ones load 'punkt', so both are requested.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Register progress_apply on pandas objects so long transforms show a progress bar.
tqdm.pandas()
# A set gives constant-time membership tests; clean() checks every token of
# every review against this collection.
stop_words = set(stopwords.words('english'))
lemitizer = WordNetLemmatizer()

In [None]:
# nltk.download()

In [5]:
def get_label_text(file):
    """Read a bz2-compressed review file into parallel label and text lists.

    Every non-blank line is expected to start with a 9-character prefix
    (``__label__`` in this dataset) followed by a single label digit and
    then the review text.

    Args:
        file: Path (or file object) of the ``.bz2`` archive to read.

    Returns:
        A ``(labels, reviews)`` tuple of equal-length lists. ``labels`` holds
        the label digit minus one as ``int``; ``reviews`` holds the remaining
        text of each line with surrounding whitespace stripped.

    Raises:
        ValueError: If the character at the label position is not a digit.
    """
    label_pos = len("__label__")  # index of the label digit on each line
    labels , reviews = [] , []
    # The context manager closes the archive even when parsing fails part-way.
    with bz2.BZ2File(file) as archive:
        for line in tqdm(archive):
            x = line.decode("utf-8")
            if not x.strip():
                continue  # tolerate blank lines instead of raising IndexError
            labels.append(int(x[label_pos]) - 1)
            reviews.append(x[label_pos + 1:].strip())
    return labels , reviews



In [6]:
# Parse the compressed training file into parallel label / review lists.
train_archive = 'amazonreviews/train.ft.txt.bz2'
labels, reviews = get_label_text(train_archive)


3600000it [01:42, 34985.55it/s]


# Using only the first 50% of the training data

In [9]:
# Fraction of the parsed samples to keep; the slice takes them from the front.
size = 0.5
cut = int(size * len(labels))
labels, reviews = labels[:cut], reviews[:cut]

In [10]:
# Two-column frame: one row per review, with its label alongside.
data = pd.DataFrame.from_dict(
    {
        'labels': labels,
        'reviews': reviews,
    }
)

In [11]:
# Peek at the first rows of the raw frame.
data.head(n=5)

Unnamed: 0,labels,reviews
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


# Cleaning the data

In [12]:
def clean(text):
  """Normalise one raw review into a space-joined string of lemmatised tokens.

  Steps, in order: lower-case; drop digits; drop the literal ``@mention``
  token; drop http(s) URLs, ``www.`` addresses and bare ``name.com`` domains;
  delete a fixed set of punctuation characters; tokenise with NLTK; remove
  tokens found in ``stop_words``; lemmatise what remains.

  Args:
    text: The raw review text (``str``).

  Returns:
    The cleaned review as one string with tokens separated by single spaces.
  """
  text = text.lower()
  text = re.sub(r'[0-9]+', '', text)
  text = re.sub(r'@mention', ' ', text)
  text = re.sub(r'https?:\/\/\S+', ' ', text)
  # The earlier pattern escaped the bracket ("www.\[a-z]?"), so it only matched
  # a literal "[a-z" and left "www" behind. The trailing \b keeps words such as
  # "nice.comfortable" from losing their "nice.com" prefix.
  text = re.sub(r"www\.\S+|[a-z]+\.com\b", ' ', text)
  # Punctuation is deleted (not replaced by a space), so "don't" becomes "dont".
  text = re.sub(r"[_\,\>\(\-:\)\\\/\!\.\^\!\:\];='#]", '', text)
  tokens = nltk.word_tokenize(text)
  tokens = [lemitizer.lemmatize(w) for w in tokens if w not in stop_words]
  return ' '.join(tokens)

In [13]:
# Run clean() over every review with a progress bar. The earlier call named
# remove_special_characters, which is never defined in this notebook and
# raises NameError on a fresh kernel.
data['reviews'] = data['reviews'].progress_apply(clean)

100%|██████████| 1800000/1800000 [17:11<00:00, 1744.89it/s]


In [14]:
# Peek at the first rows after cleaning.
data.head(n=5)

Unnamed: 0,labels,reviews
0,1,stuning even nongamer sound track beautiful pa...
1,1,best soundtrack ever anything im reading lot r...
2,1,amazing soundtrack favorite music time hand in...
3,1,excellent soundtrack truly like soundtrack enj...
4,1,remember pull jaw floor hearing youve played g...


# Saving the cleaned data so that I don't have to start over if an error occurs

In [15]:
# Checkpoint the cleaned frame to disk (the row index is written as well,
# since to_csv defaults to index=True).
data.to_csv(path_or_buf='50_per_train_cleaned_data.csv')

# Deleting variables to free up RAM

In [16]:
# The raw lists now live inside `data`; drop the standalone names.
del labels
del reviews

In [17]:
# Size of the hashing space handed to one_hot() and the input dimension of the
# Embedding layer.
VOCAB_SIZE = 10000
# Length every encoded review is padded or truncated to by pad_sequences().
MAX_SENT_LEN = 200

In [18]:
# Turn each cleaned review into a list of integer word indices.
# NOTE(review): Keras documents one_hot as hashing with Python's built-in
# hash(), which is not stable across runs, so these indices are only
# meaningful inside the current kernel session.
one_hot_reviews = [
    one_hot(review_text, VOCAB_SIZE)
    for review_text in data['reviews']
]

In [19]:
# Left-pad the encoded reviews so each one has exactly MAX_SENT_LEN entries.
paded_reviews = pad_sequences(
    one_hot_reviews,
    maxlen=MAX_SENT_LEN,
    padding='pre',
)

In [20]:
# The padded array supersedes the ragged index lists; release the name.
del one_hot_reviews

In [21]:
# Output dimension of the Embedding layer in the model defined below.
EMBEDDING_VECTOR_FEATURES = 100

# Building a very simple model (Embedding → LSTM → Dense)

In [22]:
# Embedding -> one LSTM layer -> single sigmoid unit for the binary label.
embedding_layer = Embedding(VOCAB_SIZE, EMBEDDING_VECTOR_FEATURES, input_length=MAX_SENT_LEN)
recurrent_layer = LSTM(100)
output_layer = Dense(units=1, activation='sigmoid')

model = Sequential([embedding_layer, recurrent_layer, output_layer])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
# model.summary()

In [24]:
# pad_sequences already returns an ndarray, so asarray reuses that buffer
# instead of copying the whole padded matrix a second time.
x = np.asarray(paded_reviews)
y = np.asarray(data['labels'])
print(x.shape, y.shape)

(1800000, 200) (1800000,)


In [25]:
# x and y carry everything needed from here on; drop the older names.
del paded_reviews
del data

In [26]:
# Hold out 20% of the samples; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.2,
)

In [27]:
# Only the split arrays are used for training; release the full ones.
del x
del y

# Training the model (the heavy lifting)

In [28]:
# Keep the History object so the per-epoch loss/accuracy stay available after
# training, instead of only echoing its repr as the cell output.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb0da048730>

# Moment of truth: predicting on new reviews

In [57]:
# Push one hand-written review through the same clean -> encode -> pad steps
# used for training, then threshold the sigmoid output at 0.5.
text = clean("This computer is amazing despite it has only 26 keys and no mouse at all it tells me to press f and i do it make a strange sound . intresting part is aliens comes that night and give superpower to a mental kid")
temp_review = [text]
one_hot_review = [one_hot(review, VOCAB_SIZE) for review in temp_review]
paded_review = pad_sequences(one_hot_review, maxlen=MAX_SENT_LEN, padding='pre')
probability = model.predict(paded_review)
res = (probability > 0.5).astype("int32")
print(res)
# res has a single entry; map its 0/1 value to the message.
print({0: 'sentiment is negative', 1: 'sentiment is positive'}[int(res[0][0])])


[[1]]
sentiment is positive


In [56]:
# Push one hand-written review through the same clean -> encode -> pad steps
# used for training, then threshold the sigmoid output at 0.5.
text = clean("Product is not working after single use when i throw on my enemy it explod and kills him as expected but it did not work again on his dog")
temp_review = [text]
one_hot_review = [one_hot(review, VOCAB_SIZE) for review in temp_review]
paded_review = pad_sequences(one_hot_review, maxlen=MAX_SENT_LEN, padding='pre')
probability = model.predict(paded_review)
res = (probability > 0.5).astype("int32")
print(res)
# res has a single entry; map its 0/1 value to the message.
print({0: 'sentiment is negative', 1: 'sentiment is positive'}[int(res[0][0])])

[[0]]
sentiment is negative
