In [2]:


import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
import string
# Any results you write to the current directory are saved as output.

In [None]:
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

Getting The Data From The Sources

In [3]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('test.ft.txt.bz2')

In [7]:
train_df=pd.DataFrame(list(zip(train_labels, train_texts)),columns =['Labels', 'Text'])
train_df

Unnamed: 0,Labels,Text
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
3599995,0,Don't do it!!: The high chair looks great when...
3599996,0,"Looks nice, low functionality: I have used thi..."
3599997,0,"compact, but hard to clean: We have a small ho..."
3599998,0,what is it saying?: not sure what this book is...


In [8]:
test_df=pd.DataFrame(list(zip(test_labels, test_texts)),columns =['Labels', 'Text'])
test_df

Unnamed: 0,Labels,Text
0,1,Great CD: My lovely Pat has one of the GREAT v...
1,1,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ...: I bought thi...
3,1,"works fine, but Maha Energy is better: Check o..."
4,1,Great for the non-audiophile: Reviewed quite a...
...,...,...
399995,0,Unbelievable- In a Bad Way: We bought this Tho...
399996,0,"Almost Great, Until it Broke...: My son reciev..."
399997,0,Disappointed !!!: I bought this toy for my son...
399998,1,Classic Jessica Mitford: This is a compilation...


In [11]:
pos = []
neg = []
for l in train_df.Labels:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)

In [12]:
train_df['Pos']= pos
train_df['Neg']= neg

In [13]:
train_df.head()

Unnamed: 0,Labels,Text,Pos,Neg
0,1,Stuning even for the non-gamer: This sound tra...,1,0
1,1,The best soundtrack ever to anything.: I'm rea...,1,0
2,1,Amazing!: This soundtrack is my favorite music...,1,0
3,1,Excellent Soundtrack: I truly like this soundt...,1,0
4,1,"Remember, Pull Your Jaw Off The Floor After He...",1,0


Cleaning(1)

In [16]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', text)
    return text_nopunct

train_df['Text_Clean'] = train_df['Text'].apply(lambda x: remove_punct(x))

Cleaning The Text Data

In [17]:
import re
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

Splitting The Data Into Train And Validation Sets

In [18]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, random_state=57643892, test_size=0.15)

tokenising The top 15000 words using keras tokeniser
We fit it on the train set and apply to the other sets

In [19]:
MAX_FEATURES = 15000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

we'll find the largest text and pad all the texts to that maximum length in order to make the dataset uniform

In [20]:
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

From tensorflow.keras a model has been built, a convulational network.
There are many layers included in this network.
The model summary given below details the network.

In [21]:
def build_model():
    sequences = layers.Input(shape=(MAX_LENGTH,))
    embedded = layers.Embedding(MAX_FEATURES, 128)(sequences)
    x = layers.Conv1D(128, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    x = layers.Conv1D(128, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    x = layers.Conv1D(128, 5, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)
    x = layers.Conv1D(128, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Flatten()(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model
    
model = build_model()

In [22]:
model.fit(
    train_texts, 
    train_labels, 
    batch_size=256,
    epochs=4,
    validation_data=(val_texts, val_labels), )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x22c4586a548>

In [23]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 255)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 255, 128)          1920000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 253, 128)          49280     
_________________________________________________________________
batch_normalization_1 (Batch (None, 253, 128)          512       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 84, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 80, 128)           82048     
_________________________________________________________________
batch_normalization_2 (Batch (None, 80, 128)           512   

In [24]:
preds = model.predict(test_texts)
print('Accuracy on the test set: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score on the test set: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy on the test set: 0.9531
F1 score on the test set: 0.953
ROC AUC score: 0.9888


In [52]:
1 * (preds > 0.5)
test_labels

array([1, 1, 0, ..., 0, 1, 0])

In [28]:
test_texts

array([[   0,    0,    0, ...,   14,   12,  909],
       [   0,    0,    0, ...,  369,  163,    6],
       [   0,    0,    0, ...,   93, 3787,  437],
       ...,
       [   0,    0,    0, ..., 1593,   11,  442],
       [   0,    0,    0, ..., 3073,    5,   52],
       [   0,    0,    0, ...,    5,  203, 1331]])

In [31]:
new_df=pd.read_csv('amazon.csv')

In [32]:
new_df

Unnamed: 0.1,Unnamed: 0,tweet_id,text,Sentiment
0,0,1.270000e+18,"Yep, @amazon real easy to cancel now. I suppor...",-1.0
1,1,1.270000e+18,@amazon disgusting. Real classy Amazon. @nyp...,-1.0
2,2,1.270000e+18,Check out this Amazon deal: #SamsungGalaxy S10...,0.0
3,3,1.270000e+18,RT @KennethCFilson: A Ripple In The Darkness b...,0.0
4,4,1.270000e+18,@midgets_levil @krokdrib @Jaxx702 @Marir65Ruiz...,0.0
...,...,...,...,...
3304,3327,1.270000e+18,RT @CaveManMike1: We need All Police Supporter...,-1.0
3305,3328,1.270000e+18,@chessiblogs @kanwaldeep_k You just have to re...,1.0
3306,3329,1.270000e+18,RT @Dreamwale90: We are currently on number 2 ...,1.0
3307,3330,1.270000e+18,RT @mkjmc: Win a $500 @Amazon gift card from @...,1.0


In [33]:
new_texts = normalize_texts(new_df['text'])


In [34]:
new_texts

['yep   amazon real easy to cancel now  i support all people  not blm rioters   hope all will cancel like i did ',
 ' amazon disgusting  real classy amazon     nypost  njspba https   t co tpgrnonucq',
 'check out this amazon deal   samsunggalaxy s10 lite new unlocked android cell phone   1gb of storage   gsm  amp  cdma   https   t co g0ltexoer',
 'rt  kennethcfilson  a ripple in the darkness by kenneth c  filson   kindleunlimited  horror  revenge   camping      witch            ghost    love    ',
 ' midgetslevil  krokdrib  jaxx0  marirruiz  gregmusselwhit  usmcmil0  dallasneedslung  amazon  pepsi   https   t co atchvlalhu',
 '  gt  ibm will no longer offer  develop  or research  facialrecognition technology  arvindkrishna  verge  nist  amazon   https   t co nzxspqdm',
 ' flipkartsupport on amazon i am also using fun zone  daily quiz   karigar   saheli   and  fortunewheel   occassion   https   t co kpxhknsnn',
 'rt  lindsayromantic   romance  paperback  feelgoodfiction a summer bewitch

In [35]:
new_texts = tokenizer.texts_to_sequences(new_texts)

In [36]:
new_texts = pad_sequences(new_texts, maxlen=MAX_LENGTH)

In [37]:
new_df['Sent2']=(new_df['Sentiment']+1)/2

In [38]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,Sentiment,Sent2
0,0,1.27e+18,"Yep, @amazon real easy to cancel now. I suppor...",-1.0,0.0
1,1,1.27e+18,@amazon disgusting. Real classy Amazon. @nyp...,-1.0,0.0
2,2,1.27e+18,Check out this Amazon deal: #SamsungGalaxy S10...,0.0,0.5
3,3,1.27e+18,RT @KennethCFilson: A Ripple In The Darkness b...,0.0,0.5
4,4,1.27e+18,@midgets_levil @krokdrib @Jaxx702 @Marir65Ruiz...,0.0,0.5


In [88]:
preds_new = model.predict(new_texts)

In [89]:
preds_new[:10]

array([[0.52317655],
       [0.2707116 ],
       [0.39963853],
       [0.61424214],
       [0.5699257 ],
       [0.2879994 ],
       [0.84445506],
       [0.59338176],
       [0.11698467],
       [0.5699257 ]], dtype=float32)

In [48]:
preds2=preds_new

In [49]:
for i in range(len(preds2)):
    if preds2[i]>0.3 and preds2[i]<0.7:
        preds2[i]=0.5
    elif preds2[i]<=0.3:
        preds2[i]=0.0
    elif preds2[i]>=0.7:
        preds2[i]=1.0

In [58]:
arr_sent=new_df.Sent2.to_numpy()

In [85]:
np.shape(arr_sent)
#type(preds2)np.shape(arr_sent)
#np.shape(preds2)


(3309,)

In [78]:
np.reshape(preds2,np.shape(arr_sent))

array([0.5, 0. , 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)

In [86]:
accuracy_score(arr_sent, 1*preds2)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [82]:

print('Accuracy on the test set: {:0.4}'.format(accuracy_score(arr_sent, preds2)))
#print('F1 score on the test set: {:0.4}'.format(f1_score(new_df['Sent2'], preds)))
#print('ROC AUC score: {:0.4}'.format(roc_auc_score(new_df['Sent2'], preds)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').