# Project Requirements

- **Imports**

In [1]:
import re
import nltk
import numpy as np
import pandas as pd 
from warnings import filterwarnings
from IPython.display import clear_output
filterwarnings("ignore")

# Read Data

In [2]:
# Load the dataset; later cells read its `Tweet` (text) and `rate` (label) columns.
df = pd.read_csv("data.csv")

# Data Preprocessing

### 1) Text processing:

- **Text Cleaning**
    - Use the `re` package to remove hashtags, mentions, hyperlinks, punctuation (`?`, `!`, `.`, `,`) and standalone numbers

In [5]:
def clean(tweet):
    """Strip Twitter-specific noise from a single tweet.

    The steps run in this order:
      1. hashtags and mentions (``#tag`` / ``@user``) are deleted;
      2. ``http://`` / ``https://`` links are deleted;
      3. runs of ``?`` / ``!`` are replaced by a single space;
      4. numbers that have whitespace on both sides are deleted;
      5. runs of ``.`` / ``,`` are deleted;
      6. leading and trailing whitespace is trimmed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # Raw strings hand the regex escapes (\w, \S, \s, \d) to `re` untouched;
    # in plain string literals Python flags them as invalid escape sequences.
    tweet = re.sub(r"(#|@)\w*", "", tweet)  # \w: word characters (letters, digits, underscore)
    tweet = re.sub(r"https?://\S+", "", tweet)
    tweet = re.sub(r"(\?|!)+", " ", tweet)
    # Look-ahead instead of consuming the trailing whitespace: the space after
    # the number survives, so the neighbouring words stay separated and a run
    # of consecutive numbers ("a 1 2 b") is removed completely.
    tweet = re.sub(r"\s\d+(?=\s)", "", tweet)
    tweet = re.sub(r"(\.|\,)+", "", tweet)
    return tweet.strip()

In [6]:
def text_processing(tweets):
    """Run ``clean`` over every tweet and return the results as a new list, in input order."""
    cleaned = []
    for raw_tweet in tweets:
        cleaned.append(clean(raw_tweet))
    return cleaned

- **Apply Text preprocessing**

In [8]:
# Clean every entry of the `Tweet` column; `tweets` ends up as a plain list of cleaned strings.
tweets = text_processing(df.Tweet)

In [11]:
texts = tweets  # cleaned tweet strings: the model input
labels = df.rate.values  # values of the `rate` column: the training target

# Tokenize the texts

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

- **Build the tokenizer**

In [19]:
# Create a tokenizer with default settings and fit it on the cleaned tweets,
# which builds the word -> integer index mapping (tokenizer.word_index).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

- **Tokenize:** convert each text to a sequence of integer word indices

In [25]:
# One list of integer word indices per tweet; the lists are not yet of equal length.
sequences = tokenizer.texts_to_sequences(texts)

In [26]:
# Number of token sequences (one per tweet), shown as a 1-tuple.
# len() is used because the sequences have different lengths: np.shape would
# first have to turn the ragged list into an array, which NumPy >= 1.24
# rejects with a ValueError.
(len(sequences),)

(10000,)

# Pad sequences to have the same length

In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
# maxlen is the optional maximum length of all sequences. With maxlen=None,
# every sequence is padded to the length of the longest individual sequence.
padded_sequences = pad_sequences(sequences, maxlen=None)

In [30]:
# (number of sequences, padded sequence length)
padded_sequences.shape

(10000, 31)

# Model

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [33]:
embedding_dim = 16  # length of the vector learned for each word
# +1 on top of the fitted vocabulary size — presumably because tokenizer word
# indices start at 1 and index 0 is the padding value; confirm in the Keras docs.
vocab_size = len(tokenizer.word_index) + 1
# Inspect tokenizer.word_index to see the word -> index mapping.

In [34]:
# Stack: word-index embedding -> SimpleRNN with 64 units -> one sigmoid output unit.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=padded_sequences.shape[1],
    ),
    SimpleRNN(64),
    Dense(1, activation='sigmoid'),
])

In [35]:
# Adam optimizer with binary cross-entropy loss (the model ends in a single
# sigmoid unit); accuracy is reported alongside the loss during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs in mini-batches of 32 (the Keras default).
# batch_size=1 would run one optimizer step per sample, i.e.
# len(padded_sequences) steps per epoch, which is very slow and gives
# noisy gradient estimates.
model.fit(padded_sequences, labels, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10

# Prediction

In [77]:
# Example input for inference: a list holding one raw text.
X = ["I loved it!"]

In [78]:
# Clean the new texts with the same text_processing step applied to the
# training tweets, so the tokenizer sees inference text in the same form as
# the text it was fitted on (hashtags, mentions, links, etc. removed).
new_sequences = tokenizer.texts_to_sequences(text_processing(X))
# Pad to the sequence length used for the training data.
new_padded_sequences = pad_sequences(new_sequences, maxlen=padded_sequences.shape[1])

In [79]:
# Run the model on the padded example; the result is indexed below as [row, 0].
predictions = model.predict(new_padded_sequences)



In [80]:
# Threshold the model output for the first text at 0.5.
# NOTE(review): assumes label 1 in `rate` means positive — confirm against the data.
"Positive" if predictions[0, 0] > .5 else "Negative"

'Negative'

In [81]:
# Show the raw model output behind the label above.
predictions

array([[0.4579138]], dtype=float32)