In [1]:
import tensorflow
import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation,Dropout,SpatialDropout1D,Bidirectional,LSTM,SimpleRNN
from tensorflow.keras.layers import Conv1D,MaxPooling1D,GlobalAveragePooling1D,GlobalMaxPooling1D

import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer

In [2]:

train_file_path = 'C:/Users/gouth/Downloads/train_150k.txt'
test_file_path = 'C:/Users/gouth/Downloads/test_62k.txt'

with open(train_file_path, 'r', encoding='utf-8') as file:
    data = file.readlines()

with open(test_file_path, 'r', encoding='utf-8') as file:
    test_data = file.readlines()


# Splitting the data into 'sentiment' and 'text' columns
data_split = [line.strip().split('\t') for line in data]

# Creating a DataFrame
df = pd.DataFrame(data_split, columns=['sentiment', 'text'])

# Splitting the data into 'sentiment' and 'text' columns
data_split = [line.strip().split('\t') for line in test_data]

# Creating a DataFrame
test_df = pd.DataFrame(data_split, columns=['sentiment', 'text'])

# Saving the DataFrame to a CSV file
df.head()

Unnamed: 0,sentiment,text
0,0,Starting back at work today Looks like it'l...
1,1,Sugar levels dropping... munchies setting in. ...
2,1,@karineb22 yeah!!! have a great summer break!
3,1,hannah montana was very good. now going to re...
4,1,"@Mayra326 aww, have fun! I just had my 3D las..."


In [3]:
mapping = {'1': 1, '0': 0}

df['sentiment'] = df['sentiment'].map(mapping)

df.head()



Unnamed: 0,sentiment,text
0,0,Starting back at work today Looks like it'l...
1,1,Sugar levels dropping... munchies setting in. ...
2,1,@karineb22 yeah!!! have a great summer break!
3,1,hannah montana was very good. now going to re...
4,1,"@Mayra326 aww, have fun! I just had my 3D las..."


In [4]:
test_df['sentiment'] = test_df['sentiment'].map(mapping)

df.head()


Unnamed: 0,sentiment,text
0,0,Starting back at work today Looks like it'l...
1,1,Sugar levels dropping... munchies setting in. ...
2,1,@karineb22 yeah!!! have a great summer break!
3,1,hannah montana was very good. now going to re...
4,1,"@Mayra326 aww, have fun! I just had my 3D las..."


In [5]:
df.sentiment.value_counts()


0    75019
1    74966
Name: sentiment, dtype: int64

In [6]:
df.isna().sum()


sentiment    0
text         0
dtype: int64

In [7]:
label = df['sentiment']
label = label.to_numpy()
type(label)


numpy.ndarray

In [8]:
# downloading nltk dependencies
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gouth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gouth\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gouth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
all_stopwords = stopwords.words('english')
negation = ['no','not']
all_stopwords = [w for w in all_stopwords if w not in negation]

def preprocessing(data):
    corpus = []
    for i in range(len(data)):
        # remove urls
        tweet = re.sub(r'http\S+', ' ', data[i]) # links

        # remove html tags
        tweet = re.sub(r'<.*?>', ' ', tweet) # tags        
        tweet = re.sub('&\w+([-.]\w+)*', ' ', tweet) # colorcodes

        # remove digits
        tweet = re.sub(r'\d+', ' ', tweet)
        tweet = re.sub('@\w+([-.]\w+)*', ' ', tweet) # mentions

        # remove emojis
        emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
        tweet = emoji_pattern.sub(r'', tweet) # no emoji

        # remove hashtags
        tweet = re.sub(r'#\w+', ' ', tweet) # hashtags
        review = re.sub('[^a-zA-Z]', ' ', tweet) # any characters other than alphabets
        review = review.lower() # casefolding
        review = review.split() 
        lemma= WordNetLemmatizer() # lemmatization
        review = [lemma.lemmatize(word) for word in review if word not in all_stopwords]
        review = ' '.join(review)
        corpus.append(review)
    return corpus


In [10]:
text = preprocessing(df['text'])
text_test =preprocessing(test_df['text'])

In [11]:
label_test = test_df['sentiment']
label_test = label_test.to_numpy()
type(label_test)


numpy.ndarray

In [12]:
X_train, X_val,y_train,y_val = train_test_split(text,label,test_size=0.20,stratify = label,random_state=42)

In [13]:
X_test=text_test
y_test=label_test

In [15]:
token = Tokenizer()
token.fit_on_texts(X_train)

In [16]:
# Convert text data to sequences of integers
train_sequences = token.texts_to_sequences(X_train)
valid_sequences = token.texts_to_sequences(X_val)
test_sequences = token.texts_to_sequences(X_test)

In [17]:
vocab_size = len(token.word_index)+1
vocab_size


52555

In [18]:
maxlen = 100 # specifies the maximum length of the sequences after padding or truncating.

X_train = pad_sequences(train_sequences, maxlen=maxlen,padding = 'post')
X_val = pad_sequences(valid_sequences, maxlen=maxlen,padding = 'post')
X_test = pad_sequences(test_sequences, maxlen=maxlen,padding = 'post')

In [19]:
X_train.shape


(119988, 100)

In [20]:
from keras.layers import LSTM


In [21]:
# Define the input shape
input_shape = (100,)

# Define the new parameters
vocab_size = 60133  # Adjusted based on the input shape
vec_size = 100  # Adjusted to match the input shape

bi_lstm =Sequential()

# Embedding layer
bi_lstm.add(Embedding(input_dim=vocab_size, output_dim=vec_size, input_length=input_shape[0], trainable=False))

# LSTM
bi_lstm.add(Bidirectional(LSTM(64, dropout=0.4, recurrent_dropout=0.4)))  # Adjusted units for the LSTM

# Output layer
bi_lstm.add(Dense(1, activation='sigmoid'))


In [22]:
bi_lstm.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])


In [23]:
bi_lstm_history = bi_lstm.fit(X_train,y_train,epochs=10,validation_data=(X_val,y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
accuracy = bi_lstm.evaluate(X_test, y_test)
print("Accuracy of the Bi-LSTN on the test set:", accuracy[1])

Accuracy of the Bi-LSTN on the test set: 0.7095713019371033
