In [2]:
import os
import numpy as np
import pandas as pd
from tensorflow import keras
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, Dense, GlobalAveragePooling1D, Bidirectional, LSTM, Dropout
from keras.preprocessing.text import Tokenizer
import pickle
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MANYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MANYA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
VOCAB_SIZE = 10000
MAX_LEN = 250
EMBEDDING_DIM = 100
MODEL_PATH = "app\sentiment_analysis_model.h5"
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

file_path = "data.csv"
df = pd.read_csv(file_path, encoding = 'ISO-8859-1')
df_shuffled = df.sample(frac=1).reset_index(drop=True)

In [36]:
texts = []
labels = []

print(df_shuffled.head())

   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  4  1694014092  Mon May 04 00:42:00 PDT 2009  NO_QUERY        oliyoung   
1  0  2054972242  Sat Jun 06 08:27:02 PDT 2009  NO_QUERY        syndiloo   
2  0  2252784977  Sat Jun 20 06:59:42 PDT 2009  NO_QUERY         Senfaye   
3  0  2015069307  Wed Jun 03 03:36:16 PDT 2009  NO_QUERY       Emily1384   
4  4  1556529370  Sat Apr 18 23:12:07 PDT 2009  NO_QUERY    olivertobias   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  @Mattro heh, i meant differences between b3 an...                                                                   
1   studying! i know secretly thats what everyone...                                                                   
2  WOW, only 1 minute to go and @emmynoodle is no...                                                                   
3  @TashaLxo I have one as well. It's reeeaaally ...           

In [37]:
stemmer = SnowballStemmer("english")
def preprocess_text(text):

    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()


    # Tokenize the text into words
    words = nltk.word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stem the words
    words = [stemmer.stem(word) for word in words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

for index, row in df_shuffled.iterrows():
    text = preprocess_text(row.iloc[-1])
    texts.append(text)
    label = row.iloc[0]
    labels.append(0 if label == 0 else 1 if label == 2 else 2)

    
print("done")

KeyboardInterrupt: 

In [7]:
len(labels)

1646475

In [8]:
len(texts)

1646475

In [None]:
print(texts[500])

want write blog think someth good write


In [9]:
texts = np.array(texts)
labels = np.array(labels)

In [19]:
new_df = pd.DataFrame({'texts': texts, 'labels': labels})

In [20]:
new_df

Unnamed: 0,texts,labels
0,h c l long nhong ngo ng tr n 1 ng c n l n nh h...,0
1,pitchi friggin fell asleep,0
2,doesnt want go work,0
3,anyon could ever abandon boston terrier never ...,0
4,dad best let go warp tour citi wish could joke...,0
...,...,...
1646470,headach,0
1646471,watch dolphin ocean,2
1646472,omg hot im melt thought could handl guess im s...,0
1646473,cours mean love youu,2


In [23]:
new_df.to_csv("prepdata.csv")

In [4]:
new_df = pd.read_csv("prepdata.csv")

In [5]:
texts = new_df['texts']
labels = new_df['labels']

In [6]:
texts

0          h c l long nhong ngo ng tr n 1 ng c n l n nh h...
1                                 pitchi friggin fell asleep
2                                        doesnt want go work
3          anyon could ever abandon boston terrier never ...
4          dad best let go warp tour citi wish could joke...
                                 ...                        
1646470                                              headach
1646471                                  watch dolphin ocean
1646472    omg hot im melt thought could handl guess im s...
1646473                                 cours mean love youu
1646474                sound yummi sorri get hang last night
Name: texts, Length: 1646475, dtype: object

In [7]:
labels

0          0
1          0
2          0
3          0
4          0
          ..
1646470    0
1646471    2
1646472    0
1646473    2
1646474    0
Name: labels, Length: 1646475, dtype: int64

In [8]:
texts = list(texts)

In [9]:
texts = [text for text in texts if isinstance(text, str)]

In [10]:
texts = np.array(texts)

In [11]:

# print("Shape of texts:",texts.shape)

#Tokenize the sequences
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

#Padding the sequences
padded_sequences = pad_sequences(sequences, maxlen = MAX_LEN, value=VOCAB_SIZE-1, padding='post')
print(padded_sequences[0])
print("Padded:",len(padded_sequences))

#Save the tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Split data into training and test sets 
train_data = padded_sequences[:-100000]
test_data = padded_sequences[-100000:]
train_labels = labels[:-100000]
test_labels = labels[-100000:]
print("Train",len(train_data))
print("Test:",len(test_data))

[1109  327  517   99 1303 3810  155  120 1303  327  155  517  155 2542
 5406  327  625  517  327 1935 4300 1489  155 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999 9999
 9999 

In [12]:
keras.callbacks.ReduceLROnPlateau

keras.callbacks.ReduceLROnPlateau

In [13]:
from keras.callbacks import ReduceLROnPlateau

In [14]:
if os.path.exists(MODEL_PATH):
    print("Loading saved model...")
    model = load_model(MODEL_PATH)
else:
    print("Training a new model...")
    
    # Create a MirroredStrategy.
    strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    with strategy.scope():
        # Define the model
        model = Sequential([
            Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
            Bidirectional(LSTM(64, dropout=0.2)),
            Dropout(0.2),
            Dense(24, activation='relu'),
            Dense(3, activation='softmax')  # 3 classes: negative, neutral, positive
        ])

        # Compile the model
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        
        # Define ReduceLROnPlateau callback
        reduce_lr = ReduceLROnPlateau(factor=0.1,
                                        min_lr=0.01,
                                        monitor='val_loss',
                                        verbose=1)

        # Train the model
        model.fit(train_data, train_labels, epochs=10, batch_size=32, validation_split=0.2, callbacks=[reduce_lr])

        # Save the trained model
        model.save(MODEL_PATH)


Training a new model...
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 1/10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
IN

KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test accuracy: {accuracy * 100:.2f}%")

In [None]:
# Interactive loop for predictions
def encode_text(text):
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [tokenizer.word_index[word] if word in tokenizer.word_index else 0 for word in tokens]
    return pad_sequences([tokens], maxlen=MAX_LEN, padding='post', value=VOCAB_SIZE-1)

In [None]:
while True:
    user_input = input("Enter a sentence for sentiment analysis (or 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    
    encoded_input = encode_text(user_input)
    prediction = np.argmax(model.predict(encoded_input))

    if prediction == 0:
        print("Sentiment: Negative")
    elif prediction == 1:
        print("Sentiment: Neutral")
    else:
        print("Sentiment: Positive")