<a href="https://colab.research.google.com/github/MANOJ-S-NEGI/SENTIMENT_ANALYSIS_NLP/blob/main/sentiments_analysis_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, nltk
import seaborn as sns
import string
nltk.download('punkt')
import tensorflow as tf

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df= pd.read_csv("/content/sentiment_analysis_clean_lemitized_data.csv")

In [3]:
df.sample(3)

Unnamed: 0,target,reviews
5217,positive,southwestair redcarpet southwest companion pas...
1142,negative,southwestair cancel flight flight minutes take...
5981,positive,usairways nick flight awesome please reward wa...


In [4]:
# seperating feature and target column:
x = df['reviews']

# Create a mapping dictionary
sentiment_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}
# Apply the mapping using the 'map' function
y = df['target'].map(sentiment_mapping)

In [5]:
x_token_frame = x.copy()

In [6]:
counter = 0
for text in x:
    split_word = []
    for split in text.split():
        split_word.append(split)
    # Join and tokenize
    tokenize_sentence = nltk.word_tokenize(' '.join(split_word))
    x_token_frame[counter] = tokenize_sentence.copy()
    counter = counter+1

In [7]:
x_token_frame.sample()

2878    [jetblue, start, xweekly, embraersa, e, flight...
Name: reviews, dtype: object

In [8]:
### integer encoding using keras:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token = '')
tokenizer.fit_on_texts(x_token_frame)

In [9]:
## performing the split:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_token_frame, y, test_size=0.10)

print(f"x_train shape {x_train.shape} x_train_length: {len(x_train)}")
print(f"y_train shape {y_train.shape} y_train_length: {len(y_train)}")

print(f"x_test shape {x_test.shape} x_test_length: {len(x_test)}")
print(f"y_test shape {y_test.shape} y_test_length: {len(y_test)}")

x_train shape (5400,) x_train_length: 5400
y_train shape (5400,) y_train_length: 5400
x_test shape (600,) x_test_length: 600
y_test shape (600,) y_test_length: 600


In [10]:
x_train[0]

['usairways',
 'call',
 'time',
 'redeem',
 'mile',
 'point',
 'cant',
 'get',
 'advertise',
 'miles',
 'make',
 'hard',
 'use']

In [11]:
# searching max length and vocab size

# findind input dim:
vocab_size = len(tokenizer.word_index)
vocab_size = vocab_size+1
print(f"Vocabulary size: {vocab_size}")




# searching max length:
max_seq_length = 0
for sentence in x_train:
    current_length = len(sentence)
    if current_length > max_seq_length:
        max_seq_length = current_length

print("max_seq_length", max_seq_length)

Vocabulary size: 7196
max_seq_length 22


In [12]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [13]:
## paddding the each rows :
from keras.utils import pad_sequences

x_train = pad_sequences(x_train, maxlen=max_seq_length, padding='post')
x_test = pad_sequences(x_test, maxlen=max_seq_length, padding='post')
print(f"x_train shape {x_train.shape}\n x_train_length: {len(x_train)}")
print(f"y_train shape {y_train.shape}\n y_train_length: {len(y_train)}")


x_train shape (5400, 22)
 x_train_length: 5400
y_train shape (5400,)
 y_train_length: 5400


In [14]:
x_train[:2]

array([[   5,  176,  615,  147, 3945,  460,  229,   81,   46,   19,  132,
        2389,    8,    2, 2390,    0,    0,    0,    0,    0,    0,    0],
       [   4,   24,   62, 5767,   30,  984, 5768, 1217, 1217,  125, 5769,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [15]:
## converting into numpy:

x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)


In [24]:
## creating sequential model layers:
model_sentiments = tf.keras.Sequential([

    tf.keras.layers.Embedding(input_dim = 7196 , output_dim=3, input_length = max_seq_length ),
    tf.keras.layers.LSTM(100,dropout=0.2, activation='relu', return_sequences = True),  # Added return_sequences=True
     tf.keras.layers.SpatialDropout1D(rate=0.3),
    tf.keras.layers.LSTM(100, dropout=0.2, activation='relu', return_sequences = False),  # Added return_sequences=True

    tf.keras.layers.Dense(3, activation='softmax')

])


## model summary:
model_sentiments.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 22, 3)             21588     
                                                                 
 lstm_7 (LSTM)               (None, 22, 100)           41600     
                                                                 
 spatial_dropout1d_6 (Spati  (None, 22, 100)           0         
 alDropout1D)                                                    
                                                                 
 lstm_8 (LSTM)               (None, 100)               80400     
                                                                 
 dense_3 (Dense)             (None, 3)                 303       
                                                                 
Total params: 143891 (562.07 KB)
Trainable params: 143891 (562.07 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [25]:
# Create a function to implement a Earlystop callback with loss monitor)
Early_stop = tf.keras.callbacks.EarlyStopping( monitor='val_loss', patience = 3, verbose = 2)

## model compiling :
optimize = tf.keras.optimizers.experimental.Adam(learning_rate=0.0001)
model_sentiments.compile(optimizer= optimize, loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [26]:
# fitting the model:
history = model_sentiments.fit(x_train,y_train,
                  validation_data=(x_test, y_test),
                  batch_size=32,
                  epochs=30,
                  callbacks=Early_stop)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 17: early stopping


In [32]:
prediction = model_sentiments.predict(x_test)



In [37]:
y_pred = []

for i in prediction:
    # Convert the probabilities to a NumPy array
    probabilities = np.array(i)
    # Get the index of the class with the highest probability
    predicted_class = np.argmax(probabilities)
    y_pred.append(predicted_class)


In [42]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {np.round(mae,2)}")


Mean Absolute Error (MAE): 0.54


In [44]:
result = []
for i in range(len(y_test)):
    if y_pred[i] == y_test[i]:
        result.append('correct prediction')
    else:
        result.append('wrong prediction')


test_table =  pd.DataFrame({'actual_sentiments': y_test, 'prediction_review': y_pred, 'result': result})


In [46]:
test_table.sample(50)

Unnamed: 0,actual_sentiments,prediction_review,result
201,0,2,wrong prediction
150,0,0,correct prediction
256,1,1,correct prediction
98,2,1,wrong prediction
183,2,0,wrong prediction
469,0,0,correct prediction
106,2,0,wrong prediction
348,0,0,correct prediction
97,1,1,correct prediction
298,1,2,wrong prediction


In [59]:
## checking predtion:

comment = "the airpalne got crack on the WINDOW seat are not comfortable"

In [60]:
## from nltk import stopwords
from nltk.corpus import stopwords

## from nltk importing WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

## downloading pre define list, we can make customized one
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Get the list of unique English stopwords
stop_words_eng = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [93]:
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

def sentiment_of_comment(text):
    try:
        tweet = []
        text = text.lower()
        split_text = word_tokenize(text)  # Use nltk's word_tokenize for tokenization

        for j in split_text:
            if j not in stop_words_eng:
                base_form = lemmatizer.lemmatize(j, pos='v')
                tweet.append(base_form)
            else:
                pass

        clean_text = ' '.join(tweet)
        token_word = tokenizer.texts_to_sequences([clean_text])  # Tokenize the clean text
        token_word = pad_sequences(token_word, maxlen=max_seq_length, padding='post')  # Pad the sequences

        prediction = model_sentiments.predict(token_word)
        pred_class = np.argmax(prediction)
        comment_list = ['negative', 'positive', 'neutral']

        return comment_list[pred_class]


    except Exception as e:
        raise Exception(e)



In [99]:
# Test the process function
comment = "while flying passangers situation become worse when one engine failed"
sentiment = sentiment_of_comment(comment)
print(sentiment)

negative


---
---