In [1]:
import pandas as pd
import numpy as np  
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import wordnet
import os
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     /home/hrithikpaul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hrithikpaul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hrithikpaul/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/hrithikpaul/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/hrithikpaul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#data-preprocessing-
training_data=pd.read_csv("train.csv")
print(training_data.columns)
training_data.dropna(inplace=True)

Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')


In [3]:
import re 
def clean(text):
# Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', text)
    return text

# Cleaning the text in the review column
training_data['Cleaned Reviews'] = training_data['selected_text'].apply(clean)
training_data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Cleaned Reviews
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,I d have responded if I were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,Sons of


In [4]:
# Creating the Bag of Words model
word2count = {}
for data in training_data["Cleaned Reviews"]:
	words = nltk.word_tokenize(data)
	for word in words:
		if word not in word2count.keys():
			word2count[word] = 1
		else:
			word2count[word] += 1


In [5]:
review_df = training_data[training_data['sentiment'] != 'neutral']

print(review_df.shape)
review_df.head(5)

(16363, 5)


Unnamed: 0,textID,text,selected_text,sentiment,Cleaned Reviews
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,Sons of
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,fun


In [6]:
sentiment_label = review_df.sentiment.factorize()
sentiment_label

(array([0, 0, 0, ..., 0, 1, 1]),
 Index(['negative', 'positive'], dtype='object'))

In [7]:
tweet = review_df.text.values
tweet
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(tweet,sentiment_label[0],test_size=0.3,random_state=42)

In [8]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)

tokenizer.fit_on_texts(x_train)
encoded_docs = tokenizer.texts_to_sequences(x_train)

2023-01-20 22:13:59.038084: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-20 22:13:59.328662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-20 22:13:59.328704: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-20 22:14:00.637418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [9]:
vocab_size = len( tokenizer.word_index )+1 
vocab_size

15187

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [49]:
from keras.models import Sequential
from keras.backend import clear_session
from keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from keras.layers import Embedding
clear_session()

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
#model.add(Dense(128,activation="tanh"))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           485984    
                                                                 
 spatial_dropout1d (SpatialD  (None, 200, 32)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 502,635
Trainable params: 502,635
Non-trainable params: 0
__________________________________________________

In [50]:
history = model.fit(padded_sequence,y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from keras.models import save_model
save_model(model,"lstm_model.h5")

In [38]:
def predict_sentiment(text,model):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
    #print("Predicted label: ", sentiment_label[1][prediction])
    #return sentiment_label[1][prediction]
    return prediction

In [14]:
predicted=[]
for i in x_test:
    predicted.append(predict_sentiment(i))
##print(predicted)




In [15]:
print(accuracy_score(y_test,predicted))

0.8698309227948666


In [16]:
import random
import pandas as pd
from sklearn.base import clone
from deap.algorithms import eaSimple
from deap import base, creator, tools
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [17]:
def compute_fitness_score(individual):
    """
    Select the features from the individual, train
    and compute the accuracy_score.
    
    Example:
    individual = [0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1]
    The 1 represents the presence of features and
    0 represents the absence of features
    
    """
    column_support = pd.Series(individual).astype(bool)
   # global x_train, y_train, x_test, y_test, model
    
    #x_train_ = x_train[x_train.columns[column_support]]
    #x_test_ = x_test[x_test.columns[column_support]]

    model.fit(padded_sequence,y_train)
    #y_pred = model.predict(real_data,predicted)
    score = accuracy_score(y_test,predicted)
    
    return (score,)

In [46]:
#model =model
ind_size = model.count_params()


creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("weight_bin", random.random)   #Initiate random weights
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.weight_bin, n=ind_size)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.01)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate",compute_fitness_score)


stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("Mean", np.mean)
stats.register("Max", np.max)
stats.register("Min", np.min)


pop = toolbox.population(n=5)   #n = No. of individual in a population
hof = tools.HallOfFame(1)



In [47]:
pop, log = eaSimple(pop,toolbox,cxpb=0.5,mutpb=0.01,ngen=5,halloffame=hof,stats=stats)

gen	nevals	Mean    	Max     	Min     
0  	5     	0.516195	0.516195	0.516195
1  	0     	0.516195	0.516195	0.516195
2  	4     	0.516195	0.516195	0.516195
3  	0     	0.516195	0.516195	0.516195
4  	0     	0.516195	0.516195	0.516195
5  	4     	0.516195	0.516195	0.516195


In [48]:
best_pop = sorted(pop, key=lambda ind: ind.fitness, reverse=True)[0]

In [40]:
k=str(input("enter your value :"))
print(sentiment_label[1][predict_sentiment(k,model)])

positive


In [32]:
import pickle
with open("genetic_model.pkl", "wb") as cp_file:
    pickle.dump(best_pop, cp_file)

In [33]:
best=pickle.load(open("genetic_model.pkl","rb"))


In [34]:
def model_weights_as_vector(model):
    weights_vector = []

    for layer in model.layers: 
        if layer.trainable:
            layer_weights = layer.get_weights()
            for l_weights in layer_weights:
                vector = np.reshape(l_weights, newshape=(l_weights.size))
                weights_vector.extend(vector)

    return np.array(weights_vector)
def model_weights_as_matrix(model, weights_vector):
    weights_matrix = []

    start = 0
    for layer_idx, layer in enumerate(model.layers): 
        layer_weights = layer.get_weights()
        if layer.trainable:
            for l_weights in layer_weights:
                layer_weights_shape = l_weights.shape
                layer_weights_size = l_weights.size
        
                layer_weights_vector = weights_vector[start:start + layer_weights_size]
                layer_weights_matrix = np.reshape(layer_weights_vector, newshape=(layer_weights_shape))
                weights_matrix.append(layer_weights_matrix)
        
                start = start + layer_weights_size
        else:
            for l_weights in layer_weights:
                weights_matrix.append(l_weights)

    return weights_matrix
ind_size = model.count_params()


creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

best_weight = model_weights_as_matrix(model, best)

model.set_weights(best_weight)



In [35]:
predicted=[]
for i in x_test:
    predicted.append(predict_sentiment(i))
##print(predicted)
print(accuracy_score(y_test,predicted))


0.5161947443471175


In [None]:
#from keras.models import save_model
#save_model(model,"lstm_model.h5")

In [17]:
test_data=pd.read_csv("test.csv")

test_data.isnull().sum()
test_data['Cleaned Reviews'] = test_data['text'].apply(clean)
test_data.head()

Unnamed: 0,textID,text,sentiment,Cleaned Reviews
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Last session of the day http twitpic com ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,Shanghai is also really exciting precisely sk...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,Recession hit Veronique Branquinho she has to ...
3,01082688c6,happy bday!,positive,happy bday
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,http twitpic com w p I like it


In [18]:
review_df =test_data[test_data['sentiment'] != 'neutral']

print(review_df.shape)
review_df.head(5)
real_label = review_df.sentiment.factorize()
real_label
real_label = review_df.sentiment.factorize()
real_label
test_data_pred = review_df.text.values
test_data_pred


(2104, 4)


array([' Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China:  (SH)  (BJ).',
       'Recession hit Veronique Branquinho, she has to quit her company, such a shame!',
       ' happy bday!', ...,
       ' I know what you mean. My little dog is sinking into depression... he wants to move someplace tropical',
       '_sutra what is your next youtube video gonna be about? I love your videos!',
       ' http://twitpic.com/4woj2 - omgssh  ang cute ng bby.!'],
      dtype=object)

In [19]:


sentiment_predict=[]
for i in test_data_pred:
  sentiment_predict.append(predict_sentiment(i))



In [20]:
from sklearn.metrics import accuracy_score
predicted=[]
print(sentiment_predict[0])
for i in sentiment_predict:
    print(i)
    if i=="negative":
        predicted.append(0)
    else:
        predicted.append(1)
print(predicted)


positive
positive
negative
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
negative
positive
positive
negative
positive
positive
negative
negative
negative
negative
positive
positive
negative
positive
negative
positive
positive
positive
negative
negative
positive
negative
positive
positive
positive
negative
negative
negative
negative
positive
positive
positive
negative
negative
negative
positive
positive
negative
positive
positive
negative
positive
negative
positive
negative
positive
positive
negative
positive
negative
positive
positive
positive
positive
negative
negative
positive
positive
positive
negative
negative
positive
positive
positive
positive
negative
negative
positive
positive
positive
negative
negative
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
positive
positive
negative
positive
negative
negative
negative
negative
negative
positive
positive
p

In [21]:
real_data=[]
for i in real_label[0]:
    #print(i)
    real_data.append(i)
real_data=real_data[::-1]

In [22]:
print(accuracy_score(real_data,predicted))

0.502851711026616


In [23]:
print(padded_sequence.shape)

(16363, 200)


In [24]:
in_dimen =padded_sequence.shape[1]  #Total no. of observations made about the environment
out_dimen =sentiment_label[0].shape   #Total no. of possible actions. In this case it can take 2 discrete values

In [19]:
import random
import pandas as pd
from sklearn.base import clone
from deap.algorithms import eaSimple
from deap import base, creator, tools
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [20]:
def compute_fitness_score(individual):
    """
    Select the features from the individual, train
    and compute the accuracy_score.
    
    Example:
    individual = [0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1]
    The 1 represents the presence of features and
    0 represents the absence of features
    
    """
    column_support = pd.Series(individual).astype(bool)
   # global x_train, y_train, x_test, y_test, model
    
    #x_train_ = x_train[x_train.columns[column_support]]
    #x_test_ = x_test[x_test.columns[column_support]]

    model.fit(padded_sequence,sentiment_label[0])
    #y_pred = model.predict(real_data,predicted)
    score = accuracy_score(real_data,predicted)
    
    return (score,)

In [27]:
#model =model
ind_size = model.count_params()


creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("weight_bin", random.random)   #Initiate random weights
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.weight_bin, n=ind_size)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.01)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate",compute_fitness_score)


stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("Mean", np.mean)
stats.register("Max", np.max)
stats.register("Min", np.min)


pop = toolbox.population(n=2)   #n = No. of individual in a population
hof = tools.HallOfFame(1)

In [28]:
pop, log = eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.01, ngen=10, halloffame=hof, stats=stats)

gen	nevals	Mean    	Max     	Min     
0  	2     	0.502852	0.502852	0.502852
1  	2     	0.502852	0.502852	0.502852
2  	2     	0.502852	0.502852	0.502852
3  	2     	0.502852	0.502852	0.502852
4  	2     	0.502852	0.502852	0.502852
5  	2     	0.502852	0.502852	0.502852
6  	0     	0.502852	0.502852	0.502852
7  	2     	0.502852	0.502852	0.502852
8  	0     	0.502852	0.502852	0.502852
9  	2     	0.502852	0.502852	0.502852
10 	0     	0.502852	0.502852	0.502852


In [30]:
best_pop = sorted(pop, key=lambda ind: ind.fitness, reverse=True)[0]

In [44]:
import pickle
with open("gen_model.pkl", "wb") as cp_file:
    pickle.dump(hof.items[0], cp_file)

In [32]:
import pickle
with open("model1.pkl", "wb") as cp_file:
    pickle.dump(best_pop, cp_file)

In [60]:
best=pickle.load(open("gen_model.pkl","rb"))


In [61]:
def model_weights_as_vector(model):
    weights_vector = []

    for layer in model.layers: 
        if layer.trainable:
            layer_weights = layer.get_weights()
            for l_weights in layer_weights:
                vector = np.reshape(l_weights, newshape=(l_weights.size))
                weights_vector.extend(vector)

    return np.array(weights_vector)
def model_weights_as_matrix(model, weights_vector):
    weights_matrix = []

    start = 0
    for layer_idx, layer in enumerate(model.layers): 
        layer_weights = layer.get_weights()
        if layer.trainable:
            for l_weights in layer_weights:
                layer_weights_shape = l_weights.shape
                layer_weights_size = l_weights.size
        
                layer_weights_vector = weights_vector[start:start + layer_weights_size]
                layer_weights_matrix = np.reshape(layer_weights_vector, newshape=(layer_weights_shape))
                weights_matrix.append(layer_weights_matrix)
        
                start = start + layer_weights_size
        else:
            for l_weights in layer_weights:
                weights_matrix.append(l_weights)

    return weights_matrix
ind_size = model.count_params()


creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

best_weight = model_weights_as_matrix(model, best)

model.set_weights(best_weight)



In [41]:
def predict_sentiment(text):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
    #print("Predicted label: ", sentiment_label[1][prediction])
    return sentiment_label[1][prediction]

sentiment_predict=[]
for i in test_data_pred:
  sentiment_predict.append(predict_sentiment(i))
from sklearn.metrics import accuracy_score
predicted=[]
print(sentiment_predict[0])
for i in sentiment_predict:
    print(i)
    if i=="negative":
        predicted.append(0)
    else:
        predicted.append(1)
#print(predicted)
real_data=[]


positive
positive
negative
positive
positive
positive
negative
positive
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
negative
positive
positive
negative
positive
positive
negative
negative
negative
positive
positive
positive
negative
negative
negative
positive
positive
positive
negative
negative
positive
negative
positive
positive
positive
negative
negative
negative
positive
positive
positive
positive
negative
negative
negative
positive
positive
negative
positive
positive
negative
positive
negative
positive
negative
positive
positive
negative
positive
negative
positive
positive
positive
positive
positive
negative
positive
positive
positive
negative
negative
positive
positive
positive
positive
negative
negative
positive
positive
positive
negative
negative
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
positive
positive
negative
positive
negative
negative
negative
negative
negative
positive
positive
p

In [62]:
def predict_sentiment(text,model):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
    #print("Predicted label: ", sentiment_label[1][prediction])
    return sentiment_label[1][prediction]

sentiment_predict=[]
for i in test_data_pred:
  sentiment_predict.append(predict_sentiment(i,model))



In [63]:
from sklearn.metrics import accuracy_score
predicted=[]
#print(sentiment_predict[0])
for i in sentiment_predict:
    #print(i)
    if i=="negative":
        predicted.append(0)
    else:
        predicted.append(1)
#print(predicted)
real_data=[]
for i in real_label[0]:
    #print(i)
    real_data.append(i)
#real_data=real_data
print(accuracy_score(real_data,predicted))

0.47576045627376423


In [None]:
n_genes = padded_sequence.shape[1]
print(n_genes)
n_generations = 2
n_population = 2
crossover_probability = 0.6
mutation_probability = 0.2


def setup_toolbox():
    # container for individuals
    creator.create('FitnessMax', base.Fitness, weights=(1.0,))
    creator.create('Individual', list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register(
        'individual_generator_function',
        random.randint, 0, 1
    )
    # method to populate individual
    toolbox.register(
        'individual_generator',
        tools.initRepeat,
        creator.Individual,
        toolbox.individual_generator_function,
        n_genes
    )
    # method to create population
    toolbox.register(
        'population_generator',
        tools.initRepeat,
        list,
        toolbox.individual_generator
    )
    # fitness calculation
    toolbox.register(
        'evaluate', compute_fitness_score
    )
    # selection
    toolbox.register(
        'select', tools.selTournament, tournsize=3
    )
    # crossover
    toolbox.register('mate', tools.cxOnePoint)
    # mutation
    toolbox.register(
        'mutate',
        tools.mutFlipBit,
        indpb=mutation_probability
    )
    return toolbox

200
