<a href="https://colab.research.google.com/github/Akompalwad/Sentimental-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I have referred [this](https://arxiv.org/pdf/1809.08651.pdf) research paper for making the model

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import re
from nltk.stem.porter import *
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [None]:
# this section is needed only if you get some error in nltk, it wouldnt hurt to
# to run it though
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
df = pd.read_csv('/content/drive/My Drive/datasets/train_twitter.csv',header=0)

In [None]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
df = df[['label','tweet']]

In [None]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [None]:
#This tells us about the distribution of data
df.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

**It can be seen that the data is highly imbalanced and number of positive tweets
is too high than number of negative tweets
For a good deep learning model it is really helpful when both the tweets are balanced.
  It is highly important that the data is not skewed.**

In [None]:
df_positive = df[df['label']==0]
df_negative = df[df['label']==1]

In [None]:
df_train_positive = df_positive.sample(6000)
df_train_negative = df_negative
print("Train positive shape",df_train_positive.shape)
print("Train negative shape",df_train_negative.shape)

Train positive shape (6000, 2)
Train negative shape (2242, 2)


In [None]:
train_df = pd.concat([df_train_positive,df_train_negative])
print(train_df.shape)
print("\n")
train_df.head()

(8242, 2)




Unnamed: 0,label,tweet
19622,0,happy euro day ð¬
1856,0,reading trumps speech transcripts. can't tell ...
16392,0,meal before the premiere! mmm #lvff #nsc #sh...
21020,0,"uh oh, warranty on @user humanoid is apparentl..."
26582,0,@user because lebron cried when he got trash t...


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8242 entries, 19622 to 31960
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    8242 non-null   int64 
 1   tweet    8242 non-null   object
 2   tweet_n  8242 non-null   object
dtypes: int64(1), object(2)
memory usage: 257.6+ KB


In [None]:
def clear_texts(tweet,remove_pattern):
  r = re.findall(remove_pattern,tweet)
  for i in r:
    tweet = re.sub(i,'',tweet)
  return tweet

In [None]:
#WE remove the @user handles in the data as they dont give any relevant information
train_df['tweet_n'] = np.vectorize(clear_texts)(train_df['tweet'],"@[\w]*")
#Now we remove the special characters and punctuations
train_df['tweet_n'] = train_df['tweet_n'].str.replace("[^a-zA-Z#]"," ")
#we remove words with length less than 3 as most of them are just helping verbs and dont account for important data
train_df['tweet_n'] = train_df['tweet_n'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
#make a list of tweets
train_df['tweet_n'] = train_df['tweet_n'].apply(lambda x:x.split())
#remove prefixes and suffixes from the words
stemmer = PorterStemmer()
train_df['tweet_n']= train_df['tweet_n'].apply(lambda x:[stemmer.stem(i) for i in x])
#return the list from of tweets back to string
train_df['tweet_n'] = train_df['tweet_n'].apply(lambda x:str(" ".join(i for i in x)))

In [None]:
train_df.head()

Unnamed: 0,label,tweet,tweet_n
19622,0,happy euro day ð¬,happi euro
1856,0,reading trumps speech transcripts. can't tell ...,read trump speech transcript tell read year bo...
16392,0,meal before the premiere! mmm #lvff #nsc #sh...,meal befor premier #lvff #nsc #shofilm #womeni...
21020,0,"uh oh, warranty on @user humanoid is apparentl...",warranti humanoid appar exactli year himself #...
26582,0,@user because lebron cried when he got trash t...,becaus lebron cri when trash talk down then le...


In [None]:
x_train,x_test,y_train,y_test = train_test_split(train_df['tweet_n'],train_df['label'],test_size = 0.2, random_state = 42)

In [110]:
pipeline = Pipeline(
    [
     ('bow',CountVectorizer(ngram_range=(1,3))),
     ('tfidf',TfidfTransformer(norm='l2')),
     ('classifier',LogisticRegression(C=100,solver = 'liblinear')),
    ]
)
pipeline.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=100, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [111]:
predictions = pipeline.predict(x_test)

In [112]:
print(classification_report(predictions,y_test))
print ('\n')
print(confusion_matrix(predictions,y_test))
print(accuracy_score(predictions,y_test))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      1266
           1       0.74      0.86      0.79       383

    accuracy                           0.90      1649
   macro avg       0.85      0.88      0.86      1649
weighted avg       0.90      0.90      0.90      1649



[[1150  116]
 [  55  328]]
0.8963007883565798


##The above model is used to make predictions about test dataset from [Analytics Vidhya](https://datahack.analyticsvidhya.com/contest/practice-problem-twitter-sentiment-analysis/)

In [113]:
test_df = pd.read_csv(r'/content/drive/My Drive/datasets/test_tweets.csv')
test_df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [114]:
#WE remove the @user handles in the data as they dont give any relevant information
test_df['tweet_n'] = np.vectorize(clear_texts)(test_df['tweet'],"@[\w]*")
#Now we remove the special characters and punctuations
test_df['tweet_n'] = test_df['tweet_n'].str.replace("[^a-zA-Z#]"," ")
#we remove words with length less than 3 as most of them are just helping verbs and dont account for important data
test_df['tweet_n'] = test_df['tweet_n'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
#make a list of tweets
test_df['tweet_n'] = test_df['tweet_n'].apply(lambda x:x.split())
#remove prefixes and suffixes from the words
stemmer = PorterStemmer()
test_df['tweet_n']= test_df['tweet_n'].apply(lambda x:[stemmer.stem(i) for i in x])
#return the list from of tweets back to string
test_df['tweet_n'] = test_df['tweet_n'].apply(lambda x:str(" ".join(i for i in x)))

In [115]:
test_df.head()

Unnamed: 0,id,tweet,tweet_n
0,31963,#studiolife #aislife #requires #passion #dedic...,#studiolif #aislif #requir #passion #dedic #wi...
1,31964,@user #white #supremacists want everyone to s...,#white #supremacist want everyon #bird #movi here
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe way heal your #acn #altwaystoh #healthi #...
3,31966,is the hp and the cursed child book up for res...,curs child book reserv alreadi where when #har...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",#bihday amaz hilari #nephew ahmir uncl dave lo...


In [116]:
result = pipeline.predict(test_df['tweet_n'])
result = pd.DataFrame(result)
result.to_csv('result16.csv')

##The code below this is for the api that we will be using for getting input in form of string and using our model on it

In [102]:
test_string = "Thank you @rseroter for the first ever, 10-tweet, VM to container App migration tutorial. It's that easy! (awesome job  @googlecloudteam) "

In [117]:
import pickle
pickle_file = 'sk_model.pkl'
with open(pickle_file,'wb') as file:
  pickle.dump(pipeline,file)

In [118]:
with open(pickle_file, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

Pickled_LR_Model

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=100, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [103]:
def text_preprocessing(string):
  def clear_texts(tweet,remove_pattern):
    r = re.findall(remove_pattern,tweet)
    for i in r:
      tweet = re.sub(i,'',tweet)
    return tweet
  #WE remove the @user handles in the data as they dont give any relevant information
  string = np.vectorize(clear_texts)(np.array(string),"@[\w]*")
  #Now we remove the special characters and punctuations
  string= str(string).replace("[^a-zA-Z#]"," ")
  #make a list of tweets
  string = string.split()
  #we remove words with length less than 3 as most of them are just helping verbs and dont account for important data
  string = ' '.join([w for w in string if len(w)>3])
  #remove prefixes and suffixes from the words
  stemmer = PorterStemmer()
  string= [stemmer.stem(i) for i in string.split()]
  #return the list from of tweets back to string
  string = " ".join(i for i in string)
  return string

In [104]:
res = text_preprocessing(test_string)

In [105]:
res = [res]

In [106]:
Pickled_LR_Model.predict(res)

array([0])