## Importing Modules

In [1]:
import pandas as pd
import numpy as np
import math
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv(r"F:\AI and ML Diploma\NLP\twitter_sentiment_analysis\twitter_training.csv",header=None)
test = pd.read_csv(r"F:\AI and ML Diploma\NLP\twitter_sentiment_analysis\twitter_validation.csv",header=None)

In [3]:
train.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
train.nunique()

0    12447
1       32
2        4
3    69491
dtype: int64

## Data Preprocessing

In [5]:
train = train[(train[2]=='Positive') | (train[2]=='Negative')]
test = test[(test[2]=='Positive') | (test[2]=='Negative')]

In [6]:
train.dropna(inplace=True)
test.dropna(inplace=True)

In [7]:
tweet = train[3]
labels = train[2]

In [8]:
def clean_text(tweet):
    tweet= re.sub('(#|@)\w*',"",tweet) # remove hashtags and mentions
    tweet= re.sub("https?:\/\/\S+","",tweet) # remove links
    tweet= re.sub("(\?|!)+","",tweet) # remove question marks "???" or exclamatiomn marks "!!!"
    tweet= re.sub("\s\d+\s","",tweet) # remove numbers
    tweet= re.sub("(\.|\,)+","",tweet) # remove punctuation "." or ","
    tweet= re.sub("^\s+","",tweet)   # remove leading spaces (spaces at the beginning)
    tweet= re.sub("\s+$","",tweet) #  remove trailing spaces (spaces at the end)
    return tweet

In [9]:
tweets = tweet.apply(clean_text)

In [10]:
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [11]:
stop_words = set(stopwords.words('english'))

def process_sentence(tweets):
    clean_tweets=[]
    for tweet in tweets:
        tweet=clean_text(tweet)
        tweet=tweet.split()
        c_tweet=[word.lower() for word in tweet if word.lower() not in stop_words] #remove stop wods & convert to lower casee 
        #stemming 
        ps=PorterStemmer()
        clean_tweet=[ps.stem(word) for word in c_tweet]#convert word to  base 
        clean_tweets.append(clean_tweet)

    return clean_tweets

In [12]:
clean_tweets = process_sentence(tweets)

In [13]:
labels = pd.get_dummies(labels , drop_first=True)

## Building the Probabilistic Model

In [14]:
def build_freqs (filtered_sentence, ys):

    yslist = np. squeeze(ys).tolist()

    freqs = {}
    for y, tweet in zip(yslist, filtered_sentence):
        for word in tweet:
            pair = (word, y)
            if pair in freqs:
                freqs [pair] += 1
            else:
                freqs [pair] = 1
    return freqs

In [15]:
freqs = build_freqs(clean_tweets,labels)

In [16]:
freqs

{('im', True): 182,
 ('get', True): 1286,
 ('borderland', True): 343,
 ('murder', True): 39,
 ('come', True): 560,
 ('border', True): 25,
 ('kill', True): 269,
 ('borderlandsand', True): 23,
 ('spent', True): 37,
 ('hour', True): 126,
 ('make', True): 665,
 ('someth', True): 188,
 ('fun', True): 911,
 ('know', True): 507,
 ('huge', True): 145,
 ('fan', True): 163,
 ('maya', True): 19,
 ('one', True): 1064,
 ('favorit', True): 380,
 ('charact', True): 200,
 ('decid', True): 81,
 ('wallpap', True): 14,
 ('pc', True): 146,
 ('origin', True): 145,
 ('imag', True): 24,
 ('versu', True): 8,
 ('creation', True): 5,
 ('made', True): 295,
 (':)', True): 65,
 ('enjoy', True): 437,
 ('pictwittercom/mlsi5wf9jg', True): 3,
 ('coupl', True): 65,
 ("i'm", True): 1188,
 ("here'", True): 41,
 ('pictur', True): 33,
 ('compar', True): 37,
 ('made:)', True): 1,
 ('pictwittercom', True): 275,
 ('/', True): 2181,
 ('mlsi5wf9jg', True): 1,
 ('rhandlerr', True): 582,
 ('2010', True): 9,
 ('first', True): 458,

In [17]:
V = []

for word in freqs:
    V.append(word[0])
    
V_set = set(V)

In [36]:
len(V_set)

25636

In [19]:
N_pos=0
N_neg =0

for word in freqs:
    if word[1]:
        N_pos += freqs[word]
    else:
        N_neg += freqs[word]

In [35]:
N_neg,N_pos

(245078, 201079)

In [21]:
cond_prob = {}

for word in V_set:
    w1 = (freqs.get((word,True),0) +1) / (N_pos + len(V_set))
    w2 = (freqs.get((word,False),0) +1) / (N_neg + len(V_set))
    ratio = w1 / w2
    log_likelihood = math.log(ratio)
    cond_prob[word] = log_likelihood

In [22]:
cond_prob

{'h5': -0.5157779895730061,
 '"shit"': -0.5157779895730061,
 'redemptionsound': 1.969128660214994,
 'going-0': -0.5157779895730061,
 'buynot': -0.5157779895730061,
 'burden': -0.738921540887216,
 'cipher': 1.7868071034210395,
 'month:new': 1.2759814796550488,
 'babe': -1.9020723506928967,
 'pictwittercom/izoafew163': -1.2089251701329515,
 'yourdemo': -0.5157779895730061,
 'pictwittercom/gfk7wn5bow': -1.2089251701329515,
 'facebook-remov': -0.5157779895730061,
 'playstationupd': 0.8705163715468844,
 'kati': 0.023218511159680853,
 'technique:': 0.8705163715468844,
 "lincoln'": -0.5157779895730061,
 'fortim': 0.8705163715468844,
 'pictwittercom/axjqc4pift': -1.2089251701329515,
 'pictwittercom/mcgptlss5': 1.5636635521068298,
 'theto': -1.614390278241116,
 'inwould': -1.4320687214471612,
 'lack': -1.4540476281659365,
 'handsom': 1.8195969262440304,
 'highmountain': 1.969128660214994,
 'titanfalldo': -0.5157779895730061,
 '4-inch': 0.8705163715468844,
 'careless': -0.5157779895730061,
 'syn

In [23]:
train[2].value_counts(normalize=True)

2
Negative    0.519796
Positive    0.480204
Name: proportion, dtype: float64

In [24]:
T_pos= 0.480204
T_neg = 0.519796

prior = math.log (T_pos/T_neg)

In [25]:
def Naive_Bayes(Tweet):
    sum = prior
    for word in Tweet:
        sum += cond_prob.get(word,0)
    if sum>0:
        return True
    else:
        return False

In [26]:
y_pred = [Naive_Bayes(tweet) for tweet in clean_tweets]

## Testing the Model

In [28]:
print(f"Accuracy: {(accuracy_score(labels,y_pred)*100).round()} %")

Accuracy: 90.0 %


In [29]:
def Sentiment_Analysis(string):
    clean_text(string)
    process_sentence(string)
    if Naive_Bayes(string):
        print("Positive Statemenet")
    else:
        print("Negative Statemenent")

In [33]:
Sentiment_Analysis("I want to kill you")

Negative Statemenent


In [34]:
Sentiment_Analysis("I love you")

Positive Statemenet
