## Importing Libraries

In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.stem import PorterStemmer
import re
import string
from nltk.corpus import stopwords


## Importing dataset and renaming columns

In [2]:
df = pd.read_csv('twitter_training.csv', header=None)
df= df.rename({0:'TweetID',1:"Entity",2:"Sentiment",3:"Tweet"}, axis = 1)


### Data Preprocessing

In [8]:
df = df[df['Sentiment']!="Irrelevant"]
df = df[df['Sentiment']!="Neutral"]
df =df.dropna()

In [3]:
df['Sentiment'].unique()


array(['Positive', 'Negative'], dtype=object)

In [4]:
df.head()

Unnamed: 0,TweetID,Entity,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


## Text Preprocessing

In [5]:
tweets = df['Tweet'].values

In [6]:
tweets[0]

'im getting on borderlands and i will murder you all ,'

In [7]:
processed_tweets = []
stemmer = PorterStemmer()
for tweet in tqdm(tweets):
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))
        tweet = [word.lower() for word in tweet.split(' ') if word.lower() not in stopwords.words('english')]
        tweet = [stemmer.stem(word) for word in tweet]
        tweet = [word for word in tweet if len(word)!=0]
        tweet = ' '.join(tweet)
        processed_tweets.append(tweet)

processed_tweets

100%|███████████████████████████████████████████████████████████████████████████| 43013/43013 [07:05<00:00, 100.98it/s]


['im get borderland murder',
 'come border kill',
 'im get borderland kill',
 'im come borderland murder',
 'im get borderland 2 murder',
 'im get borderland murder',
 'spent hour make someth fun dont know huge borderland fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pictwittercommlsi5wf9jg',
 'spent coupl hour someth fun dont know im huge borderland fan maya one favorit charact decid make wallpap pc here origin pictur compar creation made fun pictwittercom mlsi5wf9jg',
 'spent hour someth fun dont know im huge borderland fan maya one favorit charact',
 'spent hour make someth fun dont know huge rhandlerr fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pictwittercommlsi5wf9jg',
 '2010 spent hour make someth fun dont know huge rhandlerr fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pictwittercommlsi5wf9jg',
 '',
 'first borderland session long time actual realli

In [9]:
len(processed_tweets)

43013

## Preprocessing the Data for Model Building

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [11]:
enc_tweets = tfidf.fit_transform(processed_tweets)

In [12]:
tf = pd.DataFrame(enc_tweets.toarray(), columns=tfidf.get_feature_names_out())

In [13]:
tf

Unnamed: 0,00,000,00011,00014,00015,00015cant,00016,00054,00105,00107,...,это,юууу,ясс,اunk,اللعبه,حبيت,خلاص,خلاصunk,٥υ,ℐℓ٥
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
len(df[df['Sentiment']=='Negative'])

22358

In [16]:
len(df[df['Sentiment']=='Positive'])

20655

In [18]:
len(df['Sentiment'])

43013

## Prepairing Features and Labels

In [19]:
x = enc_tweets.toarray()
y = pd.get_dummies(df['Sentiment']).values[:,1:]

In [20]:
x.shape

(43013, 18452)

In [21]:
y.shape

(43013, 1)

## Splitting Data into training and testing parts

In [22]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.2)

In [23]:
from sklearn.naive_bayes import MultinomialNB
mdl = MultinomialNB()
mdl.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


## Model Evaluation

In [24]:
y_pred = mdl.predict(x_test)

In [25]:
correct = 0
wrong = 0
for pred,act in zip(y_pred,y_test):
    if pred==act:
        correct+=1
    else:
        wrong+=1

In [26]:
correct

7593

In [27]:
wrong

1010

In [32]:
print("Accuracy: ", (correct/(correct+wrong))*100,"%")

Accuracy:  88.25990933395327 %


### Model Accuracy

In [33]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)*100

In [34]:
accuracy

88.25990933395327

We can se that the model can predict the sentiment of tweets with whooping accuravy of 88.25%. 