### 1. Importing Libraries

In [41]:
import re
import nltk
import joblib
import pickle
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer=PorterStemmer()

### 2. Importing Dataset

In [2]:
df=pd.read_csv('tweets/twitter_training.csv',header=None)
df=df.rename({0:'id',1:'company',2:'sentiment',3:'raw_tweet'},axis=1)

df=df[df['sentiment']!='Irrelevant']
df=df[df['sentiment']!='Neutral']
df=df.dropna()
tweets=df['raw_tweet'].values

In [3]:
df.head()

Unnamed: 0,id,company,sentiment,raw_tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


### 3. Text Preprocessing

In [4]:
processed_tweets=[]
for tweet in tqdm(tweets):
    tweet=re.sub(r'[^a-zA-Z]',' ',tweet)
    tweet=tweet.lower()
    tweet=[word for word in tweet.split(' ') if word not in stopwords.words('english')]
    tweet=[stemmer.stem(word) for word in tweet]
    tweet=[word for word in tweet if len(word)!=0]
    tweet=' '.join(tweet)
#     print(tweet)
    processed_tweets.append(tweet)

100%|███████████████████████████████████████████████████████████████████████████| 43013/43013 [02:39<00:00, 268.93it/s]


In [5]:
processed_tweets

['im get borderland murder',
 'come border kill',
 'im get borderland kill',
 'im come borderland murder',
 'im get borderland murder',
 'im get borderland murder',
 'spent hour make someth fun know huge borderland fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pic twitter com mlsi wf jg',
 'spent coupl hour someth fun know huge borderland fan maya one favorit charact decid make wallpap pc origin pictur compar creation made fun pic twitter com mlsi wf jg',
 'spent hour someth fun know huge borderland fan maya one favorit charact',
 'spent hour make someth fun know huge rhandlerr fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pic twitter com mlsi wf jg',
 'spent hour make someth fun know huge rhandlerr fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pic twitter com mlsi wf jg',
 '',
 'first borderland session long time actual realli satisfi combat experi got reall

### 4. TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer()

enc_tweets=tfidf.fit_transform(processed_tweets)

df_=pd.DataFrame(enc_tweets.toarray(),columns=tfidf.get_feature_names_out())

df_

Unnamed: 0,aa,aaa,aaaaaaaaaaaa,aaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaag,aaaaaaaaaag,aajtak,aamaavpjyc,aaron,...,zyfapoihpi,zynk,zyot,zywz,zz,zzbk,zzgi,zzp,zzvfsrhewg,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
enc_tweets.toarray().shape

(43013, 13699)

### 5.Saving the Model and Encoding

In [7]:
joblib.dump(tfidf,'tfidf_model.joblib')

# df.to_csv('tfidf_enc.csv',index=False)

# print('model is saved with name tfidf_model.joblib!')
# print('Encodings are saved with name tfidf_enc.csv')

# joblib.load('tfidf_model.joblib')

['tfidf_model.joblib']

### 6. Preprocess the Data for Model Building

In [8]:
x_train=enc_tweets.toarray()
y_train=pd.get_dummies(df['sentiment']).values[:,1:]

### 7. Split the dataset into training and testing

In [9]:
# from sklearn.model_selection import train_test_split

# x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2)



### 8. Model Training 

In [10]:
from sklearn.naive_bayes import MultinomialNB

mdl=MultinomialNB()

mdl.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


### 7. Preparing Testing Data

In [14]:
df_test=pd.read_csv('tweets/twitter_validation.csv',header=None)
df_test=df_test.rename({0:'id',1:'company',2:'sentiment',3:'raw_tweet'},axis=1)

df_test=df_test[df_test['sentiment']!='Irrelevant']
df_test=df_test[df_test['sentiment']!='Neutral']
df_test=df_test.dropna()
test_tweets=df_test['raw_tweet'].values

In [15]:
test_tweets

array(['@Microsoft Why do I pay for WORD when it functions so poorly on my @SamsungUS Chromebook? 🙄',
       "CSGO matchmaking is so full of closet hacking, it's a truly awful game.",
       'Hi @EAHelp I’ve had Madeleine McCann in my cellar for the past 13 years and the little sneaky thing just escaped whilst I was loading up some fifa points, she took my card and I’m having to use my paypal account but it isn’t working, can you help me resolve it please?',
       'Thank you @EAMaddenNFL!! \n\nNew TE Austin Hooper in the ORANGE & BROWN!! \n\n#Browns | @AustinHooper18 \n\n pic.twitter.com/GRg4xzFKOn',
       'Rocket League, Sea of Thieves or Rainbow Six: Siege🤔? I love playing all three on stream but which is the best? #stream #twitch #RocketLeague #SeaOfThieves #RainbowSixSiege #follow',
       'my ass still knee-deep in Assassins Creed Odyssey with no way out anytime soon lmao',
       'FIX IT JESUS ! Please FIX IT ! What In the world is going on here.  @PlayStation @AskPlayStation @

In [16]:
processed_tweets=[]
for tweet in tqdm(test_tweets):
    tweet=re.sub(r'[^a-zA-Z]',' ',tweet)
    tweet=tweet.lower()
    tweet=[word for word in tweet.split(' ') if word not in stopwords.words('english')]
    tweet=[stemmer.stem(word) for word in tweet]
    tweet=[word for word in tweet if len(word)!=0]
    tweet=' '.join(tweet)
#     print(tweet)
    processed_tweets.append(tweet)
    
enc_tweets=tfidf.transform(processed_tweets)

df_=pd.DataFrame(enc_tweets.toarray(),columns=tfidf.get_feature_names_out())

df_

100%|███████████████████████████████████████████████████████████████████████████████| 543/543 [00:02<00:00, 253.07it/s]


Unnamed: 0,aa,aaa,aaaaaaaaaaaa,aaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaag,aaaaaaaaaag,aajtak,aamaavpjyc,aaron,...,zyfapoihpi,zynk,zyot,zywz,zz,zzbk,zzgi,zzp,zzvfsrhewg,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
processed_tweets

['microsoft pay word function poorli samsungu chromebook',
 'csgo matchmak full closet hack truli aw game',
 'hi eahelp madelein mccann cellar past year littl sneaki thing escap whilst load fifa point took card use paypal account work help resolv pleas',
 'thank eamaddennfl new te austin hooper orang brown brown austinhoop pic twitter com grg xzfkon',
 'rocket leagu sea thiev rainbow six sieg love play three stream best stream twitch rocketleagu seaofthiev rainbowsixsieg follow',
 'ass still knee deep assassin creed odyssey way anytim soon lmao',
 'fix jesu pleas fix world go playstat askplayst playstationsup treyarch callofduti neg silver wolf error code pic twitter com ziryhrf q',
 'profession dota scene fuck explod complet welcom get garbag',
 'itch assassin tccgif assassinscreedblackflag assassinscre thecapturedcollect pic twitter com vv mogtcjw',
 'fredtjoseph hey fred comcast cut cabl verizon stay call shut pic twitter com cpwsrmuedg',
 'nba k game suck second left team intent fo

In [22]:
x_test=enc_tweets.toarray()
y_test=pd.get_dummies(df_test['sentiment'])['Positive'].values

In [24]:
y_pred=mdl.predict(x_test)

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9189686924493554

In [31]:
len(df[df['sentiment']!='Negative'])

20655

In [30]:
len(df_test[df_test['sentiment']!='Negative'])

277

In [32]:
len(df[df['sentiment']=='Negative'])

22358

In [33]:
len(df_test[df_test['sentiment']=='Negative'])

266

### 8. Model Saving

In [34]:
import pickle

In [37]:
pickle.dump(mdl,open('model.mdl','wb'))

In [39]:
mdl=pickle.load(open('model.mdl','rb'))

In [40]:
accuracy_score(y_test,mdl.predict(x_test))

0.9189686924493554