In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('twitter_data.csv')

In [3]:
df

Unnamed: 0,sentiment,Tweet content
0,Neutral,I mentioned on Facebook that I was struggling ...
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Negative,@Microsoft Why do I pay for WORD when it funct...
3,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Neutral,Now the President is slapping Americans in the...
...,...,...
75677,Positive,Just realized that the Windows partition of my...
75678,Positive,Just realized that my Mac window partition is ...
75679,Positive,Just realized the windows partition of my Mac ...
75680,Positive,Just realized between the windows partition of...


In [4]:
df.isnull().sum()

Unnamed: 0,0
sentiment,0
Tweet content,686


In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Unnamed: 0,0
sentiment,0
Tweet content,0


In [7]:
df.shape

(74996, 2)

In [8]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Neutral,31440
Negative,22624
Positive,20932


In [9]:
X = df['Tweet content']
y = df['sentiment']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X_vectorized = vectorizer.fit_transform(X)

In [17]:
X_vectorized

<74996x285517 sparse matrix of type '<class 'numpy.int64'>'
	with 1443269 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=12)

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [20]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

In [23]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9054666666666666
              precision    recall  f1-score   support

    Negative       0.92      0.91      0.91      4452
     Neutral       0.90      0.91      0.91      6350
    Positive       0.89      0.89      0.89      4198

    accuracy                           0.91     15000
   macro avg       0.91      0.90      0.90     15000
weighted avg       0.91      0.91      0.91     15000



In [24]:
print(confusion_matrix(y_test,y_pred))

[[4045  278  129]
 [ 227 5810  313]
 [ 133  338 3727]]


In [25]:
import pickle

In [26]:
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [27]:
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)