# Naive Bayes model for sentiment analysis
dataset: Tweet Sentiment Extractionfrom kaggle

In [238]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [220]:
df = pd.read_csv("tweet.csv")
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [221]:
df['sentiment'].value_counts()

neutral     11118
positive     8582
negative     7964
Name: sentiment, dtype: int64

In [222]:
df.dtypes

textID           object
text             object
selected_text    object
sentiment        object
dtype: object

#### cleaning dataset by removing special characters, extra whitespaces converting into lowercases

In [223]:
def clean_text(text):
    text = re.sub(r'\W', ' ', text) # Remove special characters
    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
    text = text.lower() # Convert to lowercase
    return text

In [224]:
df['cleaned_text'] = df['text'].astype(str).apply(clean_text)

In [225]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,i d have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldn t they put them on the rel...


In [226]:
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}

In [227]:
df['label'] = df['sentiment'].map(label_mapping)

In [228]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text,label
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,i d have responded if i were going,1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego,0
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldn t they put them on the rel...,0


##### CountVectorizer converts text documents to a matrix of token counts, includes tokenization, stop words and vectorization

In [229]:
vectorizer = CountVectorizer(stop_words = 'english')
X = vectorizer.fit_transform(df['cleaned_text'])

In [230]:
y = df['label']

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [232]:
# Initialize the classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

In [233]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.6407012470630761
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.59      0.63      1605
           1       0.59      0.65      0.62      2225
           2       0.69      0.68      0.68      1703

    accuracy                           0.64      5533
   macro avg       0.65      0.64      0.64      5533
weighted avg       0.64      0.64      0.64      5533



In [237]:
review = "this is good"

# Clean and vectorize new reviews
new_text_cleaned = clean_text(review)
new_X = vectorizer.transform([new_text_cleaned])

# Predict sentiment
predictions = nb_classifier.predict(new_X)
status_mapping = {2: "positive", 1: "neutral", 0: "negative"}
recommendation_status = [status_mapping[pred] for pred in predictions]
print("Predicted class labels:", predictions)
print("Recommendation statuses:", recommendation_status)

Predicted class labels: [2]
Recommendation statuses: ['positive']


In [239]:
joblib.dump(nb_classifier, 'naive_bayes_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']