In [136]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [137]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [138]:
data = pd.read_csv(r"C:\Users\Predator\Downloads\sentiment_analysis.csv", encoding='utf-8')

In [139]:
data.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


In [140]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [141]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [142]:
data['processed_text'] = data['text'].apply(preprocess_text)

In [143]:
data

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform,processed_text
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter,great day look like dream
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook,feel sorry miss sea beach
2,2017,8,18,night,Don't angry me,negative,Facebook,dont angry
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook,attend class listening teacher reading slide n...
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram,want go let go
...,...,...,...,...,...,...,...,...
494,2015,10,18,night,"According to , a quarter of families under six...",negative,Twitter,according quarter family six live poverty
495,2021,2,25,morning,the plan to not spend money is not going well,negative,Instagram,plan spend money going well
496,2022,5,30,noon,uploading all my bamboozle pictures of facebook,neutral,Facebook,uploading bamboozle picture facebook
497,2018,8,10,night,congratulations ! you guys finish a month ear...,positive,Twitter,congratulation guy finish month early booo


In [144]:
X = data['processed_text']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [145]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=5, max_df=0.7)),
    ('classifier', MultinomialNB())
])

In [146]:
pipeline.fit(X_train, y_train)

In [147]:
y_pred = pipeline.predict(X_test)

In [148]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.55

Confusion Matrix:
[[15 19  2]
 [ 2 22  6]
 [ 3 13 18]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.42      0.54        36
     neutral       0.41      0.73      0.52        30
    positive       0.69      0.53      0.60        34

    accuracy                           0.55       100
   macro avg       0.62      0.56      0.55       100
weighted avg       0.63      0.55      0.55       100



In [149]:
new_reviews = [
    "What a great morning! I loved it.",
    "I very short tempered so don't make me angry!!"
]

In [150]:
processed_new_reviews = [preprocess_text(review) for review in new_reviews]

# Make predictions
new_predictions = pipeline.predict(processed_new_reviews)

for review, sentiment in zip(new_reviews, new_predictions):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")

Review: What a great morning! I loved it.
Predicted Sentiment: positive

Review: I very short tempered so don't make me angry!!
Predicted Sentiment: negative

