In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the labelled Reddit dataset: the 'text' column holds the documents and
# the 'emotion' column holds the target labels.
data = pd.read_csv('shuffled_redditinput.csv')
X = data['text']
y = data['emotion']

# Split the RAW text before any feature extraction. Fitting TF-IDF on the full
# dataset first would let the held-out rows influence the vocabulary and IDF
# weights (train/test leakage) and bias the test-set score.
# The split parameters (test_size, random_state) are unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary/IDF weights from the training text only, then
# apply that fitted transform to the held-out text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Kept so that any later cell referencing X_vectorized still finds it; it is
# now produced with the train-only vocabulary rather than by a full-data fit.
X_vectorized = vectorizer.transform(X)

# Train the Naive Bayes model on the training features.
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

# Evaluate the model on the held-out Reddit test split.
y_test_pred = nb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification report:\n", classification_report(y_test, y_test_pred))

# Load the second (tweets) dataset; it is read with the same 'text'/'emotion'
# column names as the Reddit data.
new_data = pd.read_csv('shuffled_tweetspredit.csv')
X_new = new_data['text']

# Reuse the already-fitted vectorizer (transform only, no refit) so the new
# documents are mapped onto the feature space the classifier was trained on.
X_new_vectorized = vectorizer.transform(X_new)

# Make predictions on the new dataset.
y_new_pred = nb_clf.predict(X_new_vectorized)

# Print the predicted emotions for the new dataset.
print("Predicted emotions:", y_new_pred)

# True labels for the new dataset, used to score the predictions above.
y_new_true = new_data['emotion']

# NOTE: `accuracy` is intentionally reassigned here (as in the original cell),
# so after this point it holds the score on the new dataset, not the test split.
accuracy = accuracy_score(y_new_true, y_new_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate a per-class classification report for the new dataset.
report = classification_report(y_new_true, y_new_pred)
print("Classification report:\n", report)

Accuracy: 0.60
Classification report:
               precision    recall  f1-score   support

       angry       0.54      0.74      0.63      1649
     disgust       0.56      0.29      0.38      1044
        fear       0.77      0.23      0.35       636
       happy       0.62      0.84      0.71      1579
         sad       0.59      0.63      0.61      1376
    surprise       0.69      0.48      0.57      1084

    accuracy                           0.60      7368
   macro avg       0.63      0.54      0.54      7368
weighted avg       0.61      0.60      0.57      7368

Predicted emotions: ['happy' 'happy' 'angry' 'sad' 'sad' 'happy' 'angry' 'angry' 'happy'
 'happy' 'surprise' 'disgust' 'happy' 'surprise' 'angry' 'disgust' 'angry'
 'angry' 'angry' 'angry' 'angry' 'happy' 'happy' 'happy' 'happy' 'happy'
 'sad' 'surprise' 'happy' 'sad' 'sad' 'angry' 'happy' 'happy' 'angry'
 'sad' 'sad' 'happy' 'angry' 'happy' 'angry' 'happy' 'angry' 'sad' 'sad'
 'angry' 'happy' 'surprise' 'angry' 's