In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Sample dataset
data = {
    'Text': [
        "I love the new design of this product!",
        "The service was terrible and slow.",
        "It's okay, nothing special.",
        "I'm so happy with the results!",
        "This is the worst experience I've ever had.",
        "Absolutely fantastic, will recommend to others.",
        "The product quality is very poor.",
        "Not bad, but could be better.",
        "I'm thrilled with this purchase!",
        "This was a waste of money.",
        "Customer support was extremely helpful.",
        "The color is nice, but the material feels cheap.",
        "I feel indifferent about this product.",
        "The performance exceeded my expectations.",
        "I'm never buying from this store again.",
        "The packaging was great, very professional.",
        "Terrible! It broke after one use.",
        "This is a decent product for the price.",
        "I'm impressed with the fast delivery.",
        "Disappointing, I expected much better quality."
    ],
    'Sentiment': [
        'Positive', 'Negative', 'Neutral', 'Positive', 'Negative',
        'Positive', 'Negative', 'Neutral', 'Positive', 'Negative',
        'Positive', 'Neutral', 'Neutral', 'Positive', 'Negative',
        'Positive', 'Negative', 'Neutral', 'Positive', 'Negative'
    ]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Text,Sentiment
0,I love the new design of this product!,Positive
1,The service was terrible and slow.,Negative
2,"It's okay, nothing special.",Neutral
3,I'm so happy with the results!,Positive
4,This is the worst experience I've ever had.,Negative
5,"Absolutely fantastic, will recommend to others.",Positive
6,The product quality is very poor.,Negative
7,"Not bad, but could be better.",Neutral
8,I'm thrilled with this purchase!,Positive
9,This was a waste of money.,Negative


In [4]:
# Preprocess the data
def preprocess(text):
    # Tokenization
    tokens = word_tokenize(text.lower())

    # Stopwords removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return stemmed_tokens

In [6]:
X = [preprocess(text) for text in data['Text']]
y = data['Sentiment']

In [8]:
X = [' '.join(tokens) for tokens in X]
# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [10]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Create a Naive Bayes classifier
nb = MultinomialNB()
# Train the classifier
nb.fit(X_train, y_train)

# Make predictions
y_pred = nb.predict(X_test)


In [13]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.5
Precision: 0.5833333333333334
Recall: 0.5
F1-score: 0.4583333333333333
