In [15]:
# Naive_Bayes_(ComplementNB).py

# Step 1: လိုအပ်သော Libraries များ Import ပြုလုပ်ခြင်း
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Feature & Target Selection
# Dataset ကို load လုပ်ခြင်း (CSV file assumed with columns: 'review' and 'sentiment')
# ================================
# 2. Load Dataset
# ================================

# Change your file path
DATA_PATH = r"D:\AI_Project_2026\AI_2026\Day08  Naive Bayes\groupA\po\IMDB Dataset.csv"

df = pd.read_csv(DATA_PATH)

print("Dataset Shape :", df.shape)

print("\nFirst 5 Rows")
print(df.head())

# Dataset preview
print("=== Dataset Preview ===")
print(df.head())

# Preprocessing: text ကို lowercase ပြောင်းခြင်း, NaN values ဖယ်ရှားခြင်း
df['review'] = df['review'].str.lower()
df.dropna(subset=['review', 'sentiment'], inplace=True)

# Sentiment ကို numeric label ပြောင်းခြင်း (positive=1, negative=0)
df['sentiment_label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Features နှင့် Target များ သတ်မှတ်ခြင်း
X = df['review']          # Feature
y = df['sentiment_label'] # Target

# Step 3: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: စာသားများကို ကိန်းဂဏန်းအဖြစ် ပြောင်းလဲခြင်း (TF-IDF Vectorization)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Step 5: Complement Naive Bayes Model တည်ဆောက်ခြင်းနှင့် Training
model = ComplementNB()
model.fit(X_train_vect, y_train)

# Step 6: Model ၏ တိကျမှုကို စစ်ဆေးခြင်း (Accuracy Evaluation)
y_pred = model.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}\n")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))
import re

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation
    return text

df['review'] = df['review'].apply(clean_text).str.lower()

# Step 7: စာသားအသစ်များဖြင့် လက်တွေ့စမ်းသပ်ခြင်း (Manual Testing)
# new_reviews = [
#     "This product is amazing! I love it.",
#     "The service was awful and I am disappointed."
# ]

new_reviews = [
    "This product is amazing! I love it.",
    "Absolutely fantastic service and quality!",
    "I am very satisfied with my purchase.",
    "Highly recommend this to everyone!",
    "Exceeded my expectations, very happy!"
]

new_reviews_vect = vectorizer.transform(new_reviews)
predictions = model.predict(new_reviews_vect)

for review, pred in zip(new_reviews, predictions):
    label = "positive" if pred == 1 else "negative"
    print(f"Review: {review}\nPredicted Sentiment: {label}\n")

Dataset Shape : (50000, 2)

First 5 Rows
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
=== Dataset Preview ===
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Model Accuracy: 0.8839

Confusion Matrix:
[[4456  505]
 [ 656 4383]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.