In [1]:
# import libraries yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import warnings
warnings.filterwarnings('ignore')

# library buat text preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# library buat machine learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline imporSt Pipeline

# download data dari NLTK buat teknisasi NLP
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('vader_lexicon', quiet=True)
except:
    pass

print("All libraries imported successfully!")

All libraries imported successfully!


In [4]:
# function buat load data review
def load_review_data():
    """Load review data from available files"""
    reviews_data = []
    
    review_files = [
        'reviews_MSH Niacinamide Brightening Moisture Gel_20250629_140112.json',
        'reviews_Skintific 5X Ceramide Barrier Repair Moisture Gel Moisturizer_20250629_004723.json'
    ]
    
    for file in review_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)  # data is already a list of reviews
                product_name = file.split('_')[1:-2]
                product_name = ' '.join(product_name)
                
                # Add product info to each review
                for review in data:  # data is the list of reviews directly
                    review['product'] = product_name
                    reviews_data.append(review)
                    
            print(f"Loaded {len(data)} reviews from {product_name}")
        except FileNotFoundError:
            print(f"File {file} not found")
        except Exception as e:
            print(f"Error loading {file}: {e}")
    
    return reviews_data

# Load data
reviews_data = load_review_data()
print(f"\nTotal reviews loaded: {len(reviews_data)}")

# Convert to DataFrame
df = pd.DataFrame(reviews_data)
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.head()

Loaded 352 reviews from MSH Niacinamide Brightening Moisture Gel
Loaded 669 reviews from Skintific 5X Ceramide Barrier Repair Moisture Gel Moisturizer

Total reviews loaded: 1021
DataFrame shape: (1021, 4)
Columns: ['review', 'page', 'scraped_at', 'product']


Unnamed: 0,review,page,scraped_at,product
0,"awalnya sih bagus bagus aja, tapi entah kenapa...",1,2025-06-29T13:59:29.047105,MSH Niacinamide Brightening Moisture Gel
1,Dibandingkan dengan moisturizer yang warna bir...,1,2025-06-29T13:59:29.047139,MSH Niacinamide Brightening Moisture Gel
2,Sampai sekarang belum ngerasaan kalo produk in...,1,2025-06-29T13:59:29.047144,MSH Niacinamide Brightening Moisture Gel
3,"Moisturizer paling bikin cerah di muka, memper...",1,2025-06-29T13:59:29.047172,MSH Niacinamide Brightening Moisture Gel
4,aku suka banget sama moisturizer Skintific yan...,1,2025-06-29T13:59:29.047184,MSH Niacinamide Brightening Moisture Gel


In [5]:
# Text Preprocessing Class
class IndonesianTextPreprocessor:
    def __init__(self):
        # Inisiasi stemmer dan stopword remover untuk Bahasa Indonesia
        factory = StemmerFactory()
        self.stemmer = factory.create_stemmer()
        
        stop_factory = StopWordRemoverFactory()
        self.stopword_remover = stop_factory.create_stop_word_remover()
        
        # Additional Indonesian stopwords
        self.additional_stopwords = {
            'yang', 'ini', 'itu', 'dengan', 'untuk', 'dari', 'ke', 'di', 'pada', 'oleh',
            'saya', 'aku', 'kamu', 'dia', 'mereka', 'kita', 'kalian',
            'sangat', 'sekali', 'banget', 'bgt', 'sih', 'deh', 'dong', 'loh', 'kok',
            'produk', 'skincare', 'wajah', 'kulit', 'moisturizer', 'gel'
        }
    
    def clean_text(self, text):
        """Clean and normalize text"""
        if pd.isna(text) or text == '':
            return ''
        
        # Convert kata ke lowercase
        text = str(text).lower()
        
        # Hapus URL (jika ada)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Hapus email address (jika ada)
        text = re.sub(r'\S+@\S+', '', text)
        
        # Hapus angka dan karakter khusus (kaya emoticon atau puctuations), sisakan kata dan spasi
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # hapus whitespace yang berlebih
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def tokenize_text(self, text):
        """Tokenize text into words"""
        if not text:
            return []
        
        # Tokenisasi sederhana (dipisahin dari spasi)
        tokens = text.split()
        
        # filter token yang kependekan
        tokens = [token for token in tokens if len(token) >= 4 and len(token) <= 25]

        return tokens
    
    def remove_stopwords(self, tokens):
        """Remove stopwords from tokens"""
        # Join tokens to use Sastrawi stopword remover
        text = ' '.join(tokens)
        text = self.stopword_remover.remove(text)
        
        # Split back to tokens and remove additional stopwords
        tokens = text.split()
        tokens = [token for token in tokens if token not in self.additional_stopwords]
        
        return tokens
    
    def stem_tokens(self, tokens):
        """Apply stemming to tokens"""
        # Join tokens for stemming
        text = ' '.join(tokens)
        stemmed_text = self.stemmer.stem(text)
        
        return stemmed_text.split()
    
    def preprocess(self, text, include_stemming=True):
        """Complete preprocessing pipeline"""
        # bersihkan teks
        cleaned_text = self.clean_text(text)
        
        # tokenisasi
        tokens = self.tokenize_text(cleaned_text)
        
        # hapus stopwords
        tokens = self.remove_stopwords(tokens)
        
        # lakukan stemming - optional (default selalu true)
        if include_stemming:
            tokens = self.stem_tokens(tokens)
        
        # kembalikan ke string
        processed_text = ' '.join(tokens)
        
        return processed_text

# Inisiasi preprocessor
preprocessor = IndonesianTextPreprocessor()

# Lakukan preprocessing pada contoh review
if len(df) > 0:
    sample_text = df['review'].iloc[0] if 'review' in df.columns else str(df.iloc[0, 0])
    print("Original text:")
    print(sample_text)
    print("\nProcessed text:")
    print(preprocessor.preprocess(sample_text))
else:
    print("No data available for testing")

Original text:
awalnya sih bagus bagus aja, tapi entah kenapa lama kelamaan finishnya itu kayak keliatan berminyak gitu padahal pakainya dikit, terus kalau di pakai pagi hari jadi keliatan kusam banget, mencerahkannya juga perubahannya dikit banget, repurchase? no sorry

Processed text:
awal bagus bagus entah lama lama finishnya kayak liat minyak gitu padahal pakai dikit terus kalau pakai pagi hari jadi liat kusam cerah ubah dikit repurchase sorry
awal bagus bagus entah lama lama finishnya kayak liat minyak gitu padahal pakai dikit terus kalau pakai pagi hari jadi liat kusam cerah ubah dikit repurchase sorry


In [6]:
# Sentiment Labeling Class
class IndonesianSentimentLabeler:
    def __init__(self):
        # Indonesian positive words
        self.positive_words = {
            'bagus', 'baik', 'suka', 'senang', 'puas', 'mantap', 'keren', 'oke', 'cocok', 
            'recommended', 'recommend', 'love', 'amazing', 'perfect', 'excellent', 'good',
            'lembut', 'halus', 'wangi', 'fresh', 'nyaman', 'mudah', 'cepat', 'efektif',
            'moisturizing', 'hydrating', 'melembabkan', 'mencerahkan', 'melembutkan',
            'terjangkau', 'murah', 'worth', 'worthed', 'satisfied', 'puas', 'puass'
        }
        
        # Indonesian negative words
        self.negative_words = {
            'jelek', 'buruk', 'tidak', 'gak', 'ga', 'nggak', 'benci', 'kecewa', 'disappointed',
            'bad', 'terrible', 'awful', 'worst', 'hate', 'problem', 'masalah', 'error',
            'breakout', 'iritasi', 'gatal', 'perih', 'panas', 'berminyak', 'lengket',
            'berat', 'sulit', 'susah', 'mahal', 'expensive', 'overpriced', 'zonk',
            'mengecewakan', 'gagal', 'fail', 'worst'
        }
        
        # Negation words that can flip sentiment
        self.negation_words = {'tidak', 'nggak', 'gak', 'ga', 'bukan', 'never', 'no'}
    
    def count_sentiment_words(self, text):
        """Count positive and negative words in text"""
        if not text:
            return 0, 0
        
        words = text.lower().split()
        positive_count = 0
        negative_count = 0
        
        # Check for negations
        negated = False
        for i, word in enumerate(words):
            if word in self.negation_words:
                negated = True
                continue
            
            if word in self.positive_words:
                if negated:
                    negative_count += 1
                else:
                    positive_count += 1
                negated = False
            elif word in self.negative_words:
                if negated:
                    positive_count += 1
                else:
                    negative_count += 1
                negated = False
            else:
                # Reset negation if we encounter a non-sentiment word
                if negated and i > 0 and words[i-1] in self.negation_words:
                    continue
                negated = False
        
        return positive_count, negative_count
    
    def label_sentiment(self, text):
        """Label sentiment as positive, negative, or neutral"""
        pos_count, neg_count = self.count_sentiment_words(text)
        
        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        else:
            # For neutral cases, we can use additional heuristics
            if 'ok' in text.lower() or 'oke' in text.lower():
                return 'neutral'
            elif len(text.split()) < 3:  # Very short reviews are often neutral
                return 'neutral'
            else:
                return 'neutral'

# Initialize sentiment labeler
sentiment_labeler = IndonesianSentimentLabeler()

# Apply preprocessing and sentiment labeling to the dataset
if len(df) > 0:
    # Determine the column with review text
    text_column = None
    for col in df.columns:
        if 'review' in col.lower() or 'text' in col.lower() or 'comment' in col.lower():
            text_column = col
            break
    
    if text_column is None:
        # Use the first string column
        for col in df.columns:
            if df[col].dtype == 'object':
                text_column = col
                break
    
    if text_column:
        print(f"Using column '{text_column}' for text analysis")
        
        # Apply preprocessing
        print("Applying text preprocessing...")
        df['processed_text'] = df[text_column].apply(lambda x: preprocessor.preprocess(str(x)))
        
        # Apply sentiment labeling
        print("Applying sentiment labeling...")
        df['sentiment'] = df[text_column].apply(lambda x: sentiment_labeler.label_sentiment(str(x)))
        
        # Display results
        print(f"\nDataset shape after processing: {df.shape}")
        print(f"\nSentiment distribution:")
        print(df['sentiment'].value_counts())
        
        # Show sample processed data
        print(f"\nSample processed data:")
        sample_df = df[['product', text_column, 'processed_text', 'sentiment']].head()
        for idx, row in sample_df.iterrows():
            print(f"\nProduct: {row['product']}")
            print(f"Original: {row[text_column][:100]}...")
            print(f"Processed: {row['processed_text'][:100]}...")
            print(f"Sentiment: {row['sentiment']}")
            print("-" * 50)
    else:
        print("No suitable text column found in the dataset")
else:
    print("No data available for processing")

Using column 'review' for text analysis
Applying text preprocessing...
Applying sentiment labeling...

Dataset shape after processing: (1021, 6)

Sentiment distribution:
sentiment
positive    658
neutral     325
negative     38
Name: count, dtype: int64

Sample processed data:

Product: MSH Niacinamide Brightening Moisture Gel
Original: awalnya sih bagus bagus aja, tapi entah kenapa lama kelamaan finishnya itu kayak keliatan berminyak ...
Processed: awal bagus bagus entah lama lama finishnya kayak liat minyak gitu padahal pakai dikit terus kalau pa...
Sentiment: positive
--------------------------------------------------

Product: MSH Niacinamide Brightening Moisture Gel
Original: Dibandingkan dengan moisturizer yang warna biru, yang ini ga terlalu moist di muka super kering aku...
Processed: banding warna biru terlalu moist muka super kering...
Sentiment: neutral
--------------------------------------------------

Product: MSH Niacinamide Brightening Moisture Gel
Original: Sampai seka

In [None]:
# Data Exploration and Visualization
def explore_sentiment_data(df):
    """Explore and visualize sentiment data"""
    if len(df) == 0:
        print("No data to explore")
        return
    
    plt.figure(figsize=(15, 10))
    
    # 1. Sentiment distribution
    plt.subplot(2, 3, 1)
    sentiment_counts = df['sentiment'].value_counts()
    plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%')
    plt.title('Sentiment Distribution')
    
    # 2. Sentiment by product
    plt.subplot(2, 3, 2)
    if 'product' in df.columns:
        sentiment_product = pd.crosstab(df['product'], df['sentiment'])
        sentiment_product.plot(kind='bar', stacked=True, ax=plt.gca())
        plt.title('Sentiment by Product')
        plt.xticks(rotation=45)
        plt.legend(title='Sentiment')
    
    # 3. Text length distribution
    plt.subplot(2, 3, 3)
    text_col = None
    for col in df.columns:
        if 'review' in col.lower() or 'text' in col.lower() or 'comment' in col.lower():
            text_col = col
            break
    
    if text_col:
        df['text_length'] = df[text_col].astype(str).str.len()
        plt.hist(df['text_length'], bins=30, alpha=0.7)
        plt.title('Text Length Distribution')
        plt.xlabel('Text Length (characters)')
        plt.ylabel('Frequency')
    
    # 4. Processed text length distribution
    plt.subplot(2, 3, 4)
    if 'processed_text' in df.columns:
        df['processed_length'] = df['processed_text'].str.len()
        plt.hist(df['processed_length'], bins=30, alpha=0.7, color='orange')
        plt.title('Processed Text Length Distribution')
        plt.xlabel('Processed Text Length (characters)')
        plt.ylabel('Frequency')
    
    # 5. Word count distribution by sentiment
    plt.subplot(2, 3, 5)
    if 'processed_text' in df.columns:
        df['word_count'] = df['processed_text'].str.split().str.len()
        for sentiment in df['sentiment'].unique():
            subset = df[df['sentiment'] == sentiment]['word_count']
            plt.hist(subset, alpha=0.6, label=sentiment, bins=20)
        plt.title('Word Count by Sentiment')
        plt.xlabel('Word Count')
        plt.ylabel('Frequency')
        plt.legend()
    
    # 6. Average word count by sentiment
    plt.subplot(2, 3, 6)
    if 'word_count' in df.columns:
        avg_words = df.groupby('sentiment')['word_count'].mean()
        plt.bar(avg_words.index, avg_words.values)
        plt.title('Average Word Count by Sentiment')
        plt.xlabel('Sentiment')
        plt.ylabel('Average Word Count')
    
    plt.tight_layout()
    plt.show()
    
    # Print statistics
    print("\n=== DATASET STATISTICS ===")
    print(f"Total reviews: {len(df)}")
    print(f"Unique products: {df['product'].nunique() if 'product' in df.columns else 'N/A'}")
    
    if text_col:
        print(f"\nText length statistics:")
        print(df['text_length'].describe())
    
    if 'processed_text' in df.columns:
        print(f"\nProcessed text length statistics:")
        print(df['processed_length'].describe())
        
        print(f"\nWord count statistics:")
        print(df['word_count'].describe())
    
    print(f"\nSentiment distribution:")
    for sentiment, count in df['sentiment'].value_counts().items():
        percentage = (count / len(df)) * 100
        print(f"{sentiment}: {count} ({percentage:.1f}%)")

# Explore the data
explore_sentiment_data(df)

In [None]:
# MultinomialNB Sentiment Analysis Model
class SentimentAnalysisModel:
    def __init__(self, vectorizer_type='tfidf', max_features=5000, ngram_range=(1, 2)):
        """
        Initialize the sentiment analysis model
        
        Parameters:
        - vectorizer_type: 'tfidf' or 'count'
        - max_features: maximum number of features to extract
        - ngram_range: tuple indicating n-gram range (e.g., (1,1) for unigrams, (1,2) for unigrams+bigrams)
        """
        self.vectorizer_type = vectorizer_type
        self.max_features = max_features
        self.ngram_range = ngram_range
        
        # Initialize vectorizer
        if vectorizer_type == 'tfidf':
            self.vectorizer = TfidfVectorizer(
                max_features=max_features,
                ngram_range=ngram_range,
                stop_words=None,  # We already removed stopwords
                lowercase=False,  # Already lowercased
                token_pattern=r'\b\w+\b'
            )
        else:
            self.vectorizer = CountVectorizer(
                max_features=max_features,
                ngram_range=ngram_range,
                stop_words=None,
                lowercase=False,
                token_pattern=r'\b\w+\b'
            )
        
        # Initialize MultinomialNB
        self.model = MultinomialNB()
        
        # Create pipeline
        self.pipeline = Pipeline([
            ('vectorizer', self.vectorizer),
            ('classifier', self.model)
        ])
        
        self.is_trained = False
    
    def prepare_data(self, df, text_column='processed_text', target_column='sentiment'):
        """Prepare data for training"""
        # Filter out empty texts
        df_clean = df.dropna(subset=[text_column, target_column])
        df_clean = df_clean[df_clean[text_column].str.strip() != '']
        
        X = df_clean[text_column].values
        y = df_clean[target_column].values
        
        print(f"Prepared {len(X)} samples for training")
        print(f"Class distribution: {pd.Series(y).value_counts()}")
        
        return X, y
    
    def train(self, X, y, test_size=0.2, random_state=42):
        """Train the model"""
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        
        print(f"Training set size: {len(X_train)}")
        print(f"Test set size: {len(X_test)}")
        
        # Train the model
        print("Training MultinomialNB model...")
        self.pipeline.fit(X_train, y_train)
        self.is_trained = True
        
        # Store test data for evaluation
        self.X_test = X_test
        self.y_test = y_test
        self.X_train = X_train
        self.y_train = y_train
        
        print("Training completed!")
        
        return X_train, X_test, y_train, y_test
    
    def evaluate(self):
        """Evaluate the model"""
        if not self.is_trained:
            print("Model is not trained yet!")
            return
        
        # Predictions on training set
        y_train_pred = self.pipeline.predict(self.X_train)
        train_accuracy = accuracy_score(self.y_train, y_train_pred)
        
        # Predictions on test set
        y_test_pred = self.pipeline.predict(self.X_test)
        test_accuracy = accuracy_score(self.y_test, y_test_pred)
        
        print("=== MODEL EVALUATION ===")
        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        
        print(f"\n=== CLASSIFICATION REPORT (Test Set) ===")
        print(classification_report(self.y_test, y_test_pred))
        
        # Confusion Matrix
        print(f"\n=== CONFUSION MATRIX ===")
        cm = confusion_matrix(self.y_test, y_test_pred)
        print(cm)
        
        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=self.pipeline.classes_,
                   yticklabels=self.pipeline.classes_)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
        
        return {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'classification_report': classification_report(self.y_test, y_test_pred, output_dict=True),
            'confusion_matrix': cm
        }
    
    def cross_validate(self, X, y, cv=5):
        """Perform cross-validation"""
        if not self.is_trained:
            print("Training model for cross-validation...")
            self.pipeline.fit(X, y)
        
        print(f"Performing {cv}-fold cross-validation...")
        cv_scores = cross_val_score(self.pipeline, X, y, cv=cv, scoring='accuracy')
        
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        return cv_scores
    
    def predict(self, texts):
        """Make predictions on new texts"""
        if not self.is_trained:
            print("Model is not trained yet!")
            return None
        
        if isinstance(texts, str):
            texts = [texts]
        
        predictions = self.pipeline.predict(texts)
        probabilities = self.pipeline.predict_proba(texts)
        
        results = []
        for i, text in enumerate(texts):
            result = {
                'text': text,
                'predicted_sentiment': predictions[i],
                'probabilities': {
                    class_name: prob 
                    for class_name, prob in zip(self.pipeline.classes_, probabilities[i])
                }
            }
            results.append(result)
        
        return results
    
    def get_feature_importance(self, top_n=20):
        """Get most important features for each class"""
        if not self.is_trained:
            print("Model is not trained yet!")
            return
        
        feature_names = self.vectorizer.get_feature_names_out()
        classes = self.pipeline.classes_
        
        # Get feature log probabilities
        feature_log_prob = self.model.feature_log_prob_
        
        print("=== TOP FEATURES BY CLASS ===")
        for i, class_name in enumerate(classes):
            top_features_idx = feature_log_prob[i].argsort()[-top_n:][::-1]
            top_features = [(feature_names[idx], feature_log_prob[i][idx]) for idx in top_features_idx]
            
            print(f"\nTop {top_n} features for {class_name}:")
            for feature, prob in top_features:
                print(f"  {feature}: {prob:.4f}")

# Train the model if we have data
if len(df) > 0 and 'processed_text' in df.columns and 'sentiment' in df.columns:
    print("Initializing sentiment analysis model...")
    model = SentimentAnalysisModel(vectorizer_type='tfidf', max_features=3000, ngram_range=(1, 2))
    
    # Prepare data
    X, y = model.prepare_data(df)
    
    if len(X) > 0:
        # Train the model
        X_train, X_test, y_train, y_test = model.train(X, y)
        
        # Evaluate the model
        evaluation_results = model.evaluate()
        
        # Cross-validation
        cv_scores = model.cross_validate(X, y)
        
        # Show feature importance
        model.get_feature_importance(top_n=15)
    else:
        print("No valid data available for training")
else:
    print("Required columns not found. Please run previous cells first.")

In [None]:
# Model Testing and Hyperparameter Tuning
def test_model_predictions(model, preprocessor):
    """Test model with sample predictions"""
    if not model.is_trained:
        print("Model is not trained yet!")
        return
    
    # Sample test texts (in Indonesian)
    test_texts = [
        "Produk ini sangat bagus dan membuat kulit saya lembut sekali",
        "Saya tidak suka produk ini karena membuat kulit berminyak",
        "Produk biasa saja, tidak ada perubahan yang signifikan",
        "Amazing product! Highly recommended for sensitive skin",
        "Terrible product, caused breakout on my skin",
        "Moisturizer ini cocok banget untuk kulit kering saya",
        "Gak cocok di kulit saya, malah jadi bruntusan",
        "Harga terjangkau dan kualitas bagus",
        "Overpriced untuk kualitas yang biasa aja",
        "Love this gel moisturizer, very hydrating"
    ]
    
    print("=== TESTING MODEL WITH SAMPLE TEXTS ===")
    for i, text in enumerate(test_texts, 1):
        # Preprocess text
        processed_text = preprocessor.preprocess(text)
        
        # Get prediction
        results = model.predict([processed_text])
        result = results[0]
        
        print(f"\n{i}. Original: {text}")
        print(f"   Processed: {processed_text}")
        print(f"   Predicted: {result['predicted_sentiment']}")
        print(f"   Probabilities: {result['probabilities']}")

def hyperparameter_tuning(X, y):
    """Perform hyperparameter tuning for the model"""
    print("=== HYPERPARAMETER TUNING ===")
    
    # Define parameter grid
    param_grid = {
        'vectorizer__max_features': [1000, 3000, 5000],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'classifier__alpha': [0.1, 0.5, 1.0, 2.0]
    }
    
    # Create pipeline for grid search
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words=None, lowercase=False)),
        ('classifier', MultinomialNB())
    ])
    
    # Grid search with cross-validation
    grid_search = GridSearchCV(
        pipeline, 
        param_grid, 
        cv=3,  # Use 3-fold CV for faster execution
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    
    print("Starting grid search...")
    grid_search.fit(X, y)
    
    print(f"\nBest parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
    
    # Train model with best parameters
    best_model = grid_search.best_estimator_
    
    return best_model, grid_search.best_params_

def analyze_misclassifications(model):
    """Analyze misclassified examples"""
    if not model.is_trained:
        print("Model is not trained yet!")
        return
    
    # Get predictions
    y_pred = model.pipeline.predict(model.X_test)
    
    # Find misclassified examples
    misclassified_idx = []
    for i, (true_label, pred_label) in enumerate(zip(model.y_test, y_pred)):
        if true_label != pred_label:
            misclassified_idx.append(i)
    
    print(f"=== MISCLASSIFICATION ANALYSIS ===")
    print(f"Total misclassified: {len(misclassified_idx)} out of {len(model.y_test)}")
    
    # Show some examples
    print(f"\nSample misclassified examples:")
    for i, idx in enumerate(misclassified_idx[:10]):  # Show first 10
        text = model.X_test[idx]
        true_label = model.y_test[idx]
        pred_label = y_pred[idx]
        
        print(f"\n{i+1}. Text: {text[:100]}...")
        print(f"   True: {true_label}, Predicted: {pred_label}")

# Run additional tests if model is available
if 'model' in locals() and model.is_trained:
    # Test with sample predictions
    test_model_predictions(model, preprocessor)
    
    # Analyze misclassifications
    analyze_misclassifications(model)
    
    # Hyperparameter tuning (commented out to save time, uncomment if needed)
    print("\n" + "="*50)
    print("HYPERPARAMETER TUNING")
    print("This may take several minutes...")
    
    # Uncomment the lines below to run hyperparameter tuning
    # best_model, best_params = hyperparameter_tuning(X, y)
    # print(f"Best model accuracy: {best_model.score(model.X_test, model.y_test):.4f}")
    
    print("\nNote: Hyperparameter tuning is commented out to save time.")
    print("Uncomment the lines above to run it.")
else:
    print("Model not available. Please run the training cell first.")

In [None]:
# Model Saving and Final Analysis
import pickle
import json
from datetime import datetime

def save_model_and_results(model, evaluation_results, cv_scores, filename_prefix='sentiment_model'):
    """Save the trained model and results"""
    if not model.is_trained:
        print("Model is not trained yet!")
        return
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Save the model
    model_filename = f"{filename_prefix}_{timestamp}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved as: {model_filename}")
    
    # Save evaluation results
    results = {
        'timestamp': timestamp,
        'model_type': 'MultinomialNB',
        'vectorizer_type': model.vectorizer_type,
        'max_features': model.max_features,
        'ngram_range': model.ngram_range,
        'train_accuracy': evaluation_results['train_accuracy'],
        'test_accuracy': evaluation_results['test_accuracy'],
        'classification_report': evaluation_results['classification_report'],
        'cv_scores': cv_scores.tolist(),
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }
    
    results_filename = f"evaluation_results_{timestamp}.json"
    with open(results_filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"Evaluation results saved as: {results_filename}")
    
    return model_filename, results_filename

def load_model(filename):
    """Load a saved model"""
    try:
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        print(f"Model loaded from: {filename}")
        return model
    except FileNotFoundError:
        print(f"Model file {filename} not found!")
        return None

def create_sentiment_analysis_report(df, model, evaluation_results, cv_scores):
    """Create a comprehensive analysis report"""
    report = []
    report.append("="*60)
    report.append("SENTIMENT ANALYSIS REPORT")
    report.append("="*60)
    report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")
    
    # Dataset Statistics
    report.append("DATASET STATISTICS:")
    report.append(f"- Total reviews: {len(df)}")
    report.append(f"- Unique products: {df['product'].nunique() if 'product' in df.columns else 'N/A'}")
    
    if 'sentiment' in df.columns:
        sentiment_dist = df['sentiment'].value_counts()
        report.append("- Sentiment distribution:")
        for sentiment, count in sentiment_dist.items():
            percentage = (count / len(df)) * 100
            report.append(f"  {sentiment}: {count} ({percentage:.1f}%)")
    
    report.append("")
    
    # Model Configuration
    report.append("MODEL CONFIGURATION:")
    report.append(f"- Algorithm: MultinomialNB")
    report.append(f"- Vectorizer: {model.vectorizer_type.upper()}")
    report.append(f"- Max features: {model.max_features}")
    report.append(f"- N-gram range: {model.ngram_range}")
    report.append("")
    
    # Model Performance
    report.append("MODEL PERFORMANCE:")
    report.append(f"- Training accuracy: {evaluation_results['train_accuracy']:.4f}")
    report.append(f"- Test accuracy: {evaluation_results['test_accuracy']:.4f}")
    report.append(f"- Cross-validation mean: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    report.append("")
    
    # Classification Report
    report.append("DETAILED CLASSIFICATION METRICS:")
    class_report = evaluation_results['classification_report']
    for class_name in ['negative', 'neutral', 'positive']:
        if class_name in class_report:
            metrics = class_report[class_name]
            report.append(f"{class_name.upper()}:")
            report.append(f"  Precision: {metrics['precision']:.4f}")
            report.append(f"  Recall: {metrics['recall']:.4f}")
            report.append(f"  F1-score: {metrics['f1-score']:.4f}")
            report.append(f"  Support: {metrics['support']}")
            report.append("")
    
    # Overall metrics
    if 'macro avg' in class_report:
        macro_avg = class_report['macro avg']
        report.append("MACRO AVERAGE:")
        report.append(f"  Precision: {macro_avg['precision']:.4f}")
        report.append(f"  Recall: {macro_avg['recall']:.4f}")
        report.append(f"  F1-score: {macro_avg['f1-score']:.4f}")
        report.append("")
    
    # Recommendations
    report.append("RECOMMENDATIONS:")
    test_acc = evaluation_results['test_accuracy']
    if test_acc >= 0.8:
        report.append("- Model performance is excellent (>80% accuracy)")
    elif test_acc >= 0.7:
        report.append("- Model performance is good (70-80% accuracy)")
    elif test_acc >= 0.6:
        report.append("- Model performance is acceptable (60-70% accuracy)")
        report.append("- Consider collecting more training data")
        report.append("- Try different preprocessing techniques")
    else:
        report.append("- Model performance needs improvement (<60% accuracy)")
        report.append("- Collect more diverse training data")
        report.append("- Review preprocessing and feature extraction")
        report.append("- Consider different algorithms or ensemble methods")
    
    report.append("")
    report.append("="*60)
    
    # Save report
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filename = f"sentiment_analysis_report_{timestamp}.txt"
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))
    
    print(f"Analysis report saved as: {report_filename}")
    
    # Print report
    print('\n'.join(report))
    
    return report_filename

# Execute final analysis if model is available
if 'model' in locals() and 'evaluation_results' in locals() and 'cv_scores' in locals():
    print("Creating final analysis and saving results...")
    
    # Save model and results
    model_file, results_file = save_model_and_results(model, evaluation_results, cv_scores)
    
    # Create comprehensive report
    report_file = create_sentiment_analysis_report(df, model, evaluation_results, cv_scores)
    
    print(f"\n=== FILES CREATED ===")
    print(f"1. Model file: {model_file}")
    print(f"2. Results file: {results_file}")
    print(f"3. Analysis report: {report_file}")
    
    print(f"\n=== NEXT STEPS ===")
    print("1. Review the analysis report for insights")
    print("2. Use the saved model for future predictions")
    print("3. Consider collecting more data if accuracy is low")
    print("4. Experiment with different preprocessing techniques")
    print("5. Try hyperparameter tuning for better performance")
    
else:
    print("Please run all previous cells to train the model first.")