# Sentiment Analysis Project

This notebook implements sentiment analysis on review data using multiple approaches:
1. Different text vectorization methods (TF-IDF, Word2Vec, BERT)
2. Multiple classification models with hyperparameter tuning
3. Performance evaluation using F1 score

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import nltk
import re
from nltk.corpus import stopwords

# Download required NLTK data
nltk.download('stopwords', quiet=True)

  from .autonotebook import tqdm as notebook_tqdm


True

## 2. Text Preprocessing Functions

In [2]:
def preprocess_text(text):
    """
    Preprocess text by:
    1. Converting to lowercase
    2. Removing punctuation
    3. Removing stop words
    """
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]

    return ' '.join(filtered_words)

## 3. Data Loading and Preprocessing

In [3]:
def load_and_preprocess_data(file_path):
    """
    Load CSV file and preprocess the data
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Clean and convert ratings to binary (positive/negative)
    def convert_rating(x):
        try:
            # Remove any non-numeric characters and convert to float
            cleaned_rating = ''.join(char for char in str(x) if char.isdigit() or char == '.')
            if cleaned_rating:
                return 1 if float(cleaned_rating) > 3 else 0
            return 0  # default to negative if no valid number found
        except:
            return 0  # default to negative for any conversion errors

    df['sentiment'] = df['rating'].apply(convert_rating)

    # Preprocess reviews
    df['processed_review'] = df['review'].apply(preprocess_text)

    return df

# Load and preprocess data
df = load_and_preprocess_data('train_sentiment.csv')
print("Data shape:", df.shape)
df.head()

Data shape: (1465, 5)


Unnamed: 0.1,Unnamed: 0,review,rating,sentiment,processed_review
0,0,Looks durable Charging is fine tooNo complains...,4.2,1,looks durable charging fine toono complainscha...
1,1,I ordered this cable to connect my phone to An...,4.0,1,ordered cable connect phone android auto car c...
2,2,"Not quite durable and sturdy,https://m.media-a...",3.9,1,quite durable sturdyhttpsmmediaamazoncomimages...
3,3,"Good product,long wire,Charges good,Nice,I bou...",4.2,1,good productlong wirecharges goodnicei bought ...
4,4,"Bought this instead of original apple, does th...",4.2,1,bought instead original apple work 150rs fast ...


## 4. Text Vectorization Methods

In [4]:
def vectorize_text(processed_reviews, vectorizer_type='tfidf'):
    """
    Vectorize preprocessed text using different methods:
    - TF-IDF: Term Frequency-Inverse Document Frequency
    - Word2Vec: Word embeddings using gensim
    - BERT: Sentence embeddings using sentence-transformers
    """
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=5000)
        X = vectorizer.fit_transform(processed_reviews)
        return X, vectorizer

    elif vectorizer_type == 'word2vec':
        # Convert text to list of words for Word2Vec
        sentences = [text.split() for text in processed_reviews]

        # Train Word2Vec model
        model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

        # Create document vectors by averaging word vectors
        X = np.array([
            np.mean([model.wv[word] for word in text.split() if word in model.wv]
                   or [np.zeros(100)], axis=0)
            for text in processed_reviews
        ])
        return X, model

    elif vectorizer_type == 'bert':
        # Load BERT model
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

        # Generate embeddings
        X = model.encode(processed_reviews.tolist(), show_progress_bar=True)
        return X, model

    else:
        raise ValueError("Invalid vectorization type. Choose 'tfidf', 'word2vec', or 'bert'")

## 5. Model Training and Evaluation Functions

In [5]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate multiple models with hyperparameter tuning
    """
    models = {
        'logistic': {
            'model': LogisticRegression(),
            'params': {
                'C': [0.1, 1, 10],
                'max_iter': [1000],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear']
            }
        },
        'random_forest': {
            'model': RandomForestClassifier(),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'knn': {
            'model': KNeighborsClassifier(),
            'params': {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean', 'manhattan']
            }
        }
    }

    results = {}

    for name, config in models.items():
        print(f"\nTraining {name}...")
        grid_search = GridSearchCV(
            config['model'],
            config['params'],
            cv=5,
            scoring='f1',
            n_jobs=-1
        )

        grid_search.fit(X_train, y_train)

        # Get best model
        best_model = grid_search.best_estimator_

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Calculate F1 score
        f1 = f1_score(y_test, y_pred)

        results[name] = {
            'model': best_model,
            'best_params': grid_search.best_params_,
            'f1_score': f1,
            'classification_report': classification_report(y_test, y_pred)
        }

        print(f"{name} - Best parameters: {grid_search.best_params_}")
        print(f"{name} - F1 Score: {f1:.4f}")

    return results

## 6. Run the Complete Analysis

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_review'],
    df['sentiment'],
    test_size=0.3,
    random_state=42
)

# Try different vectorization methods
vectorization_methods = ['tfidf', 'word2vec', 'bert']

for method in vectorization_methods:
    print(f"\nUsing {method} vectorization...")

    # Vectorize data
    X_train_vectorized, vectorizer = vectorize_text(X_train, method)

    if method == 'tfidf':
        X_test_vectorized = vectorizer.transform(X_test)
    else:
        # For word2vec and bert, we need to generate test vectors differently
        X_test_vectorized, _ = vectorize_text(X_test, method)

    # Train and evaluate models
    results = train_and_evaluate_models(X_train_vectorized, X_test_vectorized, y_train, y_test)

    # Print best performing model for this vectorization method
    best_model = max(results.items(), key=lambda x: x[1]['f1_score'])
    print(f"\nBest model for {method} vectorization:")
    print(f"Model: {best_model[0]}")
    print(f"F1 Score: {best_model[1]['f1_score']:.4f}")
    print("Classification Report:")
    print(best_model[1]['classification_report'])


Using tfidf vectorization...

Training logistic...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


logistic - Best parameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
logistic - F1 Score: 0.9966

Training random_forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


random_forest - Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
random_forest - F1 Score: 0.9966

Training knn...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


knn - Best parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
knn - F1 Score: 0.9966

Best model for tfidf vectorization:
Model: logistic
F1 Score: 0.9966
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.99      1.00      1.00       437

    accuracy                           0.99       440
   macro avg       0.50      0.50      0.50       440
weighted avg       0.99      0.99      0.99       440


Using word2vec vectorization...

Training logistic...
logistic - Best parameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
logistic - F1 Score: 0.9966

Training random_forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


random_forest - Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
random_forest - F1 Score: 0.9744

Training knn...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


knn - Best parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
knn - F1 Score: 0.9966

Best model for word2vec vectorization:
Model: logistic
F1 Score: 0.9966
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.99      1.00      1.00       437

    accuracy                           0.99       440
   macro avg       0.50      0.50      0.50       440
weighted avg       0.99      0.99      0.99       440


Using bert vectorization...


Batches: 100%|██████████| 33/33 [00:16<00:00,  1.98it/s]
Batches: 100%|██████████| 14/14 [00:06<00:00,  2.08it/s]



Training logistic...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


logistic - Best parameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
logistic - F1 Score: 0.9966

Training random_forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


random_forest - Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
random_forest - F1 Score: 0.9966

Training knn...
knn - Best parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
knn - F1 Score: 0.9966

Best model for bert vectorization:
Model: logistic
F1 Score: 0.9966
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.99      1.00      1.00       437

    accuracy                           0.99       440
   macro avg       0.50      0.50      0.50       440
weighted avg       0.99      0.99      0.99       440



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
