Import Libraries and Download NLTK Data

/**
** Author : Jayaprakash
**/

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download the NLTK resources used by the preprocessing step.
# quiet=True suppresses the downloader's progress output.
# 'stopwords' backs the stopwords.words('english') lookup in text_preprocess.
nltk.download('stopwords', quiet=True)
# presumably the corpus WordNetLemmatizer reads from — TODO confirm
nltk.download('wordnet', quiet=True)
# NOTE(review): looks like a companion resource for WordNet on some NLTK versions — confirm it is still needed
nltk.download('omw-1.4', quiet=True)

True

Define Text Preprocessing Function : 
This function cleans and preprocesses the text data by replacing non-alphabetic characters with spaces, converting to lowercase, removing English stopwords, dropping single-character tokens, and lemmatizing the remaining words.

In [22]:
def text_preprocess(ds: pd.Series) -> pd.Series:
    """Clean and normalise a Series of raw texts.

    Each entry goes through the following steps:
      1. every non-alphabetic character is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stopwords are removed;
      4. single-character tokens are dropped and the rest are lemmatized;
      5. the surviving tokens are re-joined with single spaces.

    Args:
        ds: Series of texts. Any index is accepted. Missing values are
            treated as empty text and other non-string values are converted
            with ``str()`` before cleaning.

    Returns:
        A new Series with the same index as ``ds`` holding the processed
        strings. ``ds`` itself is not modified, so the raw text stays
        available to the caller.
    """
    # Build these once: the stopword set and lemmatizer do not depend on the
    # row, and rebuilding the set for every word is needlessly slow.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text) -> str:
        # re.sub only accepts str/bytes, so normalise anything else first.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        # Keep only alphabetic characters and replace others with a space
        cleaned_text = re.sub('[^a-zA-Z]', ' ', text)
        # Lower-case, tokenise and drop stopwords
        words = [word for word in cleaned_text.lower().split() if word not in stop_words]
        # Lemmatize every token longer than one character and re-join
        return ' '.join(lemmatizer.lemmatize(word) for word in words if len(word) > 1)

    # map() is element-wise and index-preserving, so this works for any index
    # (the previous positional ds[i] lookup required labels 0..n-1).
    return ds.map(_clean)

Define Data Loading and Preprocessing Function : 
This function loads the dataset from a file, fills any NaN values, and applies the text preprocessing function.

In [23]:
def load_and_preprocess_data(file_path):
    """Load a tab-separated review file and add a cleaned-text column.

    Args:
        file_path: Path (or file-like object) passed to ``pd.read_csv``. The
            file is read without a header row; its two columns are named
            'reviews' and 'rating'.

    Returns:
        DataFrame with columns 'reviews' (raw text, left untouched),
        'rating', and 'processed_reviews' (output of ``text_preprocess``).
    """
    # Load the dataset from a tab-separated file and specify column names
    data = pd.read_csv(file_path, sep='\t', header=None, names=['reviews', 'rating'])

    # Fill missing values per column: a missing rating still defaults to 1,
    # but a missing review becomes empty text instead of the integer 1, which
    # the regex-based preprocessing cannot handle.
    data = data.fillna({'reviews': '', 'rating': 1})

    # Preprocess a copy so the raw 'reviews' column survives even if the
    # preprocessing step writes into the Series it receives.
    data['processed_reviews'] = text_preprocess(data['reviews'].copy())

    return data

Define Classifier Training and Evaluation Function : 
This function vectorizes the text data, scales the features, trains multiple classifiers, and evaluates their performance.

In [24]:
def train_and_evaluate_classifiers(X_train, X_test, y_train, y_test):
    """Fit five classifiers on bag-of-words features and report on each.

    Args:
        X_train: Training texts, passed to ``CountVectorizer.fit_transform``.
        X_test: Test texts, transformed with the vocabulary fitted on X_train.
        y_train: Training labels.
        y_test: Test labels used for the classification reports.

    Returns:
        Dict mapping classifier name to the dict produced by
        ``classification_report(..., output_dict=True)``, in the order
        Naive Bayes, SVM, KNN, Random Forest, Decision Tree.
    """
    # Token counts; the vocabulary is fitted on the training texts only.
    bow = CountVectorizer()
    train_counts = bow.fit_transform(X_train)
    test_counts = bow.transform(X_test)

    # Dense, scaled copies of the counts (scaling without centering).
    std = StandardScaler(with_mean=False)
    train_scaled = std.fit_transform(train_counts.toarray())
    test_scaled = std.transform(test_counts.toarray())

    scaled_features = (train_scaled, test_scaled)
    count_features = (train_counts, test_counts)

    # (name, estimator, features it is trained/evaluated on)
    # Naive Bayes, SVM and KNN get the scaled dense arrays; the tree-based
    # models get the raw count matrices.
    experiments = [
        ('Naive Bayes', GaussianNB(), scaled_features),
        ('SVM', SVC(kernel='rbf'), scaled_features),
        ('KNN', KNeighborsClassifier(n_neighbors=5), scaled_features),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42), count_features),
        ('Decision Tree', DecisionTreeClassifier(random_state=42), count_features),
    ]

    evaluation_results = {}
    for label, estimator, (fit_features, eval_features) in experiments:
        estimator.fit(fit_features, y_train)
        predictions = estimator.predict(eval_features)
        evaluation_results[label] = classification_report(y_test, predictions, output_dict=True)

    return evaluation_results


Define Dataset Processing Function : 
This function processes a single dataset by loading, preprocessing, splitting the data, and training/evaluating classifiers.

In [15]:
def process_dataset(file_path, dataset_name):
    """Run the load, split, train and report pipeline for one dataset.

    Args:
        file_path: Location of the dataset, forwarded to
            ``load_and_preprocess_data``.
        dataset_name: Label used in the printed progress and result headers.

    Returns:
        The per-classifier report dict returned by
        ``train_and_evaluate_classifiers``.
    """
    print(f"\nProcessing the {dataset_name} dataset...")

    frame = load_and_preprocess_data(file_path)

    # 80/20 train/test split with a fixed seed so repeated runs match.
    X_train, X_test, y_train, y_test = train_test_split(
        frame['processed_reviews'],
        frame['rating'],
        test_size=0.2,
        random_state=42,
    )

    results = train_and_evaluate_classifiers(X_train, X_test, y_train, y_test)

    # One summary per classifier: accuracy plus the macro-averaged metrics.
    print(f"\nEvaluation results for {dataset_name}:")
    for clf_name in results:
        report = results[clf_name]
        print(f"\nClassifier: {clf_name}")
        print(f"Accuracy: {report['accuracy']:.2f}")
        print(f"Macro Avg Precision: {report['macro avg']['precision']:.2f}")
        print(f"Macro Avg Recall: {report['macro avg']['recall']:.2f}")
        print(f"Macro Avg F1-Score: {report['macro avg']['f1-score']:.2f}")

    return results

Process All Datasets and Compare Results : 
This section processes all three datasets and stores each dataset's evaluation results in `all_results`; the cross-dataset comparison is done in the next section.

In [None]:
import warnings
# Suppress all warnings from here on so the printed reports stay readable.
# NOTE(review): this is a blanket filter with no category, so it also hides
# warnings that may point at real problems — consider narrowing it.
warnings.filterwarnings("ignore")

# (file path, display name) for each dataset; the paths are relative, so the
# files are looked up from the notebook's working directory.
datasets = [
    ('imdb_labelled.txt', 'IMDB'),
    ('amazon_cells_labelled.txt', 'Amazon'),
    ('yelp_labelled.txt', 'Yelp')
]

# Run the pipeline on each dataset in turn and keep the per-classifier
# reports keyed by dataset name. The dict is filled incrementally, so results
# from datasets that finished remain available if a later one fails.
all_results = {}
for file_path, dataset_name in datasets:
    all_results[dataset_name] = process_dataset(file_path, dataset_name)



Processing the IMDB dataset...

Evaluation results for IMDB:

Classifier: Naive Bayes
Accuracy: 0.65
Macro Avg Precision: 0.66
Macro Avg Recall: 0.64
Macro Avg F1-Score: 0.64

Classifier: SVM
Accuracy: 0.66
Macro Avg Precision: 0.72
Macro Avg Recall: 0.66
Macro Avg F1-Score: 0.63

Classifier: KNN
Accuracy: 0.55
Macro Avg Precision: 0.69
Macro Avg Recall: 0.56
Macro Avg F1-Score: 0.46

Classifier: Random Forest
Accuracy: 0.69
Macro Avg Precision: 0.72
Macro Avg Recall: 0.70
Macro Avg F1-Score: 0.69

Classifier: Decision Tree
Accuracy: 0.65
Macro Avg Precision: 0.66
Macro Avg Recall: 0.65
Macro Avg F1-Score: 0.65

Processing the Amazon dataset...

Evaluation results for Amazon:

Classifier: Naive Bayes
Accuracy: 0.69
Macro Avg Precision: 0.69
Macro Avg Recall: 0.69
Macro Avg F1-Score: 0.69

Classifier: SVM
Accuracy: 0.74
Macro Avg Precision: 0.75
Macro Avg Recall: 0.74
Macro Avg F1-Score: 0.74

Classifier: KNN
Accuracy: 0.70
Macro Avg Precision: 0.72
Macro Avg Recall: 0.69
Macro Avg F1-

Compare Performances across Datasets

In [19]:
# Print one section per classifier, with one line per dataset, so the same
# model can be compared across datasets at a glance.
print("\n\nComparison of classifier performance across datasets:")
# The classifier names are taken from the IMDB results.
for clf_name in all_results['IMDB']:
    print(f"\n{clf_name}:")
    for dataset_name, dataset_reports in all_results.items():
        report = dataset_reports[clf_name]
        accuracy = report['accuracy']
        macro = report['macro avg']
        precision, recall, f1_score = macro['precision'], macro['recall'], macro['f1-score']
        print(f"  {dataset_name:<10} - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1_score:.2f}")



Comparison of classifier performance across datasets:

Naive Bayes:
  IMDB       - Accuracy: 0.65, Precision: 0.66, Recall: 0.64, F1-score: 0.64
  Amazon     - Accuracy: 0.69, Precision: 0.69, Recall: 0.69, F1-score: 0.69
  Yelp       - Accuracy: 0.67, Precision: 0.68, Recall: 0.66, F1-score: 0.65

SVM:
  IMDB       - Accuracy: 0.66, Precision: 0.72, Recall: 0.66, F1-score: 0.63
  Amazon     - Accuracy: 0.74, Precision: 0.75, Recall: 0.74, F1-score: 0.74
  Yelp       - Accuracy: 0.69, Precision: 0.70, Recall: 0.68, F1-score: 0.68

KNN:
  IMDB       - Accuracy: 0.55, Precision: 0.69, Recall: 0.56, F1-score: 0.46
  Amazon     - Accuracy: 0.70, Precision: 0.72, Recall: 0.69, F1-score: 0.69
  Yelp       - Accuracy: 0.62, Precision: 0.63, Recall: 0.63, F1-score: 0.62

Random Forest:
  IMDB       - Accuracy: 0.69, Precision: 0.72, Recall: 0.70, F1-score: 0.69
  Amazon     - Accuracy: 0.79, Precision: 0.79, Recall: 0.79, F1-score: 0.78
  Yelp       - Accuracy: 0.70, Precision: 0.72, Recall: