In [2]:
!pip install fireducks --quiet


In [3]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)

Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [4]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")

Combined data saved at: /kaggle/working/combined_processed_data.csv


In [5]:
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack truly like soundtrack enj...  
4  remember pull Jaw Floor hear play g

In [6]:
# Check label distribution
print(combined_data['label'].value_counts())

label
2    1800000
1    1800000
Name: count, dtype: int64


In [7]:
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (90% training, 10% testing) with a reduced test size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

# Use HashingVectorizer for incremental vectorization
hash_vectorizer = HashingVectorizer(n_features=5000, alternate_sign=False, ngram_range=(1, 2))

# Batch processing function for HashingVectorizer
def batch_hash_transform(vectorizer, data, batch_size=100000):
    """Batch process large datasets for HashingVectorizer transformation."""
    batches = []
    total = len(data)
    for i in range(0, total, batch_size):
        print(f"Processing batch {i // batch_size + 1} / {total // batch_size + 1}")
        batch = data[i: i + batch_size].values.astype('U')  # Convert to Unicode
        batches.append(vectorizer.transform(batch))
    return vstack(batches)

# Transform training data in batches
print("Starting HashingVectorizer transformation on training data...")
X_train_tfidf = batch_hash_transform(hash_vectorizer, X_train, batch_size=50000)

# Transform testing data in batches
print("Starting HashingVectorizer transformation on testing data...")
X_test_tfidf = batch_hash_transform(hash_vectorizer, X_test, batch_size=50000)

print("HashingVectorizer transformation complete.")
print(f"Training Hashing shape: {X_train_tfidf.shape}")
print(f"Testing Hashing shape: {X_test_tfidf.shape}")

Training samples: 3240000
Testing samples: 360000
Starting HashingVectorizer transformation on training data...
Processing batch 1 / 65
Processing batch 2 / 65
Processing batch 3 / 65
Processing batch 4 / 65
Processing batch 5 / 65
Processing batch 6 / 65
Processing batch 7 / 65
Processing batch 8 / 65
Processing batch 9 / 65
Processing batch 10 / 65
Processing batch 11 / 65
Processing batch 12 / 65
Processing batch 13 / 65
Processing batch 14 / 65
Processing batch 15 / 65
Processing batch 16 / 65
Processing batch 17 / 65
Processing batch 18 / 65
Processing batch 19 / 65
Processing batch 20 / 65
Processing batch 21 / 65
Processing batch 22 / 65
Processing batch 23 / 65
Processing batch 24 / 65
Processing batch 25 / 65
Processing batch 26 / 65
Processing batch 27 / 65
Processing batch 28 / 65
Processing batch 29 / 65
Processing batch 30 / 65
Processing batch 31 / 65
Processing batch 32 / 65
Processing batch 33 / 65
Processing batch 34 / 65
Processing batch 35 / 65
Processing batch 36 / 

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Define all models (using GPU where applicable)
models = [
    {'name': 'Naive Bayes', 'classifier': MultinomialNB(alpha=1.0)},  # CPU only
    {'name': 'SVM', 'classifier': LinearSVC(C=1.0, max_iter=5000)},   # CPU only
    {'name': 'XGBoost', 'classifier': xgb.XGBClassifier(
        use_label_encoder=False, 
        eval_metric='logloss',
        device='cuda'  # Enable GPU
    )},
    {'name': 'LightGBM', 'classifier': lgb.LGBMClassifier(
        device='gpu',  # Enable GPU
        gpu_platform_id=0,  # Default platform
        gpu_device_id=0     # Default device
    )},
    {'name': 'Random Forest', 'classifier': RandomForestClassifier(n_estimators=100)},  # CPU only
    {'name': 'Logistic Regression', 'classifier': LogisticRegression(max_iter=1000)},  # CPU only
    {'name': 'AdaBoost', 'classifier': AdaBoostClassifier()},  # CPU only
    {'name': 'MLP Classifier', 'classifier': MLPClassifier(max_iter=500), 'needs_dense': True}  # CPU only
]

# Train and evaluate each model
results = {}
for model_info in models:
    name = model_info['name']
    classifier = model_info['classifier']
    needs_dense = model_info.get('needs_dense', False)
    
    print(f"\nTraining {name}...")
    # Adjust labels for training (convert [1, 2] to [0, 1])
    y_train_adjusted = y_train - 1
    y_test_adjusted = y_test - 1
    
    if needs_dense:
        classifier.fit(X_train_tfidf.toarray(), y_train_adjusted)
        y_pred_adjusted = classifier.predict(X_test_tfidf.toarray())
    else:
        classifier.fit(X_train_tfidf, y_train_adjusted)
        y_pred_adjusted = classifier.predict(X_test_tfidf)
    
    # Convert predictions back to original labels [1, 2]
    y_pred = y_pred_adjusted + 1
    
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    results[name] = {'y_pred': y_pred, 'accuracy': accuracy}


Training Naive Bayes...
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           1       0.81      0.79      0.80    180086
           2       0.80      0.82      0.81    179914

    accuracy                           0.81    360000
   macro avg       0.81      0.81      0.81    360000
weighted avg       0.81      0.81      0.81    360000

Naive Bayes Accuracy: 0.8057

Training SVM...
SVM Classification Report:
              precision    recall  f1-score   support

           1       0.83      0.83      0.83    180086
           2       0.83      0.83      0.83    179914

    accuracy                           0.83    360000
   macro avg       0.83      0.83      0.83    360000
weighted avg       0.83      0.83      0.83    360000

SVM Accuracy: 0.8331

Training XGBoost...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




XGBoost Classification Report:
              precision    recall  f1-score   support

           1       0.82      0.83      0.82    180086
           2       0.82      0.81      0.82    179914

    accuracy                           0.82    360000
   macro avg       0.82      0.82      0.82    360000
weighted avg       0.82      0.82      0.82    360000

XGBoost Accuracy: 0.8199

Training LightGBM...
[LightGBM] [Info] Number of positive: 1620086, number of negative: 1619914
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 785756
[LightGBM] [Info] Number of data points in the train set: 3240000, number of used features: 5000
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500027 -> initscore=0.000106
[LightGBM] [Info] Start t

In [None]:
import joblib

# Example: Save the SVM model (adjust based on Cell-7 or Cell-12 results)
best_model = LinearSVC(C=1.0, max_iter=5000)
best_model.fit(X_train_tfidf, y_train)  # Original labels [1, 2]
joblib.dump(best_model, "best_model.pkl")
print("Best model saved as 'best_model.pkl'")

In [None]:
# Load the saved model
loaded_model = joblib.load("best_model.pkl")

# Predict on test data (no adjustment needed if model was trained on [1, 2])
new_predictions = loaded_model.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report for Best Model:")
print(classification_report(y_test, new_predictions))

accuracy = accuracy_score(y_test, new_predictions)
print(f"Accuracy of Best Model: {accuracy:.4f}")

In [None]:
import spacy

# Load Spacy model for preprocessing (optional, not used in current pipeline)
nlp = spacy.load("en_core_web_sm")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import vstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Define all models (same as Cell-7, with GPU for XGBoost and LightGBM)
models = [
    {'name': 'Naive Bayes', 'classifier': MultinomialNB(alpha=1.0)},
    {'name': 'SVM', 'classifier': LinearSVC(C=1.0, max_iter=5000)},
    {'name': 'XGBoost', 'classifier': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', device='cuda')},
    {'name': 'LightGBM', 'classifier': lgb.LGBMClassifier(device='gpu', gpu_platform_id=0, gpu_device_id=0)},
    {'name': 'Random Forest', 'classifier': RandomForestClassifier(n_estimators=100)},
    {'name': 'Logistic Regression', 'classifier': LogisticRegression(max_iter=1000)},
    {'name': 'AdaBoost', 'classifier': AdaBoostClassifier()},
    {'name': 'MLP Classifier', 'classifier': MLPClassifier(max_iter=500), 'needs_dense': True}
]

# Number of runs for statistical analysis
n_runs = 10
accuracies = {model['name']: [] for model in models}

for run in range(n_runs):
    print(f"\nRun {run + 1}/{n_runs}")
    # New random split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=run)
    
    # Transform data
    X_train_tfidf = batch_hash_transform(hash_vectorizer, X_train, batch_size=50000)
    X_test_tfidf = batch_hash_transform(hash_vectorizer, X_test, batch_size=50000)
    
    # Adjust labels for training (convert [1, 2] to [0, 1])
    y_train_adjusted = y_train - 1
    y_test_adjusted = y_test - 1
    
    # Train and evaluate each model
    for model_info in models:
        name = model_info['name']
        classifier = model_info['classifier'].__class__()  # New instance
        if name in ['XGBoost']:
            classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', device='cuda')
        elif name in ['LightGBM']:
            classifier = lgb.LGBMClassifier(device='gpu', gpu_platform_id=0, gpu_device_id=0)
        needs_dense = model_info.get('needs_dense', False)
        
        if needs_dense:
            classifier.fit(X_train_tfidf.toarray(), y_train_adjusted)
            y_pred_adjusted = classifier.predict(X_test_tfidf.toarray())
        else:
            classifier.fit(X_train_tfidf, y_train_adjusted)
            y_pred_adjusted = classifier.predict(X_test_tfidf)
        
        # Convert predictions back to [1, 2]
        y_pred = y_pred_adjusted + 1
        accuracy = accuracy_score(y_test, y_pred)
        accuracies[name].append(accuracy)
        print(f"{name} Accuracy (Run {run + 1}): {accuracy:.4f}")

# Compute mean and standard deviation
for name in accuracies:
    mean_acc = np.mean(accuracies[name])
    std_acc = np.std(accuracies[name])
    print(f"\n{name}: Mean Accuracy = {mean_acc:.4f}, Std Dev = {std_acc:.4f}")

In [None]:
# Calculate 95% confidence intervals
confidence_intervals = {}
for name in accuracies:
    mean_acc = np.mean(accuracies[name])
    std_acc = np.std(accuracies[name])
    se = std_acc / np.sqrt(n_runs)  # Standard error
    ci_lower = mean_acc - 1.96 * se
    ci_upper = mean_acc + 1.96 * se
    confidence_intervals[name] = (ci_lower, ci_upper)
    print(f"{name}: 95% CI = [{ci_lower:.4f}, {ci_upper:.4f}]")

In [None]:
from scipy.stats import wilcoxon

# Perform Wilcoxon test for each pair of models
model_names = list(accuracies.keys())
for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):
        model1, model2 = model_names[i], model_names[j]
        stat, p_value = wilcoxon(accuracies[model1], accuracies[model2])
        print(f"\nWilcoxon Test ({model1} vs {model2}):")
        print(f"Statistic = {stat}, p-value = {p_value:.4f}")
        if p_value < 0.05:
            print(f"Significant difference between {model1} and {model2}")
        else:
            print(f"No significant difference between {model1} and {model2}")

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.boxplot(accuracies.values(), labels=accuracies.keys())
plt.title("Performance Distribution Across Models")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
means = [np.mean(accuracies[name]) for name in accuracies]
ci_lowers = [confidence_intervals[name][0] for name in accuracies]
ci_uppers = [confidence_intervals[name][1] for name in accuracies]
yerr = [(m - l, u - m) for m, l, u in zip(means, ci_lowers, ci_uppers)]

plt.bar(accuracies.keys(), means, yerr=np.array(yerr).T, capsize=5, color='skyblue')
plt.title("Mean Accuracy with 95% Confidence Intervals")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Compute p-values matrix
p_values = np.ones((len(model_names), len(model_names)))
for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):
        _, p_value = wilcoxon(accuracies[model_names[i]], accuracies[model_names[j]])
        p_values[i, j] = p_value
        p_values[j, i] = p_value

plt.figure(figsize=(10, 8))
sns.heatmap(p_values, annot=True, fmt=".4f", cmap="YlGnBu", xticklabels=model_names, yticklabels=model_names)
plt.title("Pairwise Wilcoxon Test p-values")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Store per-class metrics
metrics = {model['name']: {'precision_1': [], 'recall_1': [], 'f1_1': [], 'precision_2': [], 'recall_2': [], 'f1_2': []} for model in models}

for run in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=run)
    X_train_tfidf = batch_hash_transform(hash_vectorizer, X_train, batch_size=50000)
    X_test_tfidf = batch_hash_transform(hash_vectorizer, X_test, batch_size=50000)
    y_train_adjusted = y_train - 1
    y_test_adjusted = y_test - 1
    
    for model_info in models:
        name = model_info['name']
        classifier = model_info['classifier'].__class__()
        if name == 'XGBoost':
            classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', device='cuda')
        elif name == 'LightGBM':
            classifier = lgb.LGBMClassifier(device='gpu', gpu_platform_id=0, gpu_device_id=0)
        needs_dense = model_info.get('needs_dense', False)
        
        if needs_dense:
            classifier.fit(X_train_tfidf.toarray(), y_train_adjusted)
            y_pred_adjusted = classifier.predict(X_test_tfidf.toarray())
        else:
            classifier.fit(X_train_tfidf, y_train_adjusted)
            y_pred_adjusted = classifier.predict(X_test_tfidf)
        
        y_pred = y_pred_adjusted + 1
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[1, 2])
        metrics[name]['precision_1'].append(precision[0])
        metrics[name]['recall_1'].append(recall[0])
        metrics[name]['f1_1'].append(f1[0])
        metrics[name]['precision_2'].append(precision[1])
        metrics[name]['recall_2'].append(recall[1])
        metrics[name]['f1_2'].append(f1[1])

# Summarize and print
for name in metrics:
    print(f"\n{name} Per-Class Metrics (Mean ± Std):")
    print(f"Class 1 - Precision: {np.mean(metrics[name]['precision_1']):.4f} ± {np.std(metrics[name]['precision_1']):.4f}")
    print(f"Class 1 - Recall: {np.mean(metrics[name]['recall_1']):.4f} ± {np.std(metrics[name]['recall_1']):.4f}")
    print(f"Class 1 - F1: {np.mean(metrics[name]['f1_1']):.4f} ± {np.std(metrics[name]['f1_1']):.4f}")
    print(f"Class 2 - Precision: {np.mean(metrics[name]['precision_2']):.4f} ± {np.std(metrics[name]['precision_2']):.4f}")
    print(f"Class 2 - Recall: {np.mean(metrics[name]['recall_2']):.4f} ± {np.std(metrics[name]['recall_2']):.4f}")
    print(f"Class 2 - F1: {np.mean(metrics[name]['f1_2']):.4f} ± {np.std(metrics[name]['f1_2']):.4f}")

In [None]:
plt.figure(figsize=(14, 6))
bar_width = 0.35
index = np.arange(len(models))

f1_class1 = [np.mean(metrics[name]['f1_1']) for name in metrics]
f1_class2 = [np.mean(metrics[name]['f1_2']) for name in metrics]

plt.bar(index, f1_class1, bar_width, label='Class 1 (Neutral)', color='blue')
plt.bar(index + bar_width, f1_class2, bar_width, label='Class 2 (Positive)', color='orange')

plt.xlabel('Model')
plt.ylabel('Mean F1 Score')
plt.title('Per-Class F1 Score Across Models')
plt.xticks(index + bar_width / 2, [model['name'] for model in models], rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()