In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [2]:

pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [3]:
from textblob import TextBlob
# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nihan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nihan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nihan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Load the dataset
df = pd.read_csv('Dataset/filtered_reviews.csv')

df.head(5)

Unnamed: 0,review_content,final_label
0,Mark the spy,4
1,Title,4
2,My account says its been temporarily locked. ...,3
3,Put on your tin foil hats to deflect the mind ...,1
4,Im so sick and tired of us working our tails ...,4


In [7]:
df['final_label'].value_counts()

final_label
4    2846
2    2428
1     567
3     434
Name: count, dtype: int64

In [8]:
pip install nlpaug


Note: you may need to restart the kernel to use updated packages.


In [9]:
import re

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation using regular expressions
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Tokenization using TextBlob
    tokens = TextBlob(text).words
    
    # Stemming using NLTK PorterStemmer
    porter_stemmer = PorterStemmer()
    tokens = [porter_stemmer.stem(word) for word in tokens]
    
    # Lemmatization using NLTK WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)



df['processed_review_content'] = df['review_content'].apply(preprocess_text)

In [10]:
df.to_csv('cleaned_reviews.csv', index=False)

In [11]:
import nlpaug.augmenter.word as naw

# Instantiate augmentation techniques
augmenter = naw.SynonymAug(aug_src='wordnet')




In [14]:
# Augment data for minority classes
augmented_data = []
for label in [1, 3]:
    minority_data = df[df['final_label'] == label]['processed_review_content']
    for text in minority_data:
        augmented_text = augmenter.augment(text)
        augmented_data.append({'processed_review_content': augmented_text, 'final_label': label})



In [15]:
# Combine original and augmented data
balanced_data = pd.concat([df, pd.DataFrame(augmented_data)])



In [16]:
# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

# Check the class distribution
print(balanced_data['final_label'].value_counts())

# Now, use balanced_data for training your model

final_label
4    2846
2    2428
1    1134
3     868
Name: count, dtype: int64


In [12]:
import nlpaug.augmenter.word as naw

# Instantiate augmentation techniques
augmenter = naw.SynonymAug(aug_src='wordnet')

# Augment data for minority classes
augmented_data = []
for label in [1, 3]:
    minority_data = df[df['final_label'] == label]['processed_review_content']
    # Determine the augmentation factor based on the ratio of minority to majority class
    augmentation_factor = int((df['final_label'].value_counts().max() / len(minority_data)) - 1)
    for text in minority_data:
        for _ in range(augmentation_factor):
            augmented_text = augmenter.augment(text)
            augmented_data.append({'processed_review_content': augmented_text, 'final_label': label})

# Combine original and augmented data
balanced_data = pd.concat([df, pd.DataFrame(augmented_data)])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

# Check the class distribution
print(balanced_data['final_label'].value_counts())

# Now, use balanced_data for training your model


final_label
4    2846
1    2835
3    2604
2    2428
Name: count, dtype: int64


In [13]:
balanced_data.to_csv('balanced_data_with_value_counts.csv', index=False)


In [14]:
balanced_data.head()

Unnamed: 0,review_content,final_label,processed_review_content
0,,3,[open facebook account earli 2000 instantli fe...
1,,1,[say your gon sodium get facebook first time y...
2,"When they updated this app, suddenly it beca...",4,updat app suddenli becam difficult access phot...
3,(?????????)Imagine this: you want to use Twitt...,4,imagin want use twitter youd rather data colle...
4,,1,[woke facebook woke much detector trail spi li...


In [6]:
df2 = pd.read_csv('Dataset/balanced_data_with_value_counts.csv')

In [2]:
pip install wordcloud


Collecting wordcloud
  Downloading wordcloud-1.9.3-cp310-cp310-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.3-cp310-cp310-win_amd64.whl (299 kB)
   ---------------------------------------- 0.0/300.0 kB ? eta -:--:--
   --- ----------------------------------- 30.7/300.0 kB 640.0 kB/s eta 0:00:01
   ------------ --------------------------- 92.2/300.0 kB 1.1 MB/s eta 0:00:01
   ------------------- -------------------- 143.4/300.0 kB 1.2 MB/s eta 0:00:01
   -------------------------------- ------- 245.8/300.0 kB 1.5 MB/s eta 0:00:01
   ------------------------------------ --- 276.5/300.0 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 300.0/300.0 kB 1.4 MB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Note: you may need to restart the kernel to use updated packages.


In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define SVM model
svm = SVC(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model on the TF-IDF transformed training set with the best hyperparameters
best_svm = SVC(**best_params, random_state=42)
best_svm.fit(X_train_tfidf, y_train)

# Evaluate the model on the TF-IDF transformed validation set
val_preds_svm = best_svm.predict(X_val_tfidf)
val_accuracy_svm = accuracy_score(y_val, val_preds_svm)
print("Validation Accuracy (SVM):", val_accuracy_svm)
print("Classification Report (SVM - Validation):\n", classification_report(y_val, val_preds_svm))

# Evaluate the model on the TF-IDF transformed test set
test_preds_svm = best_svm.predict(X_test_tfidf)
test_accuracy_svm = accuracy_score(y_test, test_preds_svm)
print("Test Accuracy (SVM):", test_accuracy_svm)
print("Classification Report (SVM - Test):\n", classification_report(y_test, test_preds_svm))


Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Validation Accuracy (SVM): 0.749708284714119
Classification Report (SVM - Validation):
               precision    recall  f1-score   support

           1       0.93      0.93      0.93       455
           2       0.54      0.46      0.49       402
           3       0.95      0.96      0.95       405
           4       0.58      0.64      0.61       452

    accuracy                           0.75      1714
   macro avg       0.75      0.75      0.75      1714
weighted avg       0.75      0.75      0.75      1714

Test Accuracy (SVM): 0.7634157722818479
Classification Report (SVM - Test):
               precision    recall  f1-score   support

           1       0.92      0.93      0.92       579
           2       0.58      0.48      0.52       505
           3       0.95      0.97      0.96       506
           4       0.59      0.66      0.62       553

    accuracy                           0.76      2143
   macr

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Define Naive Bayes model
nb = MultinomialNB()

# Train the model on the TF-IDF transformed training set
nb.fit(X_train_tfidf, y_train)

# Evaluate the model on the TF-IDF transformed validation set
val_preds_nb = nb.predict(X_val_tfidf)
val_accuracy_nb = accuracy_score(y_val, val_preds_nb)
print("Validation Accuracy (Naive Bayes):", val_accuracy_nb)
print("Classification Report (Naive Bayes - Validation):\n", classification_report(y_val, val_preds_nb))

# Evaluate the model on the TF-IDF transformed test set
test_preds_nb = nb.predict(X_test_tfidf)
test_accuracy_nb = accuracy_score(y_test, test_preds_nb)
print("Test Accuracy (Naive Bayes):", test_accuracy_nb)
print("Classification Report (Naive Bayes - Test):\n", classification_report(y_test, test_preds_nb))


Validation Accuracy (Naive Bayes): 0.5915985997666278
Classification Report (Naive Bayes - Validation):
               precision    recall  f1-score   support

           1       0.76      0.67      0.71       455
           2       0.48      0.24      0.32       402
           3       0.77      0.71      0.74       405
           4       0.44      0.71      0.54       452

    accuracy                           0.59      1714
   macro avg       0.61      0.58      0.58      1714
weighted avg       0.61      0.59      0.58      1714

Test Accuracy (Naive Bayes): 0.5725618292113859
Classification Report (Naive Bayes - Test):
               precision    recall  f1-score   support

           1       0.72      0.63      0.67       579
           2       0.49      0.26      0.34       505
           3       0.72      0.67      0.70       506
           4       0.43      0.71      0.54       553

    accuracy                           0.57      2143
   macro avg       0.59      0.57      0.

In [56]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Ensure your class labels start from 0
y_train_adjusted = y_train - 1
y_val_adjusted = y_val - 1
y_test_adjusted = y_test - 1

# Define hyperparameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Perform Grid Search CV
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_xgb.fit(X_train_tfidf, y_train_adjusted)

# Best hyperparameters
print("Best hyperparameters for XGBoost:", grid_search_xgb.best_params_)

# Evaluate on the validation set
val_preds_xgb = grid_search_xgb.predict(X_val_tfidf)
val_accuracy_xgb = accuracy_score(y_val_adjusted, val_preds_xgb)
print("Validation Accuracy (XGBoost):", val_accuracy_xgb)
print("Validation Classification Report (XGBoost):")
print(classification_report(y_val_adjusted, val_preds_xgb, target_names=['class 0', 'class 1', 'class 2', 'class 3']))

# Evaluate on the test set
test_preds_xgb = grid_search_xgb.predict(X_test_tfidf)
test_accuracy_xgb = accuracy_score(y_test_adjusted, test_preds_xgb)
print("Test Accuracy (XGBoost):", test_accuracy_xgb)
print("Test Classification Report (XGBoost):")
print(classification_report(y_test_adjusted, test_preds_xgb, target_names=['class 0', 'class 1', 'class 2', 'class 3']))


Fitting 3 folds for each of 108 candidates, totalling 324 fits


KeyboardInterrupt: 

In [83]:
pip install transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.5-cp310-cp310-win_amd64.whl.metadata (7.7 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.26.2 which is incompatible.


In [85]:
pip install --upgrade datasets transformers


^C
Note: you may need to restart the kernel to use updated packages.


In [9]:
X = balanced_data['processed_review_content']
y = balanced_data['final_label']

data = pd.DataFrame({'review': X, 'label': y})

# Check the dataset
data.head()


Unnamed: 0,review,label
0,[supplication least give u way app transpositi...,1
1,[facebook sell information freak disabl track ...,1
2,[could give zero star would. four month ago ac...,1
3,[account pay back hack long ago itÂ  ridicul ...,3
4,[mark z censor conserv sell data],3


In [10]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [12]:
from datasets import Dataset
from transformers import RobertaTokenizer

# Convert non-string values in the 'review' column to strings
train_df['review'] = train_df['review'].astype(str)
val_df['review'] = val_df['review'].astype(str)
test_df['review'] = test_df['review'].astype(str)

# Convert pandas dataframe to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['review'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)




vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/7499 [00:00<?, ? examples/s]

Map:   0%|          | 0/1607 [00:00<?, ? examples/s]

Map:   0%|          | 0/1607 [00:00<?, ? examples/s]

In [13]:
train_dataset = train_dataset.rename_column('label', 'labels')
val_dataset = val_dataset.rename_column('label', 'labels')
test_dataset = test_dataset.rename_column('label', 'labels')

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [27]:
pip install transformers[torch] accelerate -U


Collecting transformers[torch]
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers[torch])
  Using cached tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers[torch])
  Using cached safetensors-0.4.3-cp310-none-win_amd64.whl.metadata (3.9 kB)
Downloading safetensors-0.4.3-cp310-none-win_amd64.whl (287 kB)
   ---------------------------------------- 0.0/287.4 kB ? eta -:--:--
   ---- ----------------------------------- 30.7/287.4 kB ? eta -:--:--
   ---- ----------------------------------- 30.7/287.4 kB ? eta -:--:--
   ---- ----------------------------------- 30.7/287.4 kB ? eta -:--:--
   ------------ -------------------------- 92.2/287.4 kB 581.0 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/287.4 kB 656.4 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/287.4 kB 656.4 kB/s eta 0:00:01
   ------------------------------- ----

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy-transformers 1.2.5 requires transformers<4.31.0,>=3.4.0, but you have transformers 4.41.2 which is incompatible.


In [1]:
import transformers
import accelerate

print(transformers.__version__)
print(accelerate.__version__)


4.41.2
0.31.0
