In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [None]:
import nltk
from textblob import TextBlob

# Make the NLTK data packages below available locally
# (nothing is re-fetched if a package is already downloaded).
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

In [None]:
# Load the labeled dataset; the CSV is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('Thesis Labeled Dataset.csv', encoding='ISO-8859-1')

# Preview the first rows. `head` has to be *called*: a bare `df.head` only
# displays the bound method's repr instead of the data.
df.head()

## Preprocessing Steps


In [None]:
import re
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the stop-word set, stemmer and lemmatizer once and reuse them.

    Creating these on every call is wasted work when the function is applied
    row by row; the cache defers construction until the first real call and
    then hands back the same objects.
    """
    return (
        frozenset(stopwords.words('english')),
        PorterStemmer(),
        WordNetLemmatizer(),
    )


def preprocess_text(text):
    """Normalise one review into a single space-separated string of tokens.

    Steps, in order: lowercase; strip every character that is neither a word
    character nor whitespace; drop NLTK English stop words; tokenize with
    TextBlob; Porter-stem each token; WordNet-lemmatize each stemmed token.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and float NaN (e.g. a missing cell) are
        treated as an empty review; any other non-string value is converted
        with ``str()`` before processing.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` for missing
        input.
    """
    # Missing values would otherwise crash on `.lower()`.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    stop_words, porter_stemmer, lemmatizer = _text_resources()

    # Lowercase
    text = text.lower()

    # Remove punctuation using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    # Remove stop words
    # NOTE(review): punctuation is stripped first, so stop-word entries that
    # contain an apostrophe (e.g. "don't") can no longer match their stripped
    # form ("dont") -- confirm whether that is intended.
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # Tokenization using TextBlob
    tokens = TextBlob(text).words

    # Stemming using NLTK PorterStemmer
    tokens = [porter_stemmer.stem(word) for word in tokens]

    # Lemmatization using NLTK WordNetLemmatizer (applied to the stems)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)



# Run preprocess_text over every value of 'review_content' and store the
# results in 'processed_review_content'; the source column is not modified.
review_text = df['review_content']
df['processed_review_content'] = review_text.apply(preprocess_text)

## Data Augmentation Using NLPaug

In [None]:
import nlpaug.augmenter.word as naw

# Instantiate the augmentation technique: WordNet-based synonym replacement.
augmenter = naw.SynonymAug(aug_src='wordnet')

# Size of the largest class. It does not change inside the loop, so it is
# computed once here instead of once per label.
majority_count = df['final_label'].value_counts().max()

# NOTE(review): augmentation runs on the full dataset. If a train/test split
# happens later, augmented variants of test rows can end up in the training
# set -- confirm the split order.
# NOTE(review): the input is the already stemmed text; stems that are not
# dictionary words presumably get no synonym replacement -- verify.

# Augment data for minority classes
augmented_data = []
for label in [1, 3]:
    minority_data = df[df['final_label'] == label]['processed_review_content']
    if minority_data.empty:
        # No rows carry this label: nothing to augment, and the ratio below
        # would divide by zero.
        continue
    # Number of synthetic copies per row, from the majority/minority ratio
    # (truncated towards zero, so the class may stay slightly under-sampled).
    augmentation_factor = int((majority_count / len(minority_data)) - 1)
    for text in minority_data:
        for _ in range(augmentation_factor):
            augmented_text = augmenter.augment(text)
            # Newer nlpaug releases return a list of strings rather than a
            # single string; unwrap it so the column holds plain text.
            if isinstance(augmented_text, list):
                augmented_text = augmented_text[0] if augmented_text else text
            augmented_data.append({'processed_review_content': augmented_text, 'final_label': label})

# Combine original and augmented data. Augmented rows only carry the two
# columns set above, so every other column of df is NaN for them.
balanced_data = pd.concat([df, pd.DataFrame(augmented_data)])

# Shuffle the dataset; the fixed seed makes the row order reproducible.
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution
print(balanced_data['final_label'].value_counts())




In [1]:
# Replace balanced_data with the contents of a CSV on disk (presumably a saved
# copy of the balanced dataset -- confirm).
# NOTE(review): the recorded run of this cell failed with a NameError because
# `pd` was undefined; the imports cell must run first on a fresh kernel.
balanced_csv_path = '../Dataset/balanced_data_with_value_counts.csv'
balanced_data = pd.read_csv(balanced_csv_path)


NameError: name 'pd' is not defined