In [None]:
# Imports
import pandas as pd
import re
import random
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder

from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
import nltk

from imblearn.over_sampling import SMOTE

# Model imports:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


## NAIVE BAYES

### Preprocessing and Augmentations

In [None]:
# Download NLTK resources
# 'punkt_tab' is the tokenizer data word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing and cleaning text
def clean_text(text):
    """Lower-case ``text``, strip punctuation, tokenize and drop English stop words.

    Args:
        text: Raw review text. A missing value (None/NaN) is treated as an
            empty review; any other non-string is converted with ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # pandas hands missing reviews over as NaN/None, which have no .lower()
        text = '' if pd.isna(text) else str(text)

    # Load the stop-word corpus once and reuse it; it was previously re-read
    # from disk for every single review.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = clean_text._stop_words = frozenset(stopwords.words('english'))

    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Synonym-based augmentation
def augmented_text(text, drop_prob=0.2, rng=None):
    """Build a noisy variant of ``text`` via synonym substitution and word dropout.

    Every whitespace-separated word that has a WordNet synset is replaced by
    the first lemma of its first synset; each word is then dropped with
    probability ``drop_prob``.

    Args:
        text: Input text.
        drop_prob: Probability of dropping a word. Defaults to 0.2, the value
            that used to be hard-coded.
        rng: Optional ``random.Random``-like object providing ``uniform``.
            Defaults to the module-level ``random`` generator, so
            ``random.seed`` keeps controlling the output.

    Returns:
        The surviving words joined by single spaces.
    """
    rng = random if rng is None else rng
    new_words = []
    for word in text.split():
        syns = wordnet.synsets(word)
        if syns:
            synonyms = syns[0].lemma_names()
            if synonyms:
                # WordNet spells multi-word lemmas with underscores ("ice_cream");
                # turn them back into ordinary words.
                word = synonyms[0].replace('_', ' ')
        if rng.uniform(0, 1) > drop_prob:
            new_words.append(word)
    return ' '.join(new_words)


data = pd.read_csv('reviews.csv')

# augmented_text draws from the module-level `random` generator; seed it so the
# augmented corpus (and every metric derived from it) is the same on each run.
random.seed(42)

# Generating cleaned and augmented versions
# Iterating the two columns directly avoids building a Series per row (iterrows).
augmented_data = [
    (clean_text(review), augmented_text(review), label)
    for review, label in zip(data['Review'], data['Label'])
]

# Converting rating to sentiment classes
def define_sentiment(rating):
    """Map a 1-5 star rating to a sentiment label.

    Accepts Python ints, numpy integers and floats holding a whole number
    (a ``Label`` column containing a missing value is read as float, so 4
    arrives as 4.0). Booleans, NaN, fractional values, non-numeric values and
    ratings outside 1..5 all map to 'unknown'.

    Args:
        rating: The raw rating value.

    Returns:
        'Pos' for 4-5, 'Neut' for 3, 'Neg' for 1-2, otherwise 'unknown'.
    """
    from numbers import Integral, Real

    # bool is a subclass of int; True must not be read as a 1-star rating.
    if isinstance(rating, bool) or not isinstance(rating, Real):
        return 'unknown'
    # Non-integral reals (NaN, inf, 3.5) are not valid ratings.
    if not isinstance(rating, Integral) and not float(rating).is_integer():
        return 'unknown'

    rating = int(rating)
    if not 1 <= rating <= 5:
        return 'unknown'
    if rating >= 4:
        return 'Pos'
    if rating == 3:
        return 'Neut'
    return 'Neg'




### Dataframe creation and conversions

In [None]:
# Creating DataFrame with processed text and sentiment labels
augmented_df = pd.DataFrame(augmented_data, columns=['cleaned_text', 'augmented_text', 'Label'])
augmented_df['sentiment'] = augmented_df['Label'].apply(define_sentiment)
# .copy() makes filtered_data an independent frame, so adding 'sentiment_class'
# below does not raise SettingWithCopyWarning or depend on view/copy semantics.
filtered_data = augmented_df[augmented_df['sentiment'] != 'unknown'].copy()

# Encoding
label_encoder = LabelEncoder()
filtered_data['sentiment_class'] = label_encoder.fit_transform(filtered_data['sentiment'])

# Vectorization
vectorizer_cleaned = CountVectorizer()
X_cleaned = vectorizer_cleaned.fit_transform(filtered_data['cleaned_text'])
vectorizer_augmented = CountVectorizer()
X_augmented = vectorizer_augmented.fit_transform(filtered_data['augmented_text'])


# Applying SMOTE to balance class distributions
# NOTE(review): oversampling (and vectorizer fitting) happens on the full data
# set, before any train/test split. Synthetic samples interpolated from rows
# that later land in the test set can end up in training, which inflates test
# scores. Consider splitting first and resampling only the training part.
smote_augmented = SMOTE(random_state=42)
X_augmented_resampled, y_augmented_resampled = smote_augmented.fit_resample(X_augmented, filtered_data['sentiment_class'])
smote_cleaned = SMOTE(random_state=42)
X_cleaned_resampled, y_cleaned_resampled = smote_cleaned.fit_resample(X_cleaned, filtered_data['sentiment_class'])





### Main Training/Testing BLOCK

In [None]:

# Train/Testing BLOCK
# Hold out 20% of each resampled matrix; the fixed seed keeps the split repeatable.
(
    X_train_cleaned,
    X_test_cleaned,
    y_train_cleaned,
    y_test_cleaned,
) = train_test_split(
    X_cleaned_resampled,
    y_cleaned_resampled,
    test_size=0.2,
    random_state=42,
)
(
    X_train_augmented,
    X_test_augmented,
    y_train_augmented,
    y_test_augmented,
) = train_test_split(
    X_augmented_resampled,
    y_augmented_resampled,
    test_size=0.2,
    random_state=42,
)

# Naive Bayes model
def train_evaluate_model_nb(X_train, y_train, X_test, y_test, text_type):
    """Fit a MultinomialNB model, print train / CV / test metrics and return it.

    Class names are taken from the module-level ``label_encoder`` so the
    report, the confusion matrix and the per-class figures all use the same
    label order as the encoded targets.

    Args:
        X_train: Training feature matrix.
        y_train: Encoded training labels.
        X_test: Test feature matrix.
        y_test: Encoded test labels (must support ``==`` masking, e.g. a
            pandas Series or numpy array).
        text_type: Name of the text variant; only used in the printed header.

    Returns:
        The fitted ``MultinomialNB`` classifier.
    """
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train, y_train)

    # Training
    accuracy_train = accuracy_score(y_train, nb_classifier.predict(X_train))

    # Cross-validation
    mean_accuracy_cv = cross_val_score(nb_classifier, X_train, y_train, cv=10).mean()

    # Testing
    y_pred_test = nb_classifier.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Derive names/ids from the encoder instead of hard-coding ['Neg', 'Neut', 'Pos'];
    # passing `labels` keeps the report well-formed even if a class is absent
    # from this particular test split.
    class_labels = [str(name) for name in label_encoder.classes_]
    class_ids = list(range(len(class_labels)))
    report_test = classification_report(y_test, y_pred_test, labels=class_ids, target_names=class_labels)
    conf_matrix_test = confusion_matrix(y_test, y_pred_test, labels=class_ids)

    # Accuracies (share of each true class that was predicted correctly)
    class_accuracies = {}
    for label in class_ids:
        mask = (y_test == label)
        if mask.any():
            class_accuracies[class_labels[label]] = accuracy_score(y_test[mask], y_pred_test[mask])
        else:
            # No test sample of this class: report NaN rather than scoring an empty set.
            class_accuracies[class_labels[label]] = float('nan')

    # Printing
    print(f"\nResults for {text_type} Text using Naive Bayes:")
    print(f'Training Accuracy: {accuracy_train}')
    print(f'Mean Cross Validation Accuracy: {mean_accuracy_cv}')
    print(f'Test Set Accuracy: {accuracy_test}')
    print("Class-wise Accuracies for Test Set:")
    for label, acc in class_accuracies.items():
        print(f"{label} Accuracy: {acc}")
    print("Classification Report for Test Set:")
    print(report_test)
    print("Confusion Matrix for Test Set:")
    print(conf_matrix_test)

    return nb_classifier




# Fit and report one Naive Bayes model per text variant.
nb_classifier_cleaned = train_evaluate_model_nb(
    X_train_cleaned,
    y_train_cleaned,
    X_test_cleaned,
    y_test_cleaned,
    "Cleaned",
)
nb_classifier_augmented = train_evaluate_model_nb(
    X_train_augmented,
    y_train_augmented,
    X_test_augmented,
    y_test_augmented,
    "Augmented",
)




### Plotting and Results

In [None]:

# Plotting
# Sentiment distribution
def plot_sentiment_distribution(data, title):
    """Show a bar chart of the value counts of ``data``, sorted by index.

    The bar colour is chosen from the title text: blue when it contains
    'Original', green when it contains 'Cleaned', orange otherwise.
    """
    if 'Original' in title:
        bar_color = 'blue'
    elif 'Cleaned' in title:
        bar_color = 'green'
    else:
        bar_color = 'orange'

    plt.figure(figsize=(8, 6))
    counts = data.value_counts().sort_index()
    counts.plot(kind='bar', color=bar_color)
    plt.title(title)
    plt.xlabel('Sentiment Class')
    plt.ylabel('Number of Sentences')
    plt.xticks(rotation=0)
    plt.show()

# Displaying predicted labels
def make_predictions(classifier, X_test, y_test, text_type):
    """Predict on ``X_test`` and show a bar chart of the predicted class counts.

    Encoded labels are mapped back to names with the module-level
    ``label_encoder``. Bars are green for the "Cleaned" variant and orange
    for anything else.
    """
    predicted_names = label_encoder.inverse_transform(classifier.predict(X_test))
    true_names = label_encoder.inverse_transform(y_test)
    predictions = pd.DataFrame({'True_Label': true_names, 'Predicted_Label': predicted_names})

    if text_type == "Cleaned":
        bar_color = 'green'
    else:
        bar_color = 'orange'

    plt.figure(figsize=(8, 6))
    predicted_counts = predictions['Predicted_Label'].value_counts().sort_index()
    predicted_counts.plot(kind='bar', color=bar_color)
    plt.title(f'Distribution of Predicted Sentiment Classes for {text_type} Text')
    plt.xlabel('Sentiment Class')
    plt.ylabel('Number of Sentences')
    plt.xticks(rotation=0)
    plt.show()



# ROC curve
def plot_manual_roc(classifier, X_test, y_test, text_type):
    """Plot one ROC curve per encoded class, treating that class as positive.

    Column ``i`` of ``predict_proba`` is used as the score for class id ``i``;
    class names for the legend come from the module-level ``label_encoder``.
    """
    y_score = classifier.predict_proba(X_test)

    # Compute every curve first, then draw them.
    curves = []
    for class_id in range(len(label_encoder.classes_)):
        false_pos, true_pos, _ = roc_curve(y_test, y_score[:, class_id], pos_label=class_id)
        curves.append((false_pos, true_pos, auc(false_pos, true_pos)))

    plt.figure(figsize=(8, 6))
    palette = ['blue', 'green', 'orange']
    for (false_pos, true_pos, area), class_name, color in zip(curves, label_encoder.classes_, palette):
        plt.plot(false_pos, true_pos, color=color, lw=2, label=f'ROC curve (area = {area:.2f}) for {class_name}')

    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {text_type} Text')
    plt.legend(loc='lower right')
    plt.show()


# Plotting/displaying calls
# Label distributions first, then predictions and ROC curves for both variants.
for _series, _title in (
    (data['Label'], 'Distribution of Sentences across Defined Sentiment Classes for Original Dataset'),
    (filtered_data['sentiment'], 'Distribution of Sentences across Sentiment Classes for Cleaned Text'),
    (augmented_df['sentiment'], 'Distribution of Sentences across Sentiment Classes for Augmented Text'),
):
    plot_sentiment_distribution(_series, _title)

_nb_runs = (
    (nb_classifier_cleaned, X_test_cleaned, y_test_cleaned, "Cleaned"),
    (nb_classifier_augmented, X_test_augmented, y_test_augmented, "Augmented"),
)

for _run in _nb_runs:
    make_predictions(*_run)

for _run in _nb_runs:
    plot_manual_roc(*_run)

## DECISION TREE

### Preprocessing and Augmentations

In [None]:
# Download NLTK resources
# 'punkt_tab' is the tokenizer data word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Cleaning text
def clean_text(text):
    """Lower-case ``text``, strip punctuation, tokenize and drop English stop words.

    Args:
        text: Raw review text. A missing value (None/NaN) is treated as an
            empty review; any other non-string is converted with ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # pandas hands missing reviews over as NaN/None, which have no .lower()
        text = '' if pd.isna(text) else str(text)

    # Load the stop-word corpus once and reuse it; it was previously re-read
    # from disk for every single review.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = clean_text._stop_words = frozenset(stopwords.words('english'))

    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Synonym-based augmentation
def augmented_text(text, drop_prob=0.2, rng=None):
    """Build a noisy variant of ``text`` via synonym substitution and word dropout.

    Every whitespace-separated word that has a WordNet synset is replaced by
    the first lemma of its first synset; each word is then dropped with
    probability ``drop_prob``.

    Args:
        text: Input text.
        drop_prob: Probability of dropping a word. Defaults to 0.2, the value
            that used to be hard-coded.
        rng: Optional ``random.Random``-like object providing ``uniform``.
            Defaults to the module-level ``random`` generator, so
            ``random.seed`` keeps controlling the output.

    Returns:
        The surviving words joined by single spaces.
    """
    rng = random if rng is None else rng
    new_words = []
    for word in text.split():
        syns = wordnet.synsets(word)
        if syns:
            synonyms = syns[0].lemma_names()
            if synonyms:
                # WordNet spells multi-word lemmas with underscores ("ice_cream");
                # turn them back into ordinary words.
                word = synonyms[0].replace('_', ' ')
        if rng.uniform(0, 1) > drop_prob:
            new_words.append(word)
    return ' '.join(new_words)


data = pd.read_csv('reviews.csv')

# augmented_text draws from the module-level `random` generator; seed it so the
# augmented corpus built by this cell is the same every time it runs, instead
# of depending on how many random numbers were consumed earlier in the session.
random.seed(42)

# Generating cleaned and augmented versions
# NOTE: this module-level set is not read by clean_text, which looks up its own
# stop words; it is kept in case other cells rely on the name.
stop_words = set(stopwords.words('english'))
# Iterating the two columns directly avoids building a Series per row (iterrows).
augmented_data = [
    (clean_text(review), augmented_text(review), label)
    for review, label in zip(data['Review'], data['Label'])
]

# Converting rating to sentiment classes
def define_sentiment(rating):
    """Map a 1-5 star rating to a sentiment label.

    Accepts Python ints, numpy integers and floats holding a whole number
    (a ``Label`` column containing a missing value is read as float, so 4
    arrives as 4.0). Booleans, NaN, fractional values, non-numeric values and
    ratings outside 1..5 all map to 'unknown'.

    Args:
        rating: The raw rating value.

    Returns:
        'Pos' for 4-5, 'Neut' for 3, 'Neg' for 1-2, otherwise 'unknown'.
    """
    from numbers import Integral, Real

    # bool is a subclass of int; True must not be read as a 1-star rating.
    if isinstance(rating, bool) or not isinstance(rating, Real):
        return 'unknown'
    # Non-integral reals (NaN, inf, 3.5) are not valid ratings.
    if not isinstance(rating, Integral) and not float(rating).is_integer():
        return 'unknown'

    rating = int(rating)
    if not 1 <= rating <= 5:
        return 'unknown'
    if rating >= 4:
        return 'Pos'
    if rating == 3:
        return 'Neut'
    return 'Neg'


### Dataframe creation and conversions

In [None]:

# Creating DataFrame with processed text and sentiment labels
augmented_df = pd.DataFrame(augmented_data, columns=['cleaned_text', 'augmented_text', 'Label'])
augmented_df['sentiment'] = augmented_df['Label'].apply(define_sentiment)
# .copy() makes filtered_data an independent frame, so adding 'sentiment_class'
# below does not raise SettingWithCopyWarning or depend on view/copy semantics.
filtered_data = augmented_df[augmented_df['sentiment'] != 'unknown'].copy()

# Encoding
label_encoder = LabelEncoder()
filtered_data['sentiment_class'] = label_encoder.fit_transform(filtered_data['sentiment'])

# Vectorization
vectorizer_cleaned = CountVectorizer()
X_cleaned = vectorizer_cleaned.fit_transform(filtered_data['cleaned_text'])
vectorizer_augmented = CountVectorizer()
X_augmented = vectorizer_augmented.fit_transform(filtered_data['augmented_text'])

# Applying SMOTE to balance class distributions
# NOTE(review): oversampling (and vectorizer fitting) happens on the full data
# set, before any train/test split. Synthetic samples interpolated from rows
# that later land in the test set can end up in training, which inflates test
# scores. Consider splitting first and resampling only the training part.
smote_augmented = SMOTE(random_state=42)
X_augmented_resampled, y_augmented_resampled = smote_augmented.fit_resample(X_augmented, filtered_data['sentiment_class'])
smote_cleaned = SMOTE(random_state=42)
X_cleaned_resampled, y_cleaned_resampled = smote_cleaned.fit_resample(X_cleaned, filtered_data['sentiment_class'])



### Main Training/Testing BLOCK

In [None]:

# Train/Testing BLOCK
# Hold out 20% of each resampled matrix; the fixed seed keeps the split repeatable.
(
    X_train_cleaned,
    X_test_cleaned,
    y_train_cleaned,
    y_test_cleaned,
) = train_test_split(
    X_cleaned_resampled,
    y_cleaned_resampled,
    test_size=0.2,
    random_state=42,
)
(
    X_train_augmented,
    X_test_augmented,
    y_train_augmented,
    y_test_augmented,
) = train_test_split(
    X_augmented_resampled,
    y_augmented_resampled,
    test_size=0.2,
    random_state=42,
)

# Decision Tree model
def train_evaluate_model_dt(X_train, y_train, X_test, y_test, text_type):
    """Fit a DecisionTreeClassifier, print train / CV / test metrics and return it.

    Class names are taken from the module-level ``label_encoder`` so the
    report, the confusion matrix and the per-class figures all use the same
    label order as the encoded targets.

    Args:
        X_train: Training feature matrix.
        y_train: Encoded training labels.
        X_test: Test feature matrix.
        y_test: Encoded test labels (must support ``==`` masking, e.g. a
            pandas Series or numpy array).
        text_type: Name of the text variant; only used in the printed header.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # A fixed random_state makes tie-breaking between equally good splits
    # repeatable, so the printed metrics do not change from run to run.
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_classifier.fit(X_train, y_train)

    # Training
    accuracy_train = accuracy_score(y_train, dt_classifier.predict(X_train))

    # Cross-validation
    mean_accuracy_cv = cross_val_score(dt_classifier, X_train, y_train, cv=10).mean()

    # Testing
    y_pred_test = dt_classifier.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Derive names/ids from the encoder instead of hard-coding ['Neg', 'Neut', 'Pos'];
    # passing `labels` keeps the report well-formed even if a class is absent
    # from this particular test split.
    class_labels = [str(name) for name in label_encoder.classes_]
    class_ids = list(range(len(class_labels)))
    report_test = classification_report(y_test, y_pred_test, labels=class_ids, target_names=class_labels)
    conf_matrix_test = confusion_matrix(y_test, y_pred_test, labels=class_ids)

    # Accuracies (share of each true class that was predicted correctly)
    class_accuracies = {}
    for label in class_ids:
        mask = (y_test == label)
        if mask.any():
            class_accuracies[class_labels[label]] = accuracy_score(y_test[mask], y_pred_test[mask])
        else:
            # No test sample of this class: report NaN rather than scoring an empty set.
            class_accuracies[class_labels[label]] = float('nan')

    # Printing
    print(f"\nResults for {text_type} Text using Decision Tree:")
    print(f'Training Accuracy: {accuracy_train}')
    print(f'Mean Cross Validation Accuracy: {mean_accuracy_cv}')
    print(f'Test Set Accuracy: {accuracy_test}')
    print("Class-wise Accuracies for Test Set:")
    for label, acc in class_accuracies.items():
        print(f"{label} Accuracy: {acc}")
    print("Classification Report for Test Set:")
    print(report_test)
    print("Confusion Matrix for Test Set:")
    print(conf_matrix_test)

    return dt_classifier

# Training Decision Tree models
# One model per text variant.
dt_classifier_cleaned = train_evaluate_model_dt(
    X_train_cleaned,
    y_train_cleaned,
    X_test_cleaned,
    y_test_cleaned,
    "Cleaned",
)
dt_classifier_augmented = train_evaluate_model_dt(
    X_train_augmented,
    y_train_augmented,
    X_test_augmented,
    y_test_augmented,
    "Augmented",
)


### Plotting and Results

In [None]:

# Plotting
# Sentiment distribution
def plot_sentiment_distribution(data, title):
    """Show a bar chart of the value counts of ``data``, sorted by index.

    The bar colour is chosen from the title text: blue when it contains
    'Original', green when it contains 'Cleaned', orange otherwise.
    """
    if 'Original' in title:
        bar_color = 'blue'
    elif 'Cleaned' in title:
        bar_color = 'green'
    else:
        bar_color = 'orange'

    plt.figure(figsize=(8, 6))
    counts = data.value_counts().sort_index()
    counts.plot(kind='bar', color=bar_color)
    plt.title(title)
    plt.xlabel('Sentiment Class')
    plt.ylabel('Number of Sentences')
    plt.xticks(rotation=0)
    plt.show()

# Displaying predicted labels
def make_predictions(classifier, X_test, y_test, text_type):
    """Predict on ``X_test`` and show a bar chart of the predicted class counts.

    Encoded labels are mapped back to names with the module-level
    ``label_encoder``. Bars are green for the "Cleaned" variant and orange
    for anything else.
    """
    predicted_names = label_encoder.inverse_transform(classifier.predict(X_test))
    true_names = label_encoder.inverse_transform(y_test)
    predictions = pd.DataFrame({'True_Label': true_names, 'Predicted_Label': predicted_names})

    if text_type == "Cleaned":
        bar_color = 'green'
    else:
        bar_color = 'orange'

    plt.figure(figsize=(8, 6))
    predicted_counts = predictions['Predicted_Label'].value_counts().sort_index()
    predicted_counts.plot(kind='bar', color=bar_color)
    plt.title(f'Distribution of Predicted Sentiment Classes for {text_type} Text')
    plt.xlabel('Sentiment Class')
    plt.ylabel('Number of Sentences')
    plt.xticks(rotation=0)
    plt.show()

# ROC curve
def plot_manual_roc(classifier, X_test, y_test, text_type):
    """Plot one ROC curve per encoded class, treating that class as positive.

    Column ``i`` of ``predict_proba`` is used as the score for class id ``i``;
    class names for the legend come from the module-level ``label_encoder``.
    """
    y_score = classifier.predict_proba(X_test)

    # Compute every curve first, then draw them.
    curves = []
    for class_id in range(len(label_encoder.classes_)):
        false_pos, true_pos, _ = roc_curve(y_test, y_score[:, class_id], pos_label=class_id)
        curves.append((false_pos, true_pos, auc(false_pos, true_pos)))

    plt.figure(figsize=(8, 6))
    palette = ['blue', 'green', 'orange']
    for (false_pos, true_pos, area), class_name, color in zip(curves, label_encoder.classes_, palette):
        plt.plot(false_pos, true_pos, color=color, lw=2, label=f'ROC curve (area = {area:.2f}) for {class_name}')

    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {text_type} Text')
    plt.legend(loc='lower right')
    plt.show()

# Plotting/displaying calls
# Label distributions first, then predictions and ROC curves for both variants.
for _series, _title in (
    (data['Label'], 'Distribution of Sentences across Defined Sentiment Classes for Original Dataset'),
    (filtered_data['sentiment'], 'Distribution of Sentences across Sentiment Classes for Cleaned Text'),
    (augmented_df['sentiment'], 'Distribution of Sentences across Sentiment Classes for Augmented Text'),
):
    plot_sentiment_distribution(_series, _title)

_dt_runs = (
    (dt_classifier_cleaned, X_test_cleaned, y_test_cleaned, "Cleaned"),
    (dt_classifier_augmented, X_test_augmented, y_test_augmented, "Augmented"),
)

for _run in _dt_runs:
    make_predictions(*_run)

for _run in _dt_runs:
    plot_manual_roc(*_run)

## RANDOM FOREST

### Preprocessing and Augmentations

In [None]:
# Download NLTK resources
# 'punkt_tab' is the tokenizer data word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Cleaning text
def clean_text(text):
    """Lower-case ``text``, strip punctuation, tokenize and drop English stop words.

    Args:
        text: Raw review text. A missing value (None/NaN) is treated as an
            empty review; any other non-string is converted with ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # pandas hands missing reviews over as NaN/None, which have no .lower()
        text = '' if pd.isna(text) else str(text)

    # Load the stop-word corpus once and reuse it; it was previously re-read
    # from disk for every single review.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = clean_text._stop_words = frozenset(stopwords.words('english'))

    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Synonym-based augmentation
def augmented_text(text, drop_prob=0.2, rng=None):
    """Build a noisy variant of ``text`` via synonym substitution and word dropout.

    Every whitespace-separated word that has a WordNet synset is replaced by
    the first lemma of its first synset; each word is then dropped with
    probability ``drop_prob``.

    Args:
        text: Input text.
        drop_prob: Probability of dropping a word. Defaults to 0.2, the value
            that used to be hard-coded.
        rng: Optional ``random.Random``-like object providing ``uniform``.
            Defaults to the module-level ``random`` generator, so
            ``random.seed`` keeps controlling the output.

    Returns:
        The surviving words joined by single spaces.
    """
    rng = random if rng is None else rng
    new_words = []
    for word in text.split():
        syns = wordnet.synsets(word)
        if syns:
            synonyms = syns[0].lemma_names()
            if synonyms:
                # WordNet spells multi-word lemmas with underscores ("ice_cream");
                # turn them back into ordinary words.
                word = synonyms[0].replace('_', ' ')
        if rng.uniform(0, 1) > drop_prob:
            new_words.append(word)
    return ' '.join(new_words)

# Load dataset
data = pd.read_csv('reviews.csv')

# augmented_text draws from the module-level `random` generator; seed it so the
# augmented corpus built by this cell is the same every time it runs, instead
# of depending on how many random numbers were consumed earlier in the session.
random.seed(42)

# Generating cleaned and augmented versions
# NOTE: this module-level set is not read by clean_text, which looks up its own
# stop words; it is kept in case other cells rely on the name.
stop_words = set(stopwords.words('english'))
# Iterating the two columns directly avoids building a Series per row (iterrows).
augmented_data = [
    (clean_text(review), augmented_text(review), label)
    for review, label in zip(data['Review'], data['Label'])
]

# Converting rating to sentiment classes
def define_sentiment(rating):
    """Map a 1-5 star rating to a sentiment label.

    Accepts Python ints, numpy integers and floats holding a whole number
    (a ``Label`` column containing a missing value is read as float, so 4
    arrives as 4.0). Booleans, NaN, fractional values, non-numeric values and
    ratings outside 1..5 all map to 'unknown'.

    Args:
        rating: The raw rating value.

    Returns:
        'Pos' for 4-5, 'Neut' for 3, 'Neg' for 1-2, otherwise 'unknown'.
    """
    from numbers import Integral, Real

    # bool is a subclass of int; True must not be read as a 1-star rating.
    if isinstance(rating, bool) or not isinstance(rating, Real):
        return 'unknown'
    # Non-integral reals (NaN, inf, 3.5) are not valid ratings.
    if not isinstance(rating, Integral) and not float(rating).is_integer():
        return 'unknown'

    rating = int(rating)
    if not 1 <= rating <= 5:
        return 'unknown'
    if rating >= 4:
        return 'Pos'
    if rating == 3:
        return 'Neut'
    return 'Neg'


### Dataframe creation and conversions

In [None]:

# Creating DataFrame with processed text and sentiment labels
augmented_df = pd.DataFrame(augmented_data, columns=['cleaned_text', 'augmented_text', 'Label'])
augmented_df['sentiment'] = augmented_df['Label'].apply(define_sentiment)
# .copy() makes filtered_data an independent frame, so adding 'sentiment_class'
# below does not raise SettingWithCopyWarning or depend on view/copy semantics.
filtered_data = augmented_df[augmented_df['sentiment'] != 'unknown'].copy()

# Encoding
label_encoder = LabelEncoder()
filtered_data['sentiment_class'] = label_encoder.fit_transform(filtered_data['sentiment'])

# Vectorization
vectorizer_cleaned = CountVectorizer()
X_cleaned = vectorizer_cleaned.fit_transform(filtered_data['cleaned_text'])
vectorizer_augmented = CountVectorizer()
X_augmented = vectorizer_augmented.fit_transform(filtered_data['augmented_text'])

# Applying SMOTE to balance class distributions
# NOTE(review): oversampling (and vectorizer fitting) happens on the full data
# set, before any train/test split. Synthetic samples interpolated from rows
# that later land in the test set can end up in training, which inflates test
# scores. Consider splitting first and resampling only the training part.
smote_augmented = SMOTE(random_state=42)
X_augmented_resampled, y_augmented_resampled = smote_augmented.fit_resample(X_augmented, filtered_data['sentiment_class'])
smote_cleaned = SMOTE(random_state=42)
X_cleaned_resampled, y_cleaned_resampled = smote_cleaned.fit_resample(X_cleaned, filtered_data['sentiment_class'])



### Main Training/Testing BLOCK

In [None]:
# Train/Testing BLOCK
# Hold out 20% of each resampled matrix; the fixed seed keeps the split repeatable.
(
    X_train_cleaned,
    X_test_cleaned,
    y_train_cleaned,
    y_test_cleaned,
) = train_test_split(
    X_cleaned_resampled,
    y_cleaned_resampled,
    test_size=0.2,
    random_state=42,
)
(
    X_train_augmented,
    X_test_augmented,
    y_train_augmented,
    y_test_augmented,
) = train_test_split(
    X_augmented_resampled,
    y_augmented_resampled,
    test_size=0.2,
    random_state=42,
)

# Random Forest model
def train_evaluate_model_rf(X_train, y_train, X_test, y_test, text_type):
    """Fit a RandomForestClassifier, print train / CV / test metrics and return it.

    Class names are taken from the module-level ``label_encoder`` so the
    report, the confusion matrix and the per-class figures all use the same
    label order as the encoded targets.

    Args:
        X_train: Training feature matrix.
        y_train: Encoded training labels.
        X_test: Test feature matrix.
        y_test: Encoded test labels (must support ``==`` masking, e.g. a
            pandas Series or numpy array).
        text_type: Name of the text variant; only used in the printed header.

    Returns:
        The fitted ``RandomForestClassifier`` (100 trees, random_state=42).
    """
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Training
    accuracy_train = accuracy_score(y_train, rf_classifier.predict(X_train))

    # Cross-validation
    mean_accuracy_cv = cross_val_score(rf_classifier, X_train, y_train, cv=10).mean()

    # Testing
    y_pred_test = rf_classifier.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Derive names/ids from the encoder instead of hard-coding ['Neg', 'Neut', 'Pos'];
    # passing `labels` keeps the report well-formed even if a class is absent
    # from this particular test split.
    class_labels = [str(name) for name in label_encoder.classes_]
    class_ids = list(range(len(class_labels)))
    report_test = classification_report(y_test, y_pred_test, labels=class_ids, target_names=class_labels)
    conf_matrix_test = confusion_matrix(y_test, y_pred_test, labels=class_ids)

    # Accuracies (share of each true class that was predicted correctly)
    class_accuracies = {}
    for label in class_ids:
        mask = (y_test == label)
        if mask.any():
            class_accuracies[class_labels[label]] = accuracy_score(y_test[mask], y_pred_test[mask])
        else:
            # No test sample of this class: report NaN rather than scoring an empty set.
            class_accuracies[class_labels[label]] = float('nan')

    # Printing
    print(f"\nResults for {text_type} Text using Random Forest:")
    print(f'Training Accuracy: {accuracy_train}')
    print(f'Mean Cross Validation Accuracy: {mean_accuracy_cv}')
    print(f'Test Set Accuracy: {accuracy_test}')
    print("Class-wise Accuracies for Test Set:")
    for label, acc in class_accuracies.items():
        print(f"{label} Accuracy: {acc}")
    print("Classification Report for Test Set:")
    print(report_test)
    print("Confusion Matrix for Test Set:")
    print(conf_matrix_test)

    return rf_classifier

# Training Random Forest models
# One model per text variant.
rf_classifier_cleaned = train_evaluate_model_rf(
    X_train_cleaned,
    y_train_cleaned,
    X_test_cleaned,
    y_test_cleaned,
    "Cleaned",
)
rf_classifier_augmented = train_evaluate_model_rf(
    X_train_augmented,
    y_train_augmented,
    X_test_augmented,
    y_test_augmented,
    "Augmented",
)




### Plotting and Results

In [None]:
# Plotting
# Sentiment distribution
def plot_sentiment_distribution(data, title):
    """Show a bar chart of the value counts of ``data``, sorted by index.

    The bar colour is chosen from the title text: blue when it contains
    'Original', green when it contains 'Cleaned', orange otherwise.
    """
    if 'Original' in title:
        bar_color = 'blue'
    elif 'Cleaned' in title:
        bar_color = 'green'
    else:
        bar_color = 'orange'

    plt.figure(figsize=(8, 6))
    counts = data.value_counts().sort_index()
    counts.plot(kind='bar', color=bar_color)
    plt.title(title)
    plt.xlabel('Sentiment Class')
    plt.ylabel('Number of Sentences')
    plt.xticks(rotation=0)
    plt.show()

# Displaying predicted labels
def make_predictions(classifier, X_test, y_test, text_type):
    """Predict on ``X_test`` and show a bar chart of the predicted class counts.

    Encoded labels are mapped back to names with the module-level
    ``label_encoder``. Bars are green for the "Cleaned" variant and orange
    for anything else.
    """
    predicted_names = label_encoder.inverse_transform(classifier.predict(X_test))
    true_names = label_encoder.inverse_transform(y_test)
    predictions = pd.DataFrame({'True_Label': true_names, 'Predicted_Label': predicted_names})

    if text_type == "Cleaned":
        bar_color = 'green'
    else:
        bar_color = 'orange'

    plt.figure(figsize=(8, 6))
    predicted_counts = predictions['Predicted_Label'].value_counts().sort_index()
    predicted_counts.plot(kind='bar', color=bar_color)
    plt.title(f'Distribution of Predicted Sentiment Classes for {text_type} Text')
    plt.xlabel('Sentiment Class')
    plt.ylabel('Number of Sentences')
    plt.xticks(rotation=0)
    plt.show()

# ROC curve
def plot_manual_roc(classifier, X_test, y_test, text_type):
    """Plot one ROC curve per encoded class, treating that class as positive.

    Column ``i`` of ``predict_proba`` is used as the score for class id ``i``;
    class names for the legend come from the module-level ``label_encoder``.
    """
    y_score = classifier.predict_proba(X_test)

    # Compute every curve first, then draw them.
    curves = []
    for class_id in range(len(label_encoder.classes_)):
        false_pos, true_pos, _ = roc_curve(y_test, y_score[:, class_id], pos_label=class_id)
        curves.append((false_pos, true_pos, auc(false_pos, true_pos)))

    plt.figure(figsize=(8, 6))
    palette = ['blue', 'green', 'orange']
    for (false_pos, true_pos, area), class_name, color in zip(curves, label_encoder.classes_, palette):
        plt.plot(false_pos, true_pos, color=color, lw=2, label=f'ROC curve (area = {area:.2f}) for {class_name}')

    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {text_type} Text')
    plt.legend(loc='lower right')
    plt.show()

# Plotting/displaying calls
# Label distributions first, then predictions and ROC curves for both variants.
for _series, _title in (
    (data['Label'], 'Distribution of Sentences across Defined Sentiment Classes for Original Dataset'),
    (filtered_data['sentiment'], 'Distribution of Sentences across Sentiment Classes for Cleaned Text'),
    (augmented_df['sentiment'], 'Distribution of Sentences across Sentiment Classes for Augmented Text'),
):
    plot_sentiment_distribution(_series, _title)

_rf_runs = (
    (rf_classifier_cleaned, X_test_cleaned, y_test_cleaned, "Cleaned"),
    (rf_classifier_augmented, X_test_augmented, y_test_augmented, "Augmented"),
)

for _run in _rf_runs:
    make_predictions(*_run)

for _run in _rf_runs:
    plot_manual_roc(*_run)