### Configurações de ambiente

In [7]:
# === Standard Library ===
import os
import re
import copy
import pickle
import random

# === Third-party Libraries ===

# Data handling
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Visualization
from great_tables import GT, md, html
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

# Plotly high-resolution export settings
plotly.io.renderers.default = "png"

# Set default font to Helvetica Neue in Plotly
plotly.io.templates["custom"] = plotly.io.templates["plotly_white"]
plotly.io.templates["custom"].layout.font.family = "Helvetica Neue"
plotly.io.templates.default = "custom"

# Matplotlib high-resolution settings
plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["font.family"] = "Helvetica Neue"

# Machine Learning & Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)

# Optimization
# import optuna

# External integrations
import kagglehub

# === Project-specific Imports ===

SEED = 123

In [8]:
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /Users/vkz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/vkz/nltk_data...
[nltk_data] Downloading package stopwords to /Users/vkz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /Users/vkz/nltk_data...


True

In [2]:
path = kagglehub.dataset_download("charunisa/chatgpt-sentiment-analysis")
df = pd.read_csv(os.path.join(path, "file.csv"))
df = df.drop(columns=["Unnamed: 0"])
label_map = {"bad": 0, "neutral": 1, "good": 2}
df["labels"] = df["labels"].map(label_map)
df

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,1
1,"Try talking with ChatGPT, our new AI system wh...",2
2,ChatGPT: Optimizing Language Models for Dialog...,1
3,"THRILLED to share that ChatGPT, our new model ...",2
4,"As of 2 minutes ago, @OpenAI released their ne...",0
...,...,...
219289,Other Software Projects Are Now Trying to Repl...,0
219290,I asked #ChatGPT to write a #NYE Joke for SEOs...,2
219291,chatgpt is being disassembled until it can onl...,0
219292,2023 predictions by #chatGPT. Nothing really s...,0


In [3]:
bad_df = df[df["labels"] == 0]
neutral_df = df[df["labels"] == 1]
good_df = df[df["labels"] == 2]

min_count = min(len(bad_df), len(neutral_df), len(good_df))

bad_bal = resample(bad_df, n_samples=min_count, random_state=SEED)
neutral_bal = resample(neutral_df, n_samples=min_count, random_state=SEED)
good_bal = resample(good_df, n_samples=min_count, random_state=SEED)

df_balanced = (
    pd.concat([bad_bal, neutral_bal, good_bal])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

df = df_balanced.copy()
print(df["labels"].value_counts())

labels
1    55487
2    55487
0    55487
Name: count, dtype: int64


In [None]:
def strip_text(text: str) -> str:
    text = text.replace("\\n", " ").replace("\n", " ")  # remove newlines
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)  # remove mentions
    text = re.sub(r"#\w+", "", text)  # remove hashtags
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # normalize whitespace
    return text

In [5]:
df["stripped_tweets"] = df["tweets"].apply(strip_text)
df

Unnamed: 0,tweets,labels,stripped_tweets
0,I think I'll always remember the mix of awe &a...,1,i think ill always remember the mix of awe amp...
1,I too realized this the moment I used ChatGPT....,1,i too realized this the moment i used chatgpt ...
2,ChatGPT is mind-blowing.\nRunning out of ideas...,2,chatgpt is mindblowing running out of ideas fo...
3,For those saying ChatGPT will replace GOOG / S...,0,for those saying chatgpt will replace goog sta...
4,"Free speech is a fundamental right, but it is ...",2,free speech is a fundamental right but it is n...
...,...,...,...
166456,"This ChatGPT is weird, still sounds like a cus...",2,this chatgpt is weird still sounds like a cust...
166457,Can’t wait to see ChatGPT but with pronunciati...,2,cant wait to see chatgpt but with pronunciatio...
166458,ChatGPT is 🔥 https://t.co/xuMdkySoFM,0,chatgpt is
166459,ChatGPT is going to completely wreck content m...,0,chatgpt is going to completely wreck content m...


In [6]:
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.3,
    stratify=df_balanced["labels"],
    random_state=SEED,
)

train_df, holdout_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df["labels"],
    random_state=SEED,
)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")
print(f"Holdout size: {len(holdout_df)}")

print("\nTrain distribution:\n", train_df["labels"].value_counts())
print("\nTest distribution:\n", test_df["labels"].value_counts())
print("\nHoldout distribution:\n", holdout_df["labels"].value_counts())

Train size: 104869
Test size: 49939
Holdout size: 11653

Train distribution:
 labels
1    34957
0    34956
2    34956
Name: count, dtype: int64

Test distribution:
 labels
2    16647
1    16646
0    16646
Name: count, dtype: int64

Holdout distribution:
 labels
0    3885
1    3884
2    3884
Name: count, dtype: int64
