In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import nltk
import contractions
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import time
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path_input_csv = 'comments_2023-06-17_2133_clean.csv'

In [3]:
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.device_count() > 0 else "None")

Torch version: 2.0.0
CUDA available: True
CUDA device count: 1
CUDA device name: Quadro RTX 5000


In [4]:
path_input_csv_zip = path_input_csv + ".zip"
df_original = pd.read_csv(path_input_csv_zip, sep=';', compression='zip', encoding='utf-8')
df_sa = df_original.copy()

## sentiment analysis

In [5]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download("wordnet")

stop_words = set(stopwords.words("english"))

In [6]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=False)

if torch.cuda.is_available():
    model.to("cuda")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
word_length_max = 30

def preprocess_text(text):
    # Handle Nan/floating values
    text = "" if pd.isna(text) else text

    # Expand contractions
    text = contractions.fix(text)

    # Remove special characters and digits
    text = re.sub(r"[^a-zA-Z\s]+", "", text)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stop words, lemmatize, and filter long words
    filtered_tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words and len(token) < word_length_max
    ]

    return " ".join(filtered_tokens)

In [8]:
# def batch_analyze_sentiment(comments, batch_size=4, preprocess=True):
#     # Preprocess comments
#     if preprocess:
#         comments = [preprocess_text(comment) for comment in tqdm(comments, desc="Preprocessing")]

#     # Create DataLoader
#     inputs = tokenizer(comments, return_tensors="pt", padding=True, truncation=True)
#     dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
#     dataloader = DataLoader(dataset, batch_size=batch_size)

#     all_labels = []
#     all_scores = []

#     progress_bar = tqdm(
#         dataloader,
#         desc="Processing batches",
#         bar_format="{desc}: {percentage:.1f}% | Batch {n_fmt}/{total_fmt} | {bar}",
#     )

#     # Perform sentiment analysis
#     for batch in progress_bar:
#         input_ids, attention_mask = batch

#         # Move tensors to GPU if available
#         if torch.cuda.is_available():
#             input_ids = input_ids.to("cuda")
#             attention_mask = attention_mask.to("cuda")

#         with torch.no_grad():
#             outputs = model(input_ids, attention_mask=attention_mask)

#         # Get labels and scores
#         logits = outputs.logits
#         probabilities = torch.softmax(logits, dim=-1)
#         label_indices = torch.argmax(logits, dim=-1)
#         labels = [model.config.id2label[label_id.item()] for label_id in label_indices]
#         scores = probabilities.max(dim=-1).values.tolist()

#         all_labels.extend(labels)
#         all_scores.extend(scores)

#     return all_labels, all_scores

In [9]:
def batch_analyze_sentiment(comments, batch_size=4, preprocess=True):
    # Preprocess comments
    if preprocess:
        comments = [preprocess_text(comment) for comment in tqdm(comments, desc="Preprocessing")]

    # Create DataLoader
    inputs = tokenizer(comments, return_tensors="pt", padding=True, truncation=True)
    dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
    dataloader = DataLoader(dataset, batch_size=batch_size)

    all_labels = []
    all_scores = []

    progress_bar = tqdm(
        dataloader,
        desc="Processing batches",
        bar_format="{desc}: {percentage:.1f}% | Batch {n_fmt}/{total_fmt} | {bar}",
    )

    # Enable mixed-precision training
    scaler = torch.cuda.amp.GradScaler()

    # Perform sentiment analysis
    for batch in progress_bar:
        input_ids, attention_mask = batch

        # Move tensors to GPU if available
        if torch.cuda.is_available():
            input_ids = input_ids.to("cuda")
            attention_mask = attention_mask.to("cuda")

        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Get labels and scores
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        label_indices = torch.argmax(logits, dim=-1)
        labels = [model.config.id2label[label_id.item()] for label_id in label_indices]
        scores = probabilities.max(dim=-1).values.tolist()

        all_labels.extend(labels)
        all_scores.extend(scores)

    return all_labels, all_scores


In [10]:
batch_size = 256

# Perform sentiment analysis
display("Performing sentiment analysis on replies...")
df_sa['SA_label_reply_text'], df_sa['SA_score_reply_text'] =\
    batch_analyze_sentiment(df_sa['Reply_Text'].tolist(), batch_size=batch_size)

display("Performing sentiment analysis on comments...")
df_sa['SA_label_comment_text'], df_sa['SA_score_comment_text'] =\
    batch_analyze_sentiment(df_sa['Comment_Text'].tolist(), batch_size=batch_size)

'Performing sentiment analysis on replies...'

Preprocessing: 100%|██████████| 433870/433870 [00:25<00:00, 16754.92it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing batches: 100.0% | Batch 1695/1695 | ██████████


'Performing sentiment analysis on comments...'

Preprocessing: 100%|██████████| 433870/433870 [01:07<00:00, 6452.99it/s]
Processing batches: 100.0% | Batch 1695/1695 | ██████████


### Remove label/score in the empty reply

In [11]:
df_sa['SA_score_reply_text'] = df_sa.apply(
    lambda row: 0 if pd.isna(row['Reply_Text']) or row['Reply_Text'] == '' else row['SA_score_reply_text'],
    axis=1
)

df_sa['SA_label_reply_text'] = df_sa.apply(
    lambda row: np.nan if pd.isna(row['Reply_Text']) or row['Reply_Text'] == '' else row['SA_label_reply_text'],
    axis=1
)

In [12]:
# Calculate the sentiment score for each comment and reply
# Handle empty comments and replies
def sentiment_score(row, label, score, text):
    if pd.isna(row[text]):
        return np.nan
    elif row[label] == 'positive':
        return row[score]
    elif row[label] == 'negative':
        return -1*row[score]
    else:
        return 0


df_sa['Sentiment_Score_comment'] = df_sa.apply(lambda row: sentiment_score(row,'SA_label_comment_text','SA_score_comment_text', 'Comment_Text'),axis=1)
df_sa['Sentiment_Score_reply'] = df_sa.apply(lambda row: sentiment_score(row,'SA_label_reply_text','SA_score_reply_text','Reply_Text'),axis=1)

In [13]:
# In path_input_csv replace _clean.csv with _sentiment.csv
path_output_csv = path_input_csv.replace('_clean.csv', '_sentiment.csv')

# Save to csv
df_sa.to_csv(path_output_csv, sep=';', index=False, encoding='utf-8-sig')

In [14]:
df_sa.head(10)

Unnamed: 0,series_genre,series_name,series_views,series_subscribers,series_rating,episode_ID_number,Episode_Name,Episode_URL,Episode_Date,Episode_Likes,...,Reply_Date,Reply_Author,Reply_Likes,Reply_Dislikes,SA_label_reply_text,SA_score_reply_text,SA_label_comment_text,SA_score_comment_text,Sentiment_Score_comment,Sentiment_Score_reply
0,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,2020-09-01 12:15:41+09:00,FruitInABasket,11.0,0.0,positive,0.401367,positive,0.843262,0.843262,0.401367
1,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,2020-08-11 10:54:49+09:00,Jamrar,262.0,1.0,neutral,0.771973,positive,0.814453,0.814453,0.0
2,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,2020-09-01 13:01:39+09:00,•○♧YeongHooky♧○•,83.0,0.0,neutral,0.838867,neutral,0.807129,0.0,0.0
3,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,2020-08-11 13:32:07+09:00,spanglepants,49.0,0.0,neutral,0.64209,neutral,0.663086,0.0,0.0
4,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,2020-09-01 12:25:30+09:00,mïźů ðä møøñ,2.0,0.0,neutral,0.739746,neutral,0.498291,0.0,0.0
5,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,,,,,,0.0,neutral,0.677246,0.0,
6,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,,,,,,0.0,neutral,0.566895,0.0,
7,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,,,,,,0.0,positive,0.910156,0.910156,
8,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,2020-09-03 03:10:33+09:00,cx_hana_xc,2.0,0.0,neutral,0.587402,negative,0.694336,-0.694336,0.0
9,Action,Weakhero,216300000,1400000,9.84,100,Ep. 100,https://www.webtoons.com/en/action/weakhero/ep...,2020-09-01,81653,...,,,,,,0.0,neutral,0.522949,0.0,
