In [None]:
# Split the CSV into smaller batches for sentiment analysis or for analysis with the OpenAI API
import pandas as pd

# Load the full dataset that is about to be split into smaller files.
df = pd.read_csv("...csv")

# Maximum number of rows written to each output file.
batch_size = 300 #Adjust this based on your needs

# Walk the frame in consecutive slices of `batch_size` rows and write each
# slice to its own CSV. The file name carries the 1-based position of the
# first and last row in the slice; the final slice may hold fewer rows than
# `batch_size`, which is why its end is computed from len(batch_df).
for i in range(0, len(df), batch_size):
    batch_df = df.iloc[i:i+batch_size]
    batch_df.to_csv(f"...{i+1}-{i+len(batch_df)}.csv", index=False)


In [None]:
#Sentiment - CryptoBERT
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Checkpoint used for this cell; tokenizer and classifier are loaded from it.
model_name = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Classification pipeline: every input is truncated to, and padded up to,
# exactly 64 tokens before it reaches the model.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    max_length=64,
    truncation=True,
    padding='max_length',
)

# Input frames to be scored by this cell.
df_1 = pd.read_csv("...csv") #vietnamese versions
#...
df_a = pd.read_csv("...csv") #english versions
#...

def sentiment(filename, language_content):
  """Label every row of a DataFrame with the CryptoBERT pipeline's prediction.

  Two columns are (re)created on the frame that is passed in:
  ``sentiment`` (the predicted label) and ``confidence`` (the label's score
  rounded to 4 decimals). Rows whose text is missing, not a string, blank,
  or the literal placeholder ``"No content"`` are not sent to the model and
  keep ``None`` in both columns.

  Args:
    filename: pandas DataFrame to annotate. Despite the name this is a
      frame, not a path. It is modified in place.
    language_content: name of the column holding the text to classify.

  Returns:
    The same DataFrame object that was passed in, with the two columns added.

  Raises:
    KeyError: if ``language_content`` is not a column of the frame.
  """
  labels, confidences = [], []
  for content in filename[language_content]:
    # NaN cells arrive as floats; feeding those (or empty text) to the
    # pipeline either crashes or yields a meaningless prediction.
    if not isinstance(content, str) or not content.strip() or content == "No content":
      labels.append(None)
      confidences.append(None)
      continue
    result = pipe(content)[0]
    labels.append(result['label'])
    confidences.append(round(result['score'], 4))

  # Results are attached by position against the frame's own index, so the
  # function is correct for any index (filtered, sliced, non-integer, ...)
  # instead of assuming a fresh 0..n-1 RangeIndex. dtype=object keeps the
  # untouched cells as None, as before.
  filename['sentiment'] = pd.Series(labels, index=filename.index, dtype=object)
  filename['confidence'] = pd.Series(confidences, index=filename.index, dtype=object)
  return filename

# Score the "content" column of df_1 and save the annotated frame.
# sentiment() adds its columns to the frame it receives and returns that same
# object, so df_analyzed_1 and df_1 refer to one DataFrame afterwards.
df_analyzed_1 = sentiment(df_1, "content") # Vietnamese csvs
df_analyzed_1.to_csv("...csv", index=False)
#...

# Same procedure for df_a, reading the text from its "content_en" column.
df_analyzed_a = sentiment(df_a, "content_en") # English csvs
df_analyzed_a.to_csv("...csv", index=False)
#...

In [None]:
# Sentiment - FinBERT
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd

# Checkpoint used for this cell; tokenizer and classifier are loaded from it.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Pipeline that reports a score for every class (return_all_scores=True) and
# truncates/pads each input to exactly 512 tokens.
# NOTE(review): newer transformers releases document `top_k=None` as the
# replacement for `return_all_scores` — confirm against the installed version.
pipe_finbert = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    truncation=True,
    padding='max_length',
    max_length=512,
)

# Input frames to be scored by this cell.
df_1 = pd.read_csv("...csv") #Vietnamese CSVs
#...
df_a = pd.read_csv("...csv") #English CSVs
#...

def sentiment(df, language_content):
  """Label every row of a DataFrame with FinBERT's highest-scoring class.

  Two columns are (re)created on the frame that is passed in:
  ``sentiment`` (label of the best-scoring class) and ``weight`` (that
  class's score rounded to 4 decimals). Rows whose text is missing, not a
  string, blank, or the ``"No content"`` placeholder are not sent to the
  model; they receive ``'unknown'`` and ``0.0``.

  Args:
    df: pandas DataFrame to annotate. It is modified in place.
    language_content: name of the column holding the text to classify.

  Returns:
    The same DataFrame object that was passed in, with the two columns added.

  Raises:
    KeyError: if ``language_content`` is not a column of the frame.
  """
  labels, weights = [], []

  for content in df[language_content]:
    text = content.strip() if isinstance(content, str) else ""
    # "No content" is the placeholder for missing articles; scoring it would
    # record a sentiment for text that does not exist.
    if text and text != "No content":
      # The pipeline is called with a single string and returns a one-element
      # list whose item is the list of per-class {label, score} dicts.
      result = pipe_finbert(content)[0]
      best = max(result, key=lambda r: r['score'])
      labels.append(best['label'])
      weights.append(round(best['score'], 4))
    else:
      labels.append('unknown')
      weights.append(0.0)

  # Attach by position against the frame's own index so the function works
  # for any index rather than assuming a fresh 0..n-1 RangeIndex (using the
  # loop counter as a label appends new rows when the index differs).
  df['sentiment'] = pd.Series(labels, index=df.index, dtype=object)
  df['weight'] = pd.Series(weights, index=df.index, dtype=object)

  return df

# Score df_1 on its "content_en" column and df_a on its "content" column,
# writing each annotated frame to its own CSV.
# NOTE(review): the language comments here are the reverse of the ones on the
# read_csv lines earlier in this cell (df_1 is tagged Vietnamese there, df_a
# English). Confirm which frame actually holds which column before running —
# a wrong column name raises KeyError inside sentiment().
df_analyzed_1 = sentiment(df_1, "content_en") #English CSVs
df_analyzed_1.to_csv("sentiment_tuoitre_bit_en.csv", index=False)
#...
df_analyzed_a = sentiment(df_a, "content") #Vietnamese CSVs
df_analyzed_a.to_csv("sentiment_tuoitre_block_vn.csv", index=False)
#...

In [None]:
# Sentiment - Wonrax PhoBERT
import torch
import pandas as pd
from pyvi import ViTokenizer
from transformers import RobertaForSequenceClassification, AutoTokenizer

# Checkpoint used for this cell; classifier and (slow) tokenizer are loaded from it.
model_name = "wonrax/phobert-base-vietnamese-sentiment"
model = RobertaForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Human-readable class names, looked up by the position of the largest logit.
# NOTE(review): the order is presumably that of the checkpoint's id2label
# mapping — verify against model.config.id2label.
labels = ["negative", "positive", "neutral"]

def classify_sentiment(text):
    """Classify one piece of Vietnamese text with the PhoBERT sentiment model.

    The text is word-segmented with pyvi, tokenized (truncated/padded to 256
    tokens) and run through the model without gradient tracking.

    Args:
        text: the raw text to classify.

    Returns:
        A ``(label, score)`` tuple: the entry of ``labels`` with the highest
        probability and that probability rounded to 4 decimals. If anything
        goes wrong while processing the text, the error is printed and
        ``("ERROR", 0.0)`` is returned so a batch run can continue.
    """
    try:
        segmented = ViTokenizer.tokenize(text)
        inputs = tokenizer(segmented, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = logits.softmax(dim=-1).tolist()[0]
            max_idx = int(torch.argmax(logits))
            return labels[max_idx], round(probs[max_idx], 4)
    # Catch Exception only: a bare `except:` also traps KeyboardInterrupt and
    # SystemExit, which makes a long batch run impossible to stop, and it hid
    # every failure without a trace.
    except Exception as e:
        print(f"Error at text: {str(text)[:50]}... | {e}")
        return "ERROR", 0.0

def process_csv(input_path, output_path, content_column="content"):
    df = pd.read_csv(input_path)
    sentiments, scores = [], []
    for content in df[content_column]:
        if isinstance(content, str) and content.strip() and content != "No content":
            label, score = classify_sentiment(content)
        else:
            label, score = "No content", 0.0
        sentiments.append(label)
        scores.append(score)
    df["sentiment"] = sentiments
    df["score"] = scores
    df.to_csv(output_path, index=False)
    # print(f"Processed: {input_path} into {output_path}")

# Vietnamese CSVs
# Reads the first path, scores its "content" column and writes the annotated
# copy to the second path.
process_csv("...", "...", content_column="content")
#...

In [None]:
# Sentiment - Mr4 PhoBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Checkpoint used for this cell; tokenizer and classifier are loaded from it.
# Both are read as globals by classify_sentiment() below.
model_name = "mr4/phobert-base-vi-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def classify_sentiment(text):
    """Classify one piece of text with the loaded sentiment model.

    Non-string input, blank text and the placeholder "no content" (any
    casing, surrounding whitespace ignored) short-circuit to
    ``("No content", 0.0)``. Otherwise the text is tokenized (truncated to
    256 tokens), run through the model without gradient tracking, and the
    most probable class is returned as ``(label, probability)`` with the
    probability rounded to 4 decimals. The label comes from the model
    config's ``id2label`` mapping when present, else ``LABEL_<id>``.

    Any exception raised along the way is printed together with the first
    50 characters of the text, and ``("ERROR", 0.0)`` is returned.
    """
    try:
        nothing_to_score = not isinstance(text, str) or text.strip().lower() in ["", "no content"]
        if nothing_to_score:
            return "No content", 0.0

        encoded = tokenizer([text], padding=True, truncation=True, return_tensors="pt", max_length=256)
        with torch.no_grad():
            logits = model(**encoded).logits

        # Single-item batch: take row 0 of the class distribution.
        distribution = torch.nn.functional.softmax(logits, dim=-1)[0]
        top_id = torch.argmax(distribution).item()

        if hasattr(model.config, "id2label"):
            name = model.config.id2label[top_id]
        else:
            name = f"LABEL_{top_id}"
        return name, round(distribution[top_id].item(), 4)
    except Exception as err:
        print(f"Error at text: {text[:50]}... | {err}")
        return "ERROR", 0.0

def process_csv(input_path, output_path, content_column="content"):
    """Score one CSV file and write it back out with sentiment columns.

    Reads ``input_path``, passes every value of ``content_column`` to
    ``classify_sentiment`` and stores the returned label and score in the
    ``sentiment`` and ``score`` columns. The annotated frame is written to
    ``output_path`` without the index.
    """
    frame = pd.read_csv(input_path)

    # classify_sentiment handles missing/placeholder values itself, so every
    # cell is handed over as-is.
    outcomes = [classify_sentiment(value) for value in frame[content_column]]
    frame["sentiment"] = [outcome[0] for outcome in outcomes]
    frame["score"] = [outcome[1] for outcome in outcomes]

    frame.to_csv(output_path, index=False)
    # print(f"Processed: {input_path} into {output_path}")

# Vietnamese CSVs
# Reads the first path, scores its default "content" column and writes the
# annotated copy to the second path.
process_csv("...csv", "...csv")
#...