In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np


In [2]:
# Read the Steam reviews dataset from disk into a DataFrame
csv_path = "steam_reviews.csv"
df = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check of the loaded data
df.head()

Unnamed: 0,game_title,game_id,review,recommended
0,Counter-Strike 2,730,"This game is an absurd piece of crap, there ar...",False
1,Counter-Strike 2,730,"Arreglarlo, el juego esta en la mierda cheetos...",True
2,Counter-Strike 2,730,juego de mierd4 te odio si un dia un amigo les...,True
3,Counter-Strike 2,730,Vaya puta mierda de juego lleno de cheaters. V...,False
4,Counter-Strike 2,730,This game its imposible to play. Its full of c...,False


In [3]:
# Inspect the raw review texts as a NumPy array.
# `.to_numpy()` is the accessor pandas recommends over the legacy `.values`.
df["review"].to_numpy()

array(["This game is an absurd piece of crap, there are cheaters in EVERY game, I paid $15 for nothing, because even in premier there are hackers, since they make thousands of dollars a week, at least invest in the anticheat, since it is the most useless resource in the entire game... If you don't believe me, check YouTube, people upload videos using hacks, they even promote them in the game chats. Basically the investment is $20, $15 for the premier which is useless, and $5 more for a cheat.",
       'Arreglarlo, el juego esta en la mierda cheetos y subnormales en todas las partidas esto es insufrible ayuda por favor',
       'juego de mierd4 te odio si un dia un amigo les dice que jueguen a esto directamente bloqueenlo.',
       ...,
       'Agregados que hacen que tengas un sin fin de  posibilidades en tus partidas',
       'Great dlc.', 'meh'], dtype=object)

In [4]:
# Checkpoint identifier shared by the tokenizer and the classification model
model_name = "tabularisai/robust-sentiment-analysis"

# Both objects are loaded from the same checkpoint so vocabulary and weights match
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Predicted class index -> human-readable label (five sentiment levels).
# Built once here instead of on every call to predict_sentiment.
SENTIMENT_LABELS = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive",
}


def predict_sentiment(review):
    """Classify a single review into one of five sentiment labels.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        review: The review text. Any value that is not a ``str``, or a string
            that is empty / whitespace-only, is not scored.

    Returns:
        One of the values in ``SENTIMENT_LABELS``, or ``"Unknown"`` when the
        input is unusable or when tokenization/inference raises.
    """
    # Guard: skip non-string values and blank text without touching the model
    if not isinstance(review, str) or not review.strip():
        return "Unknown"

    try:
        # One sequence per call, so no padding is needed; truncation caps the
        # encoded input at max_length tokens.
        inputs = tokenizer(review, return_tensors="pt", truncation=True, max_length=512)

        # Inference only: no gradients required
        with torch.no_grad():
            logits = model(**inputs).logits

        # Index of the highest-scoring class (0..4, see SENTIMENT_LABELS)
        predicted_class = torch.argmax(logits, dim=1).item()
        return SENTIMENT_LABELS[predicted_class]
    except Exception as e:
        # Best effort: report the failing review and keep processing the rest
        print(f"Error processing review: {review[:50]}... Error: {e}")
        return "Unknown"

# Label every review and store the result in a new 'sentiment' column
df['sentiment'] = df['review'].apply(predict_sentiment)

# Write the labelled reviews to disk (row index omitted)
output_path = "steam_reviews_with_sentiment.csv"
df.to_csv(output_path, index=False)
print(f"Sentiment analysis complete. Results saved to {output_path}")

# Number of reviews per predicted label
print("\nSentiment Distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Predicted label cross-tabulated against the 'recommended' column
print("\nSentiment vs Recommended:")
sentiment_by_recommendation = pd.crosstab(df['sentiment'], df['recommended'])
print(sentiment_by_recommendation)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Sentiment analysis complete. Results saved to steam_reviews_with_sentiment.csv

Sentiment Distribution:
sentiment
Very Positive    6660
Neutral          5040
Positive         1856
Very Negative     322
Negative          174
Unknown            22
Name: count, dtype: int64

Sentiment vs Recommended:
recommended    False  True 
sentiment                  
Negative          61    113
Neutral          784   4256
Positive          92   1764
Unknown            1     21
Very Negative    127    195
Very Positive    408   6252


In [16]:
# Rows whose predicted sentiment is exactly 'Very Negative'
# (exact comparison instead of a regex substring match)
df[df['sentiment'] == 'Very Negative']


Unnamed: 0,game_title,game_id,review,recommended,sentiment
0,Counter-Strike 2,730,"This game is an absurd piece of crap, there ar...",False,Very Negative
11,Counter-Strike 2,730,"This game is an absurd piece of crap, there ar...",False,Very Negative
47,Counter-Strike 2,730,"Mal optimizado , anti cheat horrible , heat bo...",True,Very Negative
139,PUBG: BATTLEGROUNDS,578080,ALL CHEATERS,False,Very Negative
151,DOOM: The Dark Ages,3017860,¡LANCEN AL SLAYEEERRR!\n\nQue grato y magnific...,True,Very Negative
...,...,...,...,...,...
13980,KARMA: The Dark World,1376200,⠀⠀⠀⠀⠀⠀⠀⠀⢀⣠⣤⣶⣶⣶⣶⣶⣤⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀\n⠀⠀⠀⠀⠀⠀⣠⣴⣾⣿⣿⣿⣿⣿⣿⣿...,True,Very Negative
13982,Dead by Daylight - Steady Pulse,3698790,DEAD BY DAYLIGHT ES UNA MIERDA!\n\n**CHICAS SU...,True,Very Negative
14020,Arma 3,107410,"Sobrevalorado, completamente mal optimizado y ...",False,Very Negative
14033,Arma 3,107410,"MATTHEW, GET ON THAT DAMN MACHINE GUN ALREADY",True,Very Negative


In [15]:
from transformers import pipeline

# Checkpoint for the second classifier; judging by its name it targets
# cyberbullying detection in Spanish text (confirm on the model card)
model_path = "JonatanGk/roberta-base-bne-finetuned-cyberbullying-spanish"

# Text-classification pipeline using the same checkpoint for model and tokenizer
bullying_analysis = pipeline(
    "text-classification",
    model=model_path,
    tokenizer=model_path,
)

config.json:   0%|          | 0.00/911 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [17]:
# Spot-check the classifier on a single hand-written sentence
bullying_analysis("Racismo Espacial siempre es lo mejor")

[{'label': 'Not_bullying', 'score': 0.9995548129081726}]