In [1]:
# ! pip install -U accelerate
# ! pip install -U transformers

In [1]:
# !pip3 install -U dagshub

In [None]:
import pandas as pd
import dagshub

from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

# Pretrained checkpoint to load from the Hugging Face hub.
# Alternative checkpoint: 'pysentimiento/robertuito-sentiment-analysis'
model_name = 'finiteautomata/beto-sentiment-analysis'

# Tokenizer and 3-label sequence-classification model, both loaded from the same checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# Bundle the model and its tokenizer into one callable sentiment-analysis pipeline
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [6]:
# # Get a boto3.client object
# s3 = get_repo_bucket_client("Omdena/IREX-El-Salvador-Sentiment")

In [7]:
# # Upload file
# s3.upload_file(
#     Bucket="IREX-El-Salvador-Sentiment",  # name of the repo
#     Filename="sentiment_beto_sample_bukele_updated.csv",  # local path of file to upload
#     Key="sentiment_beto_sample_bukele_updated.csv",  # remote path where to upload the file
# )

In [8]:
# mount_path = dagshub.storage.mount('Omdena/IREX-El-Salvador-Sentiment')
# Load the sample from a CSV file located in the current working directory
csv_path = 'sentiment_beto_sample_bukele_updated.csv'
df = pd.read_csv(csv_path)

In [9]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Post Type,Profile Name,Date,Time,Likes,Text,AP,EC Revised,AP + JA Revised,English Text (Google Translate)
0,0,Astronaut Post,Elizabeth Menjivar,2024-04-13,03:29:29,43,Dios lo bendiga por ser un gran ser humano,POS,POS,POS,God bless him for being a great human being.
1,1,Astronaut Post,Mila Mijango,2024-04-17,17:29:01,6,"Me encanta la humanidad de nuestro astronauta,...",POS,POS,POS,"I love the humanity of our astronaut, a man wi..."
2,2,Astronaut Post,Any Rodríguez,2024-04-13,04:00:29,18,Dos grandes hombres haciendo historia. Gracias...,POS,POS,POS,Two great men making history. Thank you for al...
3,3,Astronaut Post,Sara Cespedes,2024-04-13,08:22:47,136,Nayib y Frank dos grandes ejemplo de umildad y...,POS,POS,POS,"Nayib and Frank, two great examples of humilit..."
4,4,Astronaut Post,Martha Morales,2024-04-13,03:13:56,70,A Colombia le falta un precidente con este ...,NEU,POS,POS,"Colombia lacks a precedent with this one, I am..."


In [10]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0                         0
Post Type                          0
Profile Name                       0
Date                               0
Time                               0
Likes                              0
Text                               0
AP                                 0
EC Revised                         4
AP + JA Revised                    0
English Text (Google Translate)    0
dtype: int64

In [22]:
# Use the agreed-upon labels ('AP + JA Revised') as the reference sentiment
df['sentiment'] = df['AP + JA Revised']

# Create sentiment labels with the pretrained BETO sentiment-analysis classifier.
# Each text is classified on its own; the pipeline returns a one-element list holding
# a dict with the predicted 'label' and its 'score'.
# truncation=True is forwarded to the tokenizer so that a text longer than the model's
# maximum sequence length is truncated rather than passed through at full length.
predictions = [classifier(text, truncation=True)[0] for text in df['Text']]

# Add the predicted label and its score as the columns 'sentiment_output' and
# 'sentiment_probability'. assign() returns a new frame and places list values
# positionally, so every prediction stays on the row of the text it was computed from
# regardless of what df's index looks like.
sentiment_beto_df = df.assign(
    sentiment_output=[prediction['label'] for prediction in predictions],
    sentiment_probability=[prediction['score'] for prediction in predictions],
)

In [23]:
# Preview the first five rows, including the two prediction columns
sentiment_beto_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Post Type,Profile Name,Date,Time,Likes,Text,AP,EC Revised,AP + JA Revised,English Text (Google Translate),sentiment_numeric,sentiment,sentiment_output,sentiment_probability
0,0,Astronaut Post,Elizabeth Menjivar,2024-04-13,03:29:29,43,Dios lo bendiga por ser un gran ser humano,POS,POS,POS,God bless him for being a great human being.,0,POS,POS,0.997756
1,1,Astronaut Post,Mila Mijango,2024-04-17,17:29:01,6,"Me encanta la humanidad de nuestro astronauta,...",POS,POS,POS,"I love the humanity of our astronaut, a man wi...",0,POS,POS,0.99878
2,2,Astronaut Post,Any Rodríguez,2024-04-13,04:00:29,18,Dos grandes hombres haciendo historia. Gracias...,POS,POS,POS,Two great men making history. Thank you for al...,0,POS,POS,0.998589
3,3,Astronaut Post,Sara Cespedes,2024-04-13,08:22:47,136,Nayib y Frank dos grandes ejemplo de umildad y...,POS,POS,POS,"Nayib and Frank, two great examples of humilit...",0,POS,POS,0.99877
4,4,Astronaut Post,Martha Morales,2024-04-13,03:13:56,70,A Colombia le falta un precidente con este ...,NEU,POS,POS,"Colombia lacks a precedent with this one, I am...",0,POS,POS,0.984938


In [24]:
# Evaluation inputs: reference labels (y) and the classifier's predicted labels (y_pred)
y, y_pred = (
    sentiment_beto_df['sentiment'],
    sentiment_beto_df['sentiment_output'],
)

In [25]:
# Imported in this cell so the metric functions are defined when the notebook is run
# top to bottom on a fresh kernel (the notebook's other sklearn import sits in a later cell).
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Compare the predicted labels with the reference labels.
# average='weighted' combines the per-class scores into one number, weighting each
# class by how many reference samples it has.
accuracy = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred, average='weighted')
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
print(accuracy, f1, precision, recall)

0.8618618618618619 0.8762998299205096 0.9106828026182864 0.8618618618618619


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import mlflow

In [20]:
# Set up DagsHub so that MLflow tracking is enabled for the
# Omdena/IREX-El-Salvador-Sentiment repository
dagshub.init(
    repo_name="IREX-El-Salvador-Sentiment",
    repo_owner="Omdena",
    mlflow=True,
)

In [21]:
clf_name = "BETO Pretrained"

# Record the evaluation as a single MLflow run named after the classifier.
# Only the classifier name, the four evaluation metrics and two tags are logged;
# no hyperparameters or model artifact are stored.
with mlflow.start_run(run_name=clf_name):
    mlflow.log_param("Classifier", clf_name)

    # Metrics computed earlier in the notebook
    mlflow.log_metrics(
        {
            "accuracy": accuracy,
            "f1_score": f1,
            "precision_score": precision,
            "recall_score": recall,
        }
    )

    # Tags that identify the run
    mlflow.set_tags(
        {
            "estimator_name": clf_name,
            "transformer": "BETO",
        }
    )