In [None]:
import torch
print(torch.__version__)
import pandas as pd
import numpy as np
from datasets import Dataset
import os
# Shared configuration for the cells below.
# out_dir: local directory used as the TrainingArguments output_dir and as the
#          path the prediction cells load the model from.
out_dir = "Models/"
# model_name: Hugging Face Hub model id from which the model and the tokenizer
#             are loaded.
model_name = "DevSEmo/emotion_SE_distilroberta_base"

from huggingface_hub import login
from dotenv import load_dotenv
import os
credentials_filename = "credentials.txt"
# Read the token from the credentials file. The relevant line is expected to
# look like ``HF_HUB_TOKEN=<value>``.
token = None
with open(credentials_filename, "r") as file:
    for line in file:
        # Split on the FIRST "=" only, so a token value that itself contains
        # "=" is kept intact. Lines without "=" (e.g. a comment mentioning the
        # key) are skipped instead of raising IndexError.
        key, separator, value = line.partition("=")
        if separator and "HF_HUB_TOKEN" in key:
            token = value.strip()
            break
# Fail early with a clear message when no token line was found
if token is None:
    raise ValueError("HF_HUB_TOKEN not found in credentials.txt")
# Login to the Hugging Face Hub
login(token=token)
print("Logged into huggingface")

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the sequence-classification model from the hub checkpoint.
# num_labels=7 corresponds to the seven score columns written by the
# prediction cells (anger, disgust, fear, joy, neutral, sadness, surprise).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)

Training hyperparameters


In [None]:
from transformers import TrainingArguments

# Default training arguments; only the output directory is customised here.
# NOTE: `training_args` is assigned again, with explicit settings, in the
# cell that builds the Trainer.
training_args = TrainingArguments(output_dir=out_dir)

Evaluate

In [None]:
from transformers import Trainer, TrainingArguments

# Arguments handed to the Trainer: one epoch, batches of 6 per device for both
# training and evaluation, and a checkpoint saved at the end of every epoch.
training_args = TrainingArguments(
    output_dir=out_dir,
    num_train_epochs=1,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    save_strategy="epoch",
)

# Wrap the currently loaded `model`; the prediction cells call
# `trainer.predict(...)` on this object.
trainer = Trainer(model=model, args=training_args)

In [None]:

# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a mapping of per-field sequences.

    ``tokenized_texts`` must provide ``items()`` and an ``"input_ids"`` entry;
    every entry is expected to be indexable by sample position.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of samples is the number of "input_ids" entries.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick element `idx` from every field and return them as one dict.
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Load the trained model from the local output directory.
model = AutoModelForSequenceClassification.from_pretrained(out_dir+"/")

# The tokenizer comes from the hub checkpoint `model_name` rather than from
# `out_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Re-create the trainer around the model loaded above. The `trainer` built in
# the earlier cell wraps the model loaded from `model_name`; without this line
# the freshly loaded model would only supply the label names while a different
# model object produced the predictions.
trainer = Trainer(model=model, args=training_args)

# Output column for each class index: probability column i is written to
# emotion_columns[i].
# NOTE(review): assumes the model's label ids follow this order -- confirm
# against model.config.id2label.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

tp = ["Issues"]  # other types: "PRs", "Commits"
languages = ["Shell", "Makefile", "C", "Kotlin", "Python", "HCL", "Rust", "TypeScript", "Lua", "Go"]
for this_tp in tp:
    print("Running for type: ", this_tp )
    for lang in languages:
        print("I'm working on langauge: ", lang)
        input_file_name = "Comments/input/"+lang+"_"+this_tp+"_split.csv"
        output_file_name = "Comments/output/"+lang+"_"+this_tp+"_RoBERTaAnnotated.csv"  # name your output file

        text_column = "Comment"  # select the column in your csv that contains the text to be classified

        # Read the csv and keep only the non-missing comments, as strings
        df_pred = pd.read_csv(input_file_name, encoding='utf-8')
        pred_texts = df_pred[text_column].dropna().astype('str').tolist()

        # Tokenize texts and create prediction data set
        tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
        pred_dataset = SimpleDataset(tokenized_texts)

        # Run predictions
        predictions = trainer.predict(pred_dataset)
        logits = predictions.predictions

        # Predicted class id and its label name
        preds = logits.argmax(-1)
        labels = pd.Series(preds).map(model.config.id2label)

        # Softmax over the class axis, computed once and reused for both the
        # top score and the per-class columns. Cast to float64 so the values
        # are written to csv with the same precision as before.
        exp_logits = np.exp(logits)
        probs = (exp_logits / exp_logits.sum(-1, keepdims=True)).astype(np.float64)
        scores = probs.max(1)

        # Create DataFrame with texts, predictions, labels, and scores
        df = pd.DataFrame({'text': pred_texts, 'pred': preds, 'label': labels, 'score': scores})
        for class_idx, column in enumerate(emotion_columns):
            df[column] = probs[:, class_idx]

        # save results to csv
        df.to_csv(output_file_name, encoding='utf-8', index=False)


In [None]:
##Only for Go Issues: the input file is processed in several smaller parts
import os

import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Load the trained model from the local output directory.
model = AutoModelForSequenceClassification.from_pretrained(out_dir+"/")

# The tokenizer comes from the hub checkpoint `model_name` rather than from
# `out_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Re-create the trainer around the model loaded above. The `trainer` built in
# the earlier cell wraps the model loaded from `model_name`; without this line
# the predictions and the label names would come from different model objects.
trainer = Trainer(model=model, args=training_args)

# Output column for each class index: probability column i is written to
# emotion_columns[i].
# NOTE(review): assumes the model's label ids follow this order -- confirm
# against model.config.id2label.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# Splitting the input file into three parts
input_file_name = "Comments/input/Go_Issues_split.csv"
df = pd.read_csv(input_file_name, encoding='utf-8')
n_parts = 3
part_size = len(df) // n_parts
file_parts = []

for i in range(n_parts):
    part_file_name = input_file_name.replace(".csv", f"_part_{i + 1}.csv")
    start = i * part_size
    # The last part runs to the end of the frame so that the len(df) % n_parts
    # leftover rows are annotated too instead of being silently dropped.
    stop = (i + 1) * part_size if i < n_parts - 1 else len(df)
    df.iloc[start:stop].to_csv(part_file_name, index=False)
    file_parts.append(part_file_name)

# Initialize the all_dfs list to store processed parts
all_dfs = []

tp = ["Issues"]
languages = ["Go"]

for this_tp in tp:
    for lang in languages:
        for part_file in file_parts:
            print(f"Running for type: {this_tp}, language: {lang} on {part_file}")

            # Update the input_file_name to be the current part_file
            input_file_name = part_file
            output_file_name = part_file.replace("input", "output")

            text_column = "Comment"  # select the column in your csv that contains the text to be classified

            # Read the part and keep only the non-missing comments, as strings
            df_pred = pd.read_csv(input_file_name, encoding='utf-8')
            pred_texts = df_pred[text_column].dropna().astype('str').tolist()

            if not pred_texts:
                # Possible when the file has fewer rows than parts, or when a
                # part holds only missing comments.
                print(f"No comments found in {part_file}; skipping")
                continue

            # Tokenize texts and create prediction data set
            tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
            pred_dataset = SimpleDataset(tokenized_texts)

            # Run predictions
            predictions = trainer.predict(pred_dataset)
            logits = predictions.predictions

            # Predicted class id and its label name
            preds = logits.argmax(-1)
            labels = pd.Series(preds).map(model.config.id2label)

            # Softmax over the class axis, computed once and reused for both
            # the top score and the per-class columns. Cast to float64 so the
            # values are written to csv with the same precision as before.
            exp_logits = np.exp(logits)
            probs = (exp_logits / exp_logits.sum(-1, keepdims=True)).astype(np.float64)
            scores = probs.max(1)

            # Create DataFrame with texts, predictions, labels, and scores
            df = pd.DataFrame({'text': pred_texts, 'pred': preds, 'label': labels, 'score': scores})
            for class_idx, column in enumerate(emotion_columns):
                df[column] = probs[:, class_idx]

            # save results to csv
            df.to_csv(output_file_name, encoding='utf-8', index=False)
            all_dfs.append(df)
# Merge processed results
final_output = "Comments/output/Go_Issues_RoBERTaAnnotated.csv"
final_df = pd.concat(all_dfs)
final_df.to_csv(final_output, index=False)

# Clean up intermediate files (optional)
for f in file_parts:
    os.remove(f)
    part_output = f.replace("input", "output")
    # An output file exists only for parts that contained at least one comment.
    if os.path.exists(part_output):
        os.remove(part_output)

In [None]:
#For RQ3: Partitions
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Load the trained model from the local output directory.
model = AutoModelForSequenceClassification.from_pretrained(out_dir+"/")

# The tokenizer comes from the hub checkpoint `model_name` rather than from
# `out_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Re-create the trainer around the model loaded above. The `trainer` built in
# the earlier cell wraps the model loaded from `model_name`; without this line
# the predictions and the label names would come from different model objects.
trainer = Trainer(model=model, args=training_args)

# Output column for each class index: probability column i is written to
# emotion_columns[i].
# NOTE(review): assumes the model's label ids follow this order -- confirm
# against model.config.id2label.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

partitions = ["Partition1/"]  # other partitions: "Partition3/", "Partition2/"
languages = ["Shell", "Makefile", "C", "Kotlin", "Python", "HCL", "Rust", "TypeScript"]  # not included here: "Lua", "Go"
for this_part in partitions:
    print("Running for partition: ", this_part )
    for lang in languages:
        print("I'm working on langauge: ", lang)
        input_file_name = "Comments/input/"+this_part+lang+"_split.csv"
        output_file_name = "Comments/output/"+this_part+lang+"_RoBERTaAnnotated.csv"  # name your output file

        text_column = "Comment"  # select the column in your csv that contains the text to be classified

        # Read the csv and keep only the non-missing comments, as strings
        df_pred = pd.read_csv(input_file_name, encoding='utf-8')
        pred_texts = df_pred[text_column].dropna().astype('str').tolist()

        # Tokenize texts and create prediction data set
        tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
        pred_dataset = SimpleDataset(tokenized_texts)

        # Run predictions
        predictions = trainer.predict(pred_dataset)
        logits = predictions.predictions

        # Predicted class id and its label name
        preds = logits.argmax(-1)
        labels = pd.Series(preds).map(model.config.id2label)

        # Softmax over the class axis, computed once and reused for both the
        # top score and the per-class columns. Cast to float64 so the values
        # are written to csv with the same precision as before.
        exp_logits = np.exp(logits)
        probs = (exp_logits / exp_logits.sum(-1, keepdims=True)).astype(np.float64)
        scores = probs.max(1)

        # Create DataFrame with texts, predictions, labels, and scores
        df = pd.DataFrame({'text': pred_texts, 'pred': preds, 'label': labels, 'score': scores})
        for class_idx, column in enumerate(emotion_columns):
            df[column] = probs[:, class_idx]

        # save results to csv
        df.to_csv(output_file_name, encoding='utf-8', index=False)


In [None]:
##Only for Go (partitions): each input file is processed in several smaller parts
import os

import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Load the trained model from the local output directory.
model = AutoModelForSequenceClassification.from_pretrained(out_dir+"/")

# The tokenizer comes from the hub checkpoint `model_name` rather than from
# `out_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Re-create the trainer around the model loaded above. The `trainer` built in
# the earlier cell wraps the model loaded from `model_name`; without this line
# the predictions and the label names would come from different model objects.
trainer = Trainer(model=model, args=training_args)

# Output column for each class index: probability column i is written to
# emotion_columns[i].
# NOTE(review): assumes the model's label ids follow this order -- confirm
# against model.config.id2label.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# Number of pieces each input file is split into
n_parts = 10

partitions = ["Partition1/"]  # other partitions: "Partition3/", "Partition2/"
languages = ["Go"]  # other language: "Lua"
for this_part in partitions:
    print("Running for partition: ", this_part )
    for lang in languages:
        print("I'm working on langauge: ", lang)
        inp_file_name = "Comments/input/"+this_part+lang+"_split.csv"
        out_file_name = "Comments/output/"+this_part+lang+"_RoBERTaAnnotated.csv"

        # Splitting the input file into n_parts parts
        df_input = pd.read_csv(inp_file_name, encoding='utf-8')
        part_size = len(df_input) // n_parts
        file_parts = []

        for i in range(n_parts):
            part_file_name = inp_file_name.replace(".csv", f"_part_{i + 1}.csv")
            start = i * part_size
            # The last part runs to the end of the frame so that the
            # len(df_input) % n_parts leftover rows are annotated too instead
            # of being silently dropped.
            stop = (i + 1) * part_size if i < n_parts - 1 else len(df_input)
            df_input.iloc[start:stop].to_csv(part_file_name, index=False)
            file_parts.append(part_file_name)

        # Initialize the all_dfs list to store processed parts
        all_dfs = []

        for part_file in file_parts:
            print(f"Running for type: {this_part}, language: {lang} on {part_file}")

            # Update the input_file_name to be the current part_file
            input_file_name = part_file
            output_file_name = part_file.replace("input", "output")

            text_column = "Comment"  # select the column in your csv that contains the text to be classified

            # Read the part and keep only the non-missing comments, as strings
            df_pred = pd.read_csv(input_file_name, encoding='utf-8')
            pred_texts = df_pred[text_column].dropna().astype('str').tolist()

            if not pred_texts:
                # Possible when the file has fewer rows than parts, or when a
                # part holds only missing comments.
                print(f"No comments found in {part_file}; skipping")
                continue

            # Tokenize texts and create prediction data set
            tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
            pred_dataset = SimpleDataset(tokenized_texts)

            # Run predictions
            predictions = trainer.predict(pred_dataset)
            logits = predictions.predictions

            # Predicted class id and its label name
            preds = logits.argmax(-1)
            labels = pd.Series(preds).map(model.config.id2label)

            # Softmax over the class axis, computed once and reused for both
            # the top score and the per-class columns. Cast to float64 so the
            # values are written to csv with the same precision as before.
            exp_logits = np.exp(logits)
            probs = (exp_logits / exp_logits.sum(-1, keepdims=True)).astype(np.float64)
            scores = probs.max(1)

            # Create DataFrame with texts, predictions, labels, and scores
            df = pd.DataFrame({'text': pred_texts, 'pred': preds, 'label': labels, 'score': scores})
            for class_idx, column in enumerate(emotion_columns):
                df[column] = probs[:, class_idx]

            # save results to csv
            df.to_csv(output_file_name, encoding='utf-8', index=False)
            all_dfs.append(df)
        # Merge processed results
        final_df = pd.concat(all_dfs)
        final_df.to_csv(out_file_name, index=False)

        # Clean up intermediate files (optional)
        for f in file_parts:
            os.remove(f)
            part_output = f.replace("input", "output")
            # An output file exists only for parts that contained at least one comment.
            if os.path.exists(part_output):
                os.remove(part_output)

In [None]:
import os
import pandas as pd

# Build one summary row per annotated file: language, partition, row count and
# the mean of each emotion score column.
results = []

# Annotated files are named "<language>_RoBERTaAnnotated.csv"
annotated_suffix = "_RoBERTaAnnotated.csv"
# Score columns a file must contain to be included in the summary
necessary_columns = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]

partitions = ["Partition3", "Partition2", "Partition1"]
# Loop through files in each partition's output directory
for this_partition in partitions:
    partition_dir = 'Comments/output/'+this_partition
    # os.listdir returns entries in arbitrary order; sort them so the summary
    # rows come out in the same order on every run.
    for filename in sorted(os.listdir(partition_dir)):
        # Only files that END with the annotation suffix are summarised
        if not filename.endswith(annotated_suffix):
            continue

        # Everything before the suffix is the language name. Stripping the
        # suffix (instead of unpacking filename.split("_") into two names)
        # keeps working when the name holds more than one underscore.
        lang = filename[:-len(annotated_suffix)]

        # Read the CSV file into a dataframe
        df = pd.read_csv(os.path.join(partition_dir, filename))

        # Ensure the file has the necessary columns before processing
        if all(col in df.columns for col in necessary_columns):
            # Mean of each score column, in the order of necessary_columns
            averages = [df[col].mean() for col in necessary_columns]

            # Append the results to the results list
            results.append([lang, this_partition, len(df), *averages])
        else:
            print(f"Warning: File {filename} is missing some of the necessary columns and was skipped.")

# Convert the results to a dataframe
columns = ["Language", "Partition", "#rows","anger", "disgust","fear","joy","neutral","sadness","surprise"]
df_results = pd.DataFrame(results, columns=columns)

# Write the results to a new CSV file
df_results.to_csv("lang_Summary_TimeSeries_RoBERTa.csv", index=False)
