In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from tqdm import tqdm  # For progress bar
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-11-15 13:53:26.238221: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-15 13:53:26.278640: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-15 13:53:26.291153: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-15 13:53:26.358415: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Columns to load from the survey export: the employeeID column plus the
# eleven response columns.
USECOLS = [
    'employeeID',
    'salary_response',
    'manager_response',
    'benefits_response',
    'career_response',
    'environment_response',
    'communication_response',
    'support_response',
    'recognition_response',
    'leadership_response',
    'remote_response',
    'worklife_balance',
]

# Read only the listed columns; anything else in the CSV is skipped.
df = pd.read_csv('user_responses_final.csv', usecols=USECOLS)


In [4]:
# Checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Fetch the tokenizer and the sequence-classification model for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Place the model on the chosen device. As the cell's last expression this
# also displays the model's repr.
model.to(device)


Using device: cuda


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
def compute_normalized_sentiment_score(text):
    """Score one response on a 0-1 sentiment scale.

    The text is run through the module-level ``tokenizer`` and ``model`` (on
    ``device``). The three class probabilities are combined into an expected
    rating on a 1-3 scale (index 0 -> 1, index 1 -> 2, index 2 -> 3), which is
    then rescaled linearly to [0, 1].

    Args:
        text: The response to score. Missing values (``None``, NaN,
            ``pd.NA``) and blank or whitespace-only strings are treated as
            "no answer". Any other non-string value is converted with
            ``str()`` before tokenization.

    Returns:
        A number in [0, 1]. ``0.5`` (neutral) is returned for missing or
        blank input, and also when scoring raises an exception.
    """
    # Missing answers arrive from pandas as NaN/None. str(NaN) is the
    # non-empty string "nan", so they have to be caught before the blank
    # check; otherwise they reach the tokenizer as a non-string value and are
    # only "handled" by the error path below, with a misleading error print.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return 0.5  # Normalized score for neutral sentiment

    # From here on always work with a real string.
    text = str(text)
    if not text.strip():
        # Handle empty strings by assigning neutral sentiment
        return 0.5

    try:
        # Tokenize the input text (truncated to 512 tokens) and move the
        # tensors to the device
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)

        # Apply softmax to get probabilities
        probs = F.softmax(outputs.logits, dim=-1)

        # Single input, so take the first (only) row as a numpy vector
        probs = probs.detach().cpu().numpy()[0]

        # Defensive check: renormalize if the probabilities drifted from 1
        prob_sum = probs.sum()
        if not 0.99 <= prob_sum <= 1.01:
            print(f"Warning: Probabilities do not sum to 1. Sum = {prob_sum}")
            probs = probs / prob_sum  # Normalize to sum to 1

        # Map probabilities to labels using indices.
        # NOTE(review): assumes the checkpoint orders its classes as
        # negative / neutral / positive - confirm against model.config.id2label.
        Pnegative = probs[0]  # Corresponds to Negative
        Pneutral = probs[1]   # Corresponds to Neutral
        Ppositive = probs[2]  # Corresponds to Positive

        # Expected rating on a 1 (negative) .. 3 (positive) scale
        sentiment_score = (Pnegative * 1) + (Pneutral * 2) + (Ppositive * 3)

        # Rescale 1..3 to 0..1
        normalized_sentiment_score = (sentiment_score - 1) / 2

        # Clamp against floating-point drift just outside [0, 1]
        normalized_sentiment_score = max(0, min(1, normalized_sentiment_score))

        return normalized_sentiment_score
    except Exception as e:
        # Best effort: report the failure and fall back to a neutral score so
        # one bad response does not abort a long scoring run.
        print(f"Error processing text: {text}\nException: {e}")
        return 0.5


In [6]:
# Response columns to score, in the order they are scored within each row.
RESPONSE_COLUMNS = [
    'salary_response',
    'manager_response',
    'benefits_response',
    'career_response',
    'environment_response',
    'communication_response',
    'support_response',
    'recognition_response',
    'leadership_response',
    'remote_response',
    'worklife_balance',
]

# One list of normalized sentiment scores per response column, filled row by
# row. Rebuilt from scratch on every run of this cell, so re-running it does
# not append duplicates.
scores_by_column = {column: [] for column in RESPONSE_COLUMNS}

# Iterate over each row with a progress bar
for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Responses"):
    for column in RESPONSE_COLUMNS:
        scores_by_column[column].append(compute_normalized_sentiment_score(row[column]))

# Expose the per-column lists under the individual names that the cell
# building `sentiments_df` reads.
salary_sentiment_scores = scores_by_column['salary_response']
manager_sentiment_scores = scores_by_column['manager_response']
benefits_sentiment_scores = scores_by_column['benefits_response']
career_sentiment_scores = scores_by_column['career_response']
environment_sentiment_scores = scores_by_column['environment_response']
communication_sentiment_scores = scores_by_column['communication_response']
support_sentiment_scores = scores_by_column['support_response']
recognition_sentiment_scores = scores_by_column['recognition_response']
leadership_sentiment_scores = scores_by_column['leadership_response']
remote_sentiment_scores = scores_by_column['remote_response']
# NOTE: "worlife" (sic) - the spelling is kept because the cell that builds
# `sentiments_df` refers to this exact name.
worlife_balance_sentiment_scores = scores_by_column['worklife_balance']


Processing Responses: 100%|██████████| 4519/4519 [04:56<00:00, 15.26it/s]


In [7]:
# Output column name -> list of per-row scores, in the order the columns
# should appear after employeeID.
topic_scores = {
    'salary_sentiment': salary_sentiment_scores,
    'manager_sentiment': manager_sentiment_scores,
    'benefits_sentiment': benefits_sentiment_scores,
    'career_sentiment': career_sentiment_scores,
    'environment_sentiment': environment_sentiment_scores,
    'communication_sentiment': communication_sentiment_scores,
    'support_sentiment': support_sentiment_scores,
    'recognition_sentiment': recognition_sentiment_scores,
    'leadership_sentiment': leadership_sentiment_scores,
    'remote_sentiment': remote_sentiment_scores,
    'worklife_balance_sentiment': worlife_balance_sentiment_scores,
}

# Assemble the scores next to the employee IDs taken from the loaded frame.
sentiments_df = pd.DataFrame({'employeeID': df['employeeID'], **topic_scores})

# Show the first few rows as a quick visual check.
sentiments_df.head()

Unnamed: 0,employeeID,salary_sentiment,manager_sentiment,benefits_sentiment,career_sentiment,environment_sentiment,communication_sentiment,support_sentiment,recognition_sentiment,leadership_sentiment,remote_sentiment,worklife_balance_sentiment
0,1,0.160105,0.421617,0.568891,0.919296,0.909902,0.989616,0.044831,0.987737,0.978631,0.052549,0.208987
1,2,0.942415,0.990034,0.988191,0.990024,0.990513,0.722745,0.045797,0.058123,0.439491,0.122708,0.103144
2,3,0.62751,0.341362,0.82477,0.351859,0.904192,0.954664,0.987931,0.601846,0.586047,0.981991,0.121754
3,4,0.269852,0.054303,0.062823,0.060268,0.077351,0.975878,0.983771,0.070521,0.981046,0.25536,0.106225
4,5,0.060565,0.035477,0.038455,0.087962,0.043418,0.205502,0.901055,0.058469,0.191105,0.388637,0.220065


In [8]:
# Per-topic score columns that feed the overall average.
topic_sentiment_columns = [
    'salary_sentiment',
    'manager_sentiment',
    'benefits_sentiment',
    'career_sentiment',
    'environment_sentiment',
    'communication_sentiment',
    'support_sentiment',
    'recognition_sentiment',
    'leadership_sentiment',
    'remote_sentiment',
    'worklife_balance_sentiment',
]

# Row-wise mean across the topic scores, stored as a new column.
row_means = sentiments_df[topic_sentiment_columns].mean(axis=1)
sentiments_df['avg_sentiment'] = row_means

In [9]:
# Save the sentiments DataFrame to a CSV file (row index omitted).
output_path = 'sentiments_changed.csv'
sentiments_df.to_csv(output_path, index=False)

# Report the path that was actually written; the message previously named a
# different file ('sentiments.csv') than the one saved above.
print(f"Sentiment scores have been saved to '{output_path}'.")

Sentiment scores have been saved to 'sentiments.csv'.


In [10]:
# Sanity check: every sentiment column, including the average, is expected to
# lie in [0, 1]. One line is printed per column.
columns_to_check = [
    'salary_sentiment', 'manager_sentiment', 'benefits_sentiment', 'career_sentiment', 'environment_sentiment',
    'communication_sentiment', 'support_sentiment', 'recognition_sentiment', 'leadership_sentiment',
    'remote_sentiment', 'worklife_balance_sentiment', 'avg_sentiment'
]

for col in columns_to_check:
    # NaN compares False on both sides, so missing values are not reported
    # as out of range by this check.
    out_of_range = (sentiments_df[col] < 0).any() or (sentiments_df[col] > 1).any()
    if out_of_range:
        print(f"Warning: {col} has values outside the [0, 1] range.")
    else:
        print(f"All values in {col} are within the [0, 1] range.")

All values in salary_sentiment are within the [0, 1] range.
All values in salary_sentiment are within the [0, 1] range.
All values in manager_sentiment are within the [0, 1] range.
All values in manager_sentiment are within the [0, 1] range.
All values in benefits_sentiment are within the [0, 1] range.
All values in benefits_sentiment are within the [0, 1] range.
All values in career_sentiment are within the [0, 1] range.
All values in career_sentiment are within the [0, 1] range.
All values in environment_sentiment are within the [0, 1] range.
All values in environment_sentiment are within the [0, 1] range.
All values in communication_sentiment are within the [0, 1] range.
All values in communication_sentiment are within the [0, 1] range.
All values in support_sentiment are within the [0, 1] range.
All values in support_sentiment are within the [0, 1] range.
All values in recognition_sentiment are within the [0, 1] range.
All values in recognition_sentiment are within the [0, 1] range