In [1]:
!pip install sentence-transformers datasets transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.

In [2]:
import pandas as pd
import re


# read input document
X = pd.read_csv('/content/train_set.csv')
X = X[['feedback', 'label']]


X['feedback'] = X['feedback'].apply(lambda x: x.lower())
X['feedback'] = X['feedback'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))


df = pd.read_csv('/content/test_set.csv')
df = df[['feedback', 'label']]

df['feedback'] = df['feedback'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))



V = pd.read_csv('/content/validation_set.csv')
V = V[['feedback', 'label']]

V['feedback'] = V['feedback'].apply(lambda x: x.lower())
V['feedback'] = V['feedback'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))



In [3]:
print("Train Set")
print(X.shape)
print(X['label'].value_counts())

print("Test Set")
print(df.shape)
print(df['label'].value_counts())

print("Validation Set")
print(V.shape)
print(V['label'].value_counts())

print("Concatenated Dataset")
dataset=pd.concat([X,df,V])
print(dataset.shape)

Train Set
(656, 2)
label
0    97
4    88
8    87
3    80
1    72
2    70
5    66
7    65
6    31
Name: count, dtype: int64
Test Set
(225, 2)
label
0    25
1    25
2    25
3    25
4    25
5    25
6    25
7    25
8    25
Name: count, dtype: int64
Validation Set
(116, 2)
label
0    17
4    16
8    16
3    14
1    13
2    12
5    12
7    11
6     5
Name: count, dtype: int64
Concatenated Dataset
(997, 2)


In [4]:

print(dataset['label'].value_counts())
print(dataset.info())
sampled_df = dataset.groupby('label').sample(n=150, random_state=1,replace=True,)

print(sampled_df['label'].value_counts())
print(sampled_df.info())


label
0    139
4    129
8    128
3    119
1    110
2    107
5    103
7    101
6     61
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 997 entries, 0 to 115
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   feedback  997 non-null    object
 1   label     997 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB
None
label
0    150
1    150
2    150
3    150
4    150
5    150
6    150
7    150
8    150
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 37 to 589
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   feedback  1350 non-null   object
 1   label     1350 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.6+ KB
None


In [5]:
import random
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')


def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            break

    sentence = ' '.join(new_words)
    return sentence

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.add(l.name())
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def random_insertion(sentence, n):
    words = sentence.split()
    for _ in range(n):
        new_word = get_synonyms(random.choice(words))
        if new_word:
            words.insert(random.randint(0, len(words)), random.choice(new_word))
    return ' '.join(words)

def random_swap(sentence, n):
    words = sentence.split()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def random_deletion(sentence, p):
    words = sentence.split()
    if len(words) == 1:  # return if single word
        return sentence

    new_words = []
    for word in words:
        if random.uniform(0, 1) > p:
            new_words.append(word)
    if len(new_words) == 0:  # ensure at least one word remains
        new_words.append(random.choice(words))
    return ' '.join(new_words)

# Example usage:
sentence = "This is a sample sentence for data augmentation."

sentence = "john has not progressed in his position he is continuously late leaves early and takes many breaks throughout the day he calls out at least every other week and its always on fridays his performance has significantly declined my suggestion is he is not suitable for this position"

print("Original Sentence:", sentence)
print("Synonym Replacement:", synonym_replacement(sentence, 10))
print("Random Insertion:", random_insertion(sentence, 10))
print("Random Swap:", random_swap(sentence, 10))
print("Random Deletion:", random_deletion(sentence, 0.4))




[nltk_data] Downloading package wordnet to /root/nltk_data...


Original Sentence: john has not progressed in his position he is continuously late leaves early and takes many breaks throughout the day he calls out at least every other week and its always on fridays his performance has significantly declined my suggestion is he is not suitable for this position
Synonym Replacement: john has non progressed Hoosier_State his position he equal continuously late leaves early_on and takes many breaks throughout the day he calls out At least every other week and information_technology invariably on Fri his carrying_out has significantly slump my suggestion equal he equal non suitable for this position
Random Insertion: john has not constantly progressed in his position he is continuously importantly late leaves early and takes many breaks throughout the day fall_apart he calls week recently out at least every other week and its always on fridays his performance leave_alone has significantly declined my suggestion is office he workweek is not suitable for 

In [6]:
import numpy as np
# Original data
original_sentences = list(sampled_df['feedback'].values)
original_labels = list(sampled_df['label'].values)

# Augmented data
augmented_sentences = []
for sentence in original_sentences:
    augmented_sentences.append(synonym_replacement(sentence, 10))
    augmented_sentences.append(random_insertion(sentence, 10))
    augmented_sentences.append(random_swap(sentence, 10))
    augmented_sentences.append(random_deletion(sentence, 0.3))

# Combine original and augmented data
all_sentences = original_sentences + augmented_sentences

# Repeat labels for the augmented sentences
augmented_labels = np.repeat(original_labels, 4)  # assuming each sentence generates 4 augmented versions
all_labels = np.concatenate((original_labels, augmented_labels))

print(len(all_sentences),len(all_labels))

6750 6750


In [7]:
m=0
for i in all_sentences:
  m=max(m,len(i.split()))
print("Maximum Words",m)

Maximum Words 129


In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(all_sentences,all_labels,test_size=0.3,random_state=50)

In [9]:
print("Training Samples",len(x_train))
print("Testing Samples",len(x_test))

Training Samples 4725
Testing Samples 2025


In [10]:
x_train[:5],y_train[:5]

(['people in heidis category give_the_axe sometimes be tough to judge she represent very originative and pose a lot of guess into her estimation and proposals however sometimes as in baseball if a player guide big swings for a home run they are prone to striking out still you cant teach creativity and international the box thinking so we would like to work with heidi on being a smarter risk taker and knowing when to hit 1 or take a walk when the situation warranty',
  'i am writing today to provide feedback on my colleague talon miller mr miller performs adequately at his job in the lab and completes the duties assigned to him however the quality of his work lacks',
  'morgan tail be in_force characterise as A middleoftheroad employee shes bear_witness both good and bad in her job functioning thus Former_Armed_Forces she has also flashed some possibility for advance single would rate her as solid overall but zippo especially impressive',
  'needs likewise as_well to retrain apply himse

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [13]:
from datasets import Dataset
train_df = pd.DataFrame({'text': x_train, 'label': y_train})
train_dataset = Dataset.from_pandas(train_df)
test_df = pd.DataFrame({'text': x_test, 'label': y_test})
test_dataset = Dataset.from_pandas(test_df)

In [14]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4725 [00:00<?, ? examples/s]

Map:   0%|          | 0/2025 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [16]:
id2label = {
      0: "Risk (Low performance, Low potential)",
      1: "Average performer (Moderate performance, Low potential)",
      2: "Solid Performer (High performance, Low potential)",
      3: "Inconsistent Player (Low performance, Moderate potential)",
      4: "Core Player (Moderate performance, Moderate potential)",
      5: "High Performer (High performance, Moderate potential)",
      6: "Potential Gem (Low performance, High potential)",
      7: "High Potential (Moderate performance, High potential)",
      8: "Star (High performance, High potential)"
            }
label2id = {
      "Risk (Low performance, Low potential)":0,
      "Average performer (Moderate performance, Low potential)":1,
      "Solid Performer (High performance, Low potential)":2,
      "Inconsistent Player (Low performance, Moderate potential)":3,
      "Core Player (Moderate performance, Moderate potential)":4,
      "High Performer (High performance, Moderate potential)":5,
      "Potential Gem (Low performance, High potential)":6,
      "High Potential (Moderate performance, High potential)":7,
      "Star (High performance, High potential)":8
}

In [17]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [18]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=9, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [19]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_train,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_test,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [20]:
import tensorflow as tf

model.compile(optimizer=optimizer,metrics=['accuracy'])  # No loss argument!

In [21]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x78d4531eeb60>

In [24]:
model.save_pretrained('/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc')
tokenizer.save_pretrained('/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc')

('/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc/tokenizer_config.json',
 '/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc/special_tokens_map.json',
 '/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc/vocab.txt',
 '/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc/added_tokens.json',
 '/content/drive/MyDrive/employee_retention_model_sampled150_70_30_5epochs_99acc_96val_acc/tokenizer.json')

In [None]:
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification
import tensorflow as tf
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/employee_retention_model_5epochs_99acc_97val_acc")
model = TFAutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/employee_retention_model_5epochs_99acc_97val_acc")

Some layers from the model checkpoint at /content/drive/MyDrive/employee_retention_model_5epochs_99acc_97val_acc were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/employee_retention_model_5epochs_99acc_97val_acc and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to us

In [None]:
text=input("Enter Feedback:")
from transformers import AutoTokenizer


inputs = tokenizer(text, return_tensors="tf")

from transformers import TFAutoModelForSequenceClassification

logits = model(**inputs).logits

predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
model.config.id2label[predicted_class_id]

Enter Feedback:Lacey was very tough to deal with. She never showed on time and did not perform well. There is some potential for improvement if she works harder. Low risk medium reward teammate


'Inconsistent Player (Low performance, Moderate potential)'