In [None]:
import sys
sys.path.append("F:\Programming\Projects\GlassDoor sentiment analysis\Phase 2\Src")

import data_preprocessing
from model import build_sentiment_model
from train import train_epoch, eval_model
from glassdoor_dataset_class import GlassdoorDataset

import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
# from transformers.optimization import AdamW


In [2]:
EPOCHS = 3
NUM_CLASSES = 2
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_SEQUENCE_LENGTH = 256
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_PATH = '..\\Models\\distilbert BERT model'

In [3]:
sentiment_data = pd.read_csv("..\..\\Data\\Sentiment_data_2.csv")
sentiment_data.head()

Unnamed: 0,headline,pros,cons,sentiment
0,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication.",Negative
1,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...,Negative
2,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very...",Negative
3,Over promised under delivered,Nice staff to work with,No career progression and salary is poor,Positive
4,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr...",Negative


In [4]:
sentiment_data['sentiment'].value_counts(normalize=1)

sentiment
Positive    0.840481
Negative    0.159519
Name: proportion, dtype: float64

## Data preprocessing

### compining headline, pros, cons into one column as full text

In [5]:
sentiment_data['text'] = data_preprocessing.compine_sentiment_text(sentiment_data)
print(sentiment_data['text'][0])

Young colleagues, poor micro management
Very friendly and welcoming to new staff. Easy going ethic.
Poor salaries, poor training and communication.


### Cleaning the generated text

In [6]:
sentiment_data['text'] = sentiment_data['text'].apply(data_preprocessing.clean_text)
print(sentiment_data['text'][0])

young colleagues poor micro management
very friendly and welcoming to new staff easy going ethic
poor salaries poor training and communication


### Encoding label

In [7]:
sentiment_data['sentiment_encoded'] = data_preprocessing.label_encoding(sentiment_data, label='sentiment')
sentiment_data.head()

Unnamed: 0,headline,pros,cons,sentiment,text,sentiment_encoded
0,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication.",Negative,young colleagues poor micro management\nvery f...,0
1,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...,Negative,excellent staff poor salary\nfriendly helpful ...,0
2,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very...",Negative,low salary bad micromanagement\neasy to get th...,0
3,Over promised under delivered,Nice staff to work with,No career progression and salary is poor,Positive,over promised under delivered\nnice staff to w...,1
4,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr...",Negative,client reporting admin\neasy to get the job ni...,0


### Data Sampling

In [8]:
sentiment_data_sample = sentiment_data.sample(n = 100000, random_state=42)

### Class weight for data balancing

In [9]:
# class_labels = np.unique(sentiment_data_sample['sentiment_encoded'])
# class_weights = compute_class_weight(
#     class_weight='balanced',
#     classes=class_labels,
#     y=sentiment_data_sample['sentiment_encoded']
# )
# class_weight_dict = dict(enumerate(class_weights))

# print(f"Calculated Class Weights: {class_weight_dict}")

### Spliting the data

In [10]:
X = sentiment_data_sample['text'].to_list()
y = sentiment_data_sample['sentiment_encoded'].to_list()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

### Tokenizing the text data

In [12]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)


In [13]:
train_dataset = GlassdoorDataset(X_train, y_train, tokenizer, MAX_SEQUENCE_LENGTH)
val_dataset = GlassdoorDataset(X_test, y_test, tokenizer, MAX_SEQUENCE_LENGTH)


In [14]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)



In [15]:
model = build_sentiment_model(PRE_TRAINED_MODEL_NAME, NUM_CLASSES).to(DEVICE)



In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)



In [17]:
class_weights = compute_class_weight(
    'balanced', classes=np.unique(sentiment_data_sample['sentiment_encoded']), y=sentiment_data_sample['sentiment_encoded']
)
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float)).to(DEVICE)




In [18]:
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'--- Epoch {epoch + 1}/{EPOCHS} ---')
    train_loss, train_accuracy = train_epoch(model, train_loader, loss_fn, optimizer, DEVICE, scheduler)
    print(f'Train loss: {train_loss} - train accuracy: {train_accuracy}')

    val_acc = eval_model(model, val_loader, DEVICE)
    print(f'Val accuracy: {val_acc:.4f}')

    if val_acc > best_accuracy:
        os.makedirs(MODEL_PATH, exist_ok=True)

        # Save model weights
        torch.save(model.state_dict(), os.path.join(MODEL_PATH, "best_model.pt"))

        # Save tokenizer in Hugging Face format
        tokenizer.save_pretrained(MODEL_PATH)

        best_accuracy = val_acc
        print("Best model saved.")


print("\nTraining complete.")
print(f"Best validation accuracy: {best_accuracy:.4f}")
print(f"Model and tokenizer saved to {MODEL_PATH}")

--- Epoch 1/3 ---
Train loss: 0.4994711431757722 - train accuracy: 0.892575
Val accuracy: 0.9041
Best model saved.
--- Epoch 2/3 ---
Train loss: 0.43979871402326387 - train accuracy: 0.921325
Val accuracy: 0.9050
Best model saved.
--- Epoch 3/3 ---
Train loss: 0.3686004652203177 - train accuracy: 0.9441125
Val accuracy: 0.9004

Training complete.
Best validation accuracy: 0.9050
Model and tokenizer saved to ..\Models\distilbert BERT model
