In [1]:
import torch

In [2]:
import polars as pl
import gc
import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 

In [3]:
df_lazy = (pl.scan_csv('cleaned_amazon_reviews.csv')  # Lazy loading the CSV
           .filter(pl.col('language') == 'english')  # Filter English reviews
           .drop_nulls(subset=['cleaned_text', 'title'])  # Drop null values
           .unique(subset=['cleaned_text'])  # Drop duplicates based on 'cleaned_text'
           .with_columns(pl.col('label').replace({2: 1, 1: 0}))  # Replace label values
           .select(['text', 'label'])  # Select relevant columns
          )

In [4]:
df = df_lazy.collect()
gc.collect()

0

In [5]:
df_pandas = df.to_pandas()

In [6]:
dataset = Dataset.from_pandas(df_pandas)

In [7]:
del df_pandas 
del df
gc.collect()

5

In [8]:
train_test_split = dataset.train_test_split(test_size=0.3)
val_test_split = train_test_split['test'].train_test_split(test_size=2/3)
final_splits = {
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
}

In [12]:
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return distilbert_tokenizer(examples['text'], truncation=True, padding='max_length')
tokenized_datasets = {
    'train': final_splits['train'].map(tokenize_function, batched=True),
    'validation': final_splits['validation'].map(tokenize_function, batched=True),
    'test': final_splits['test'].map(tokenize_function, batched=True)
}

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/2787731 [00:00<?, ? examples/s]

Map:   0%|          | 0/398247 [00:00<?, ? examples/s]

Map:   0%|          | 0/796496 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets['train'].save_to_disk('datasets/train')

Saving the dataset (0/17 shards):   0%|          | 0/2787731 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets['test'].save_to_disk('datasets/test')

Saving the dataset (0/5 shards):   0%|          | 0/796496 [00:00<?, ? examples/s]

In [15]:
tokenized_datasets['validation'].save_to_disk('datasets/validation')

Saving the dataset (0/3 shards):   0%|          | 0/398247 [00:00<?, ? examples/s]

In [16]:
for split in tokenized_datasets:
    tokenized_datasets[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


In [19]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # Adjust num_labels as needed

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16, #Might have done it in 32 but this would use shared memory which destroys the performance.
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the hugging face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=distilbert_tokenizer
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1528,0.119227


TrainOutput(global_step=174234, training_loss=0.13886891061075074, metrics={'train_runtime': 67534.089, 'train_samples_per_second': 41.279, 'train_steps_per_second': 2.58, 'total_flos': 3.6928347372268954e+17, 'train_loss': 0.13886891061075074, 'epoch': 1.0})

In [21]:
trainer.save_model(r"D:\PythonProjects\DEPI Grad Project\FinetunedModel")

In [23]:
# Step 11: Evaluate the model on the test dataset
test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
# Step 12: Print the test results
print(test_results)

{'eval_loss': 0.11957396566867828, 'eval_runtime': 5763.0611, 'eval_samples_per_second': 138.207, 'eval_steps_per_second': 8.638, 'epoch': 1.0}


In [24]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [35]:
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
device = torch.device("cuda")
sentence = "I liked this work more than anything but it had a couple flaws"

# Tokenize the input sentence
inputs = distilbert_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Move the inputs to the correct device (CPU or GPU)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Make predictions
with torch.no_grad():  # No need to calculate gradients during inference
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

# Convert logits to probabilities
probs = torch.softmax(logits, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probs, dim=-1).item()

In [36]:
labels = ["Negative", "Positive"]  # Example for binary classification
predicted_label = labels[predicted_class]

print(f"Predicted label: {predicted_label}")

Predicted label: Positive


In [38]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import  MarianMTModel, MarianTokenizer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset
import torch
import sounddevice as sd
import scipy.io.wavfile as wav
import soundfile as sf
import numpy as np
import whisper
from deep_translator import GoogleTranslator

translator = GoogleTranslator(source='ar', target='en')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

whisper_model = whisper.load_model("medium").to(device)
model_directory = r"D:\PythonProjects\DEPI Grad Project\FinetunedModel"
tokenizer_bert = DistilBertTokenizer.from_pretrained(model_directory)
model_bert = DistilBertForSequenceClassification.from_pretrained(model_directory).to(device)

  checkpoint = torch.load(fp, map_location=device)


In [39]:
def record_audio(save_output=False, output_filename='recorded_audio.wav', duration=5):
    fs = 16000  # Sample rate (Whisper requires 16kHz)
    print("Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()  # Wait for the recording to finish
    print("Recording finished")
    if save_output:
        wav.write(output_filename, fs, (audio * 32767).astype(np.int16))
    return audio

In [40]:
def transcribe(audio, model_size="medium"):
    audio = np.squeeze(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device) # whisper (and most ASR models) uses log scaled mel spectrogram 
    options = whisper.DecodingOptions(language="en")  # Specify 'en' for English transcription
    result = whisper.decode(whisper_model, mel, options)
    return result.text  # Return only the transcribed text

In [46]:
def classify_text(transcription):
    if(any('\u0600' <= char <= '\u06FF' for char in transcription)):
        transcription = translator.translate(transcription)
    inputs = tokenizer_bert(transcription, return_tensors="pt", truncation=True, padding=True)
    
    # Move inputs to GPU if available
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model_bert(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1)
    
    # Define class labels (adjust as per your fine-tuned model's output)
    labels = ['negative', 'positive']
    return labels[predicted_class.item()]

In [47]:
audio = record_audio(3)
trans = transcribe(audio = audio)

Recording...
Recording finished


In [53]:
df['content'] = df['content'].str[0:-10]

In [54]:
df['sentiment'] = df['content'].apply(classify_text)