In [1]:
import pandas as pd
# Loading the dataset from the Excel file
file_path = 'Dataset_Sentiment_Analysis_Task.xlsx'
data = pd.read_excel(file_path)

# Displaying the first few rows of the dataset to understand its structure
print(data.head())
print(data.info())

                                   conversation_text sentiment
0  company_x এজেন্ট থেকে রং নাম্বারে টাকা গেলে ফে...   neutral
1  company_x গ্রাহকদের জন্য খুবই দুঃখ জনক একটা খব...  negative
2  company_x গ্রাহকদের জন্য খুবই দুঃখ জনক একটা খব...  negative
3  bank_y ব্যাংক যখন ঘোষনা দিয়েছে যে কার্ড দিয়ে ক...  negative
4  bank_y ব্যাংক যখন ঘোষনা দিয়েছে যে কার্ড দিয়ে ক...  negative
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   conversation_text  99 non-null     object
 1   sentiment          99 non-null     object
dtypes: object(2)
memory usage: 1.7+ KB
None


In [2]:
#importing necessary libraries 
import warnings
warnings.simplefilter("ignore")
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, KFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
#importing necessary libraries for Bangla Natural Language Processing Toolkit
#from banglanltk import word_tokenize, stemmer
from bnlp import CleanText, BengaliCorpus
from bnlp import NLTKTokenizer

# Initializing CleanText from bnlp
cleaner = CleanText(
   fix_unicode=True,
   unicode_norm=True,
   unicode_norm_form="NFKC",
   remove_url=False,
   remove_email=False,
   remove_emoji=False,
   remove_number=False,
   remove_digits=False,
   remove_punct=False,
   replace_with_url="<URL>",
   replace_with_email="<EMAIL>",
   replace_with_number="<NUMBER>",
   replace_with_digit="<DIGIT>",
   replace_with_punct="<PUNC>"
)

tokenizer = NLTKTokenizer()

# Defining the preprocessing function
def preprocess_text(text):
    
    # Cleaning the text using BNLP's CleanText
    cleaned_text = cleaner(text)
    
    # Further cleaning
    cleaned_text = re.sub(r'See Translation', '', cleaned_text, flags=re.IGNORECASE) # Remove unwanted phrases
    cleaned_text = re.sub(r'।+', '।', cleaned_text)  # Replace multiple '।' with a single one
    cleaned_text = re.sub(r',+', ',', cleaned_text)  # Replace multiple ',' with a single one
    cleaned_text = re.sub(r'\?+', '?', cleaned_text)   # Replace multiple '?' with a single one
    cleaned_text = re.sub(r'…+', '…', cleaned_text)  # Replace multiple '…' with a single one
    cleaned_text = re.sub(r'\.+', '', cleaned_text)    # Remove specific punctuation marks (.)

    # Removing digits and non-Bengali letters if needed
    # cleaned_text = re.sub(f"[{BengaliCorpus.digits}]", '', cleaned_text)  # Remove Bengali digits
    # cleaned_text = re.sub(f"[{BengaliCorpus.letter}]", '', cleaned_text)  # Remove Bengali letters if required

    # Tokenizing the text using BNLP's NLTKTokenizer
    tokens = tokenizer.sentence_tokenize(cleaned_text)
    
    # Removing stopwords
    stopwords = BengaliCorpus.stopwords if hasattr(BengaliCorpus, 'stopwords') else []
    tokens = [token for token in tokens if token.lower() not in stopwords] # Lowercase the tokens
    #tokens = [token.lower() for token in tokens]
    
    # Applying stemming
    # stemmed_tokens = [stemmer(token) for token in tokens] #not needed

    # Reassembling the text
    #Joining tokens into a single string                                       
    cleaned_text = ' '.join(tokens)    #cleaned_text = ' '.join(stemmed_tokens)
    
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip() # Remove extra spaces
    
    return cleaned_text

# Applying preprocessing
data['processed_text'] = data['conversation_text'].apply(preprocess_text)

# Printing processed text by sentiment
sentiments = ['neutral', 'positive', 'negative']  # Adjust based on your data
for sentiment in sentiments:
    print(f"Sentiment: {sentiment}")
    subset = data[data['sentiment'] == sentiment]
    for index, row in subset.iterrows():
        print(f"Processed Text: {row['processed_text']}")
    print("\n" + "=" * 80 + "\n")

# Feature Engineering with TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['processed_text'])
y = LabelEncoder().fit_transform(data['sentiment'])

#Generating vocabulary size for LSTM's max_features (unique word count)
vocab_size = len(tfidf_vectorizer.get_feature_names_out())
print(f"Vocabulary Size: {vocab_size}")

# Models for comparison
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# TensorFlow/Keras for LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import History

punkt not found. downloading...


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aliah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sentiment: neutral
Processed Text: company_x এজেন্ট থেকে রং নাম্বারে টাকা গেলে ফেরত নেওয়ার কোন উপায় আছে? বি:দ্র: জিডি করা হয়ছে।যার কাছে টাকা গেছে তার ফোন বন্দ্ব।
Processed Text: আচ্ছা company_xের সিমটা অন্য মোবাইলে রেখে এ্যাপ্সের মাধ্যমে লেনদেন করার কোন অপশন আছে?
Processed Text: আমার company_xের বিনিময় একাউন্টের পিনটি ভুলে গেছি 🥺 পিনটি রিসেট করার কোন পদ্ধতি আছে? প্লিজ একটা সমাধান দিয়েন যারা জানেন
Processed Text: company_x Personal loan ta ki babe pabo 🤔🤔🤔
Processed Text: Ami company_x theke 10 hazar er ekta loan niyechi 1st kisti diyechi ekhn baki 2 ta kisti dile ki sthe sthe porer loan ta nite parbo 10 hazar er por koto loan dey
Processed Text: company_xে লোন কিভাবে পাবো? আমার পারসোনাল company_x থেকে তো প্রতি মাস এ পেমেন্ট হয় অনেক টাকা।
Processed Text: আসসালামু আলাইকুম নরমাল সিম থেকে ই-সিমে সিফট হলে কী company_x একাউন্টে কোন সমস্যা হয় ? আইফোন ফ্যাক্ট।
Processed Text: আমি কিভাবে company_xে লোন নিতে পারি
Processed Text: এক বছর বা তার বেশি সময় আমি যদি company_x একাউন্টে কোন ক্যাশ

In [3]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Ensuring X_train_text and X_test_text are lists
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(data['processed_text'], data['sentiment'], test_size=0.2, random_state=42)
X_train_text = X_train_text.tolist()
X_test_text = X_test_text.tolist()

In [5]:
!pip install unsloth
!pip install transformers datasets torch
!pip install transformers[torch]
!pip install accelerate -U



In [6]:
import torch
torch.cuda.is_available() #Checking if pytoch getting the access of cuda cores from env

True

In [7]:
from huggingface_hub import login
login(token="") #hidden for security

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\aliah\.cache\huggingface\token
Login successful


In [8]:
#imported "cognitivecomputations/dolphin-2.9.4-llama3.1-8b" model

In [9]:
# Importing necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Function to fine-tune the Dolphin 2.9.4 Llama 3.1 8b model
def fine_tune_llama(X_train, X_test, y_train, y_test):
    # Specifying the model name (Dolphin 2.9.4 Llama 3.1 8b)
    model_name = "cognitivecomputations/dolphin-2.9.4-llama3.1-8b"

    # Step 1: Loading the model and tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load the tokenizer for tokenizing text data
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(np.unique(y_train)))  # Load the model for classification
    except OSError as e:
        print(f"Error loading model or tokenizer: {e}")
        return None, None

    # Ensureing the inputs are lists of strings
    X_train = X_train.tolist() if isinstance(X_train, pd.Series) else X_train
    X_test = X_test.tolist() if isinstance(X_test, pd.Series) else X_test


    # Step 2: Tokenizing the training and test data
    train_encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors="pt", max_length=512) # Tokenize training data
    val_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors="pt", max_length=512) # Tokenize test data
 
    # Convert string labels to numerical format
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Step 3: Creating custom dataset objects
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings  # Encoded input data
            self.labels = labels  # Labels for sentiment classification

        def __getitem__(self, idx):
            # Returns a dictionary of the tokenized input along with the label at the given index
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure label is of type long
            return item

        def __len__(self):
            # Returns the total number of samples in the dataset
            return len(self.labels)

     # Debugging: Checking if the lengths match
    print(f"Train encodings length: {len(train_encodings['input_ids'])}, Train labels length: {len(y_train_encoded)}")
    print(f"Val encodings length: {len(val_encodings['input_ids'])}, Val labels length: {len(y_test_encoded)}")

    # Step 4: Preparing datasets for training and validation
    train_dataset = CustomDataset(train_encodings, y_train_encoded)  # Training dataset
    val_dataset = CustomDataset(val_encodings, y_test_encoded)  # Validation dataset

    # Step 5: Defining training arguments
    training_args = TrainingArguments(
        output_dir='./results',  # Directory to save model checkpoints and results
        evaluation_strategy="epoch",  # Evaluating model at the end of each epoch
        per_device_train_batch_size=2,  # Training batch size per GPU/CPU
        per_device_eval_batch_size=2,  # Evaluation batch size per GPU/CPU
        num_train_epochs=3,  # Number of training epochs
        weight_decay=0.01,  # Weight decay for regularization
        gradient_accumulation_steps=4,  # Accumulating gradients over 4 steps
        fp16=True,  # Enabling mixed precision training
        no_cuda=True,  # Forcing training on CPU
    )

    # Step 6: Creating a Trainer instance for training and evaluation
    trainer = Trainer(
        model=model,  # The model to fine-tune
        args=training_args,  # Training arguments defined earlier
        train_dataset=train_dataset,  # Training dataset
        eval_dataset=val_dataset,  # Validation dataset
    )

    # Step 7: Training the model
    trainer.train()  # Fine-tuning the model using the provided dataset

    # Step 8: Evaluating the model on the test data
    eval_results = trainer.evaluate()  # Evaluating and getting evaluation results

    # Step 9: Extracting evaluation metrics (accuracy and classification report)
    accuracy = eval_results.get('eval_accuracy', None)  # Getting evaluation accuracy
    return accuracy  # Return the accuracy

In [None]:
# X contains the text data and y contains the sentiment labels
# Resetting indices after splitting to avoid KeyError issues
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(data['processed_text'], data['sentiment'], test_size=0.2, random_state=42)

# Resetting indices to ensure alignment
X_train_text.reset_index(drop=True, inplace=True)
X_test_text.reset_index(drop=True, inplace=True)
y_train_text.reset_index(drop=True, inplace=True)
y_test_text.reset_index(drop=True, inplace=True)

# Fine-tuning the model
llama_accuracy = fine_tune_llama(X_train_text, X_test_text, y_train_text, y_test_text)

print("Llama Model Accuracy:", llama_accuracy)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at cognitivecomputations/dolphin-2.9.4-llama3.1-8b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train encodings length: 79, Train labels length: 79
Val encodings length: 20, Val labels length: 20
