In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset, DatasetDict
import torch




In [4]:
# Location of the raw review CSV, named once so it is easy to repoint.
DATA_PATH = r'C:\Users\dell\Desktop\MyDocs\Docs\MK\Dataset-SA.csv'  # Replace with your actual filename
# Read the whole file into a DataFrame for inspection and preprocessing.
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows to sanity-check the columns and sample values.
df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [6]:
# Show per-column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205052 entries, 0 to 205051
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   205052 non-null  object
 1   product_price  205052 non-null  object
 2   Rate           205052 non-null  object
 3   Review         180388 non-null  object
 4   Summary        205041 non-null  object
 5   Sentiment      205052 non-null  object
dtypes: object(6)
memory usage: 9.4+ MB


In [7]:
# Summarize each column (count, unique values, most frequent value and its frequency).
df.describe()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
count,205052,205052,205052,180388,205041,205052
unique,958,525,8,1324,92923,3
top,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,wonderful,good,positive
freq,6005,9150,118765,9016,17430,166581


In [8]:
# Coerce ratings to numbers (unparseable entries become NaN), then discard
# every row that lacks either a review text or a usable rating.
df = (
    df
    .assign(Rate=pd.to_numeric(df['Rate'], errors='coerce'))
    .dropna(subset=['Review', 'Rate'])
)

In [9]:
def map_sentiment(rating):
    """Translate a numeric star rating into a sentiment class name.

    Args:
        rating: Numeric rating to classify.

    Returns:
        "positive" when ``rating >= 4``, "negative" when ``rating <= 2``,
        and "neutral" for anything else (including values for which both
        comparisons are false, such as NaN).
    """
    # Guard clauses: handle the two extremes first, fall through to neutral.
    if rating >= 4:
        return "positive"
    if rating <= 2:
        return "negative"
    return "neutral"

# Overwrite the Sentiment column with the class derived from each row's rating.
df['Sentiment'] = df['Rate'].map(map_sentiment)

In [10]:
# Keep only the model input and its target, renamed to 'text' / 'label'.
df = df[['Review', 'Sentiment']].rename(
    columns={'Review': 'text', 'Sentiment': 'label'}
)

# Print label distribution
print(df['label'].value_counts())

label
positive    142616
negative     23745
neutral      14024
Name: count, dtype: int64


In [11]:
# Checkpoint name shared by the tokenizer here and the classifier loaded in a later cell.
model_name = "bert-base-uncased"
# Load the tokenizer that matches the checkpoint.
# NOTE(review): clean_up_tokenization_spaces is set explicitly, presumably to pin
# decode behavior regardless of the library default; confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

In [12]:
# Tokenize every review in a single batch. Each sequence is padded or
# truncated to exactly 128 tokens and the result is returned as PyTorch tensors.
review_texts = df['text'].tolist()
tokenized_text = tokenizer(
    review_texts,
    max_length=128,  # Adjust if necessary
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

In [13]:
# Work on a copy so `df` keeps only the raw text/label columns.
tokenized_data = df.copy()
# Tensor.tolist() converts the whole batch tensor to nested Python lists in one
# call. The previous per-row comprehension produced the same lists but built a
# separate row tensor for every review in a Python-level loop first.
tokenized_data['input_ids'] = tokenized_text['input_ids'].tolist()
tokenized_data['attention_mask'] = tokenized_text['attention_mask'].tolist()

In [14]:
# Integer class ids, assigned in the order negative -> neutral -> positive.
label_mapping = {
    name: idx for idx, name in enumerate(('negative', 'neutral', 'positive'))
}
# Swap the string labels for their integer ids.
tokenized_data['label'] = tokenized_data['label'].map(label_mapping)

In [15]:
# Build a Dataset from the full (unsplit) tokenized frame.
# NOTE(review): `tokenized_datasets` is not referenced by any later visible cell;
# the train/validation/test splits are built from `tokenized_data` directly.
# Confirm whether this cell is still needed.
tokenized_datasets = Dataset.from_pandas(tokenized_data)

In [16]:
# The label distribution is heavily skewed towards the positive class, so both
# splits are stratified on 'label' to keep the class proportions the same in the
# train, validation and test sets. train_test_split accepts a pandas Series for
# `stratify`, so this works directly on the DataFrame.
# Split 1: hold out 20% of all rows as the test set.
train_valid_df, test_df = train_test_split(
    tokenized_data,
    test_size=0.2,
    random_state=42,
    stratify=tokenized_data['label'],
)
# Split 2: carve 10% of the remaining rows out as the validation set.
train_df, valid_df = train_test_split(
    train_valid_df,
    test_size=0.1,
    random_state=42,
    stratify=train_valid_df['label'],
)

In [17]:
# Convert each split to a Dataset. After row filtering and shuffled splitting the
# frames carry a non-contiguous pandas index; preserve_index=False stops
# from_pandas from materializing it as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
valid_dataset = Dataset.from_pandas(valid_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Combine the three splits into a DatasetDict keyed by split name.
data_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset,
})

In [18]:
# Load the pretrained checkpoint with a 3-class classification head, one output
# per entry in label_mapping (negative / neutral / positive). The head weights are
# newly initialized (see the warning below), so the model must be fine-tuned
# before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Hyperparameters, logging and checkpointing settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,              # Adjust as needed
    per_device_train_batch_size=16,  # Adjust as needed
    per_device_eval_batch_size=64,   # Adjust as needed
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule (once per epoch) so the best
    # checkpoint can be reloaded when training finishes.
    eval_strategy="epoch",  # Corrected argument name
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # "f1" is the weighted F1 returned by compute_metrics
    report_to="none"  # Disable external experiment trackers such as WandB
    # push_to_hub=True,  # Uncomment if using Hugging Face Hub. Requires login.
    # push_to_hub_model_id="your-model-id" # Use your model id if push_to_hub=True. Requires login.
)


In [20]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1 to obtain one predicted class id per
            example; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` is the
        support-weighted average across classes.
    """
    scores, references = eval_pred
    # Highest-scoring class per example.
    predicted_ids = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(references, predicted_ids),
        "f1": f1_score(references, predicted_ids, average='weighted'),  # Use weighted F1
    }

In [21]:
# Wire the model, training configuration, data splits and metric function together.
# No tokenizer or data collator is passed: the inputs were already tokenized and
# padded to a fixed length in an earlier cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_dict['train'],  # Access train set from DatasetDict
    eval_dataset=data_dict['validation'],  # Access validation set from DatasetDict
    compute_metrics=compute_metrics  # Pass the metrics function
)

In [22]:
# Run fine-tuning with the configuration above.
# NOTE(review): the recorded output shows this run ended with a KeyboardInterrupt
# before any epoch was logged, so the cells after it have no recorded results.
trainer.train()

Epoch,Training Loss,Validation Loss



KeyboardInterrupt



In [None]:
# Run inference on the held-out test split.
predictions = trainer.predict(data_dict['test']) #Make sure to pass a Dataset object to predict.

# Reference ids, and predicted ids taken as the highest-scoring class per example.
true_labels = predictions.label_ids
pred_labels = np.argmax(predictions.predictions, axis=-1)

# Score with the same metrics used during validation.
test_accuracy = accuracy_score(true_labels, pred_labels)
test_f1 = f1_score(true_labels, pred_labels, average='weighted')

# Report F1 first, then accuracy.
print(f"Test F1: {test_f1}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Write the trainer's current model to disk.
# NOTE(review): this is a Kaggle-style absolute path, whereas the dataset is read
# from a local Windows path earlier in the notebook. Confirm the target directory
# for the environment this notebook actually runs in.
trainer.save_model("/kaggle/working/best_flipkart_model")