In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [4]:


df = pd.read_csv(r"C:\Users\mupar\mubarak\sentimental.csv")


In [3]:
df = df[['Text', 'Score']].dropna()

In [19]:
# Map numerical scores to sentiment labels
def score_to_label(score):
    if score >= 4:
        return 1  # Positive
    elif score == 3:
        return 2  # Neutral
    else:
        return 0  # Negative

df['label'] = df['Score'].apply(score_to_label)


In [20]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

In [21]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [22]:

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)



In [23]:

train_dataset = Dataset.from_dict({**train_encodings, 'labels': train_labels})
test_dataset = Dataset.from_dict({**test_encodings, 'labels': test_labels})


In [24]:

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
pip install --upgrade transformers


Note: you may need to restart the kernel to use updated packages.


In [26]:

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


In [27]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    cm = confusion_matrix(labels, predictions)
    print("Confusion Matrix:\n", cm)
    return {'accuracy': acc, 'f1': f1}

In [28]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [29]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [30]:

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5073,0.415614,0.84989,0.81633
2,0.5042,0.424962,0.838852,0.812612
3,0.247,0.552116,0.84989,0.8246


Confusion Matrix:
 [[ 52  12   0]
 [ 18 333   0]
 [ 15  23   0]]




Confusion Matrix:
 [[ 46  15   3]
 [ 13 333   5]
 [ 12  25   1]]




Confusion Matrix:
 [[ 42  15   7]
 [  6 341   4]
 [  9  27   2]]


TrainOutput(global_step=681, training_loss=0.4456386355841847, metrics={'train_runtime': 5132.8554, 'train_samples_per_second': 1.057, 'train_steps_per_second': 0.133, 'total_flos': 356979129502464.0, 'train_loss': 0.4456386355841847, 'epoch': 3.0})

In [31]:

results = trainer.evaluate()
print(results)



Confusion Matrix:
 [[ 52  12   0]
 [ 18 333   0]
 [ 15  23   0]]
{'eval_loss': 0.41561371088027954, 'eval_accuracy': 0.8498896247240618, 'eval_f1': 0.8163304759663454, 'eval_runtime': 95.0023, 'eval_samples_per_second': 4.768, 'eval_steps_per_second': 0.6, 'epoch': 3.0}


In [6]:
X = df['Text']
y = df['Score']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


tfidf = TfidfVectorizer(stop_words='english', max_features=500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)


y_pred = lr.predict(X_test_tfidf)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6622516556291391

Classification Report:
               precision    recall  f1-score   support

           1       0.81      0.30      0.44        43
           2       0.00      0.00      0.00        21
           3       0.25      0.05      0.09        38
           4       0.25      0.05      0.08        66
           5       0.68      0.99      0.80       285

    accuracy                           0.66       453
   macro avg       0.40      0.28      0.28       453
weighted avg       0.56      0.66      0.57       453



In [12]:
model_name = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_name)
print(f"Model and tokenizer saved to {model_path}")
model_path = r"C:\Users\mupar\mubarak\sentiment_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model and tokenizer saved to {model_path}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer saved to 
Model and tokenizer saved to C:\Users\mupar\mubarak\sentiment_model


In [13]:
loaded_model = BertForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = BertTokenizer.from_pretrained(model_path)
loaded_model.eval()
print("Model and tokenizer loaded successfully")

Model and tokenizer loaded successfully


In [14]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import os


model_path = r"C:\Users\mupar\mubarak\sentiment_model"  


if not os.path.exists(model_path):
    raise FileNotFoundError(f"The path {model_path} does not exist.")


tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()  # Set model to evaluation mode


label_map = {0: "Negative", 1: "Positive", 2: "Neutral"}


while True:
    review = input("Enter an app review (or 'quit' to exit): ")
    if review.lower() == "quit":
        break

    
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True)

    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred_label = torch.argmax(logits, dim=1).item()

    
    sentiment = label_map[pred_label]
    print(f"Sentiment: {sentiment}")


Enter an app review (or 'quit' to exit):  dog food is good


Sentiment: Neutral


Enter an app review (or 'quit' to exit):  quit
