### Sentiment analysis of financial news

In [8]:
!pip install streamlit scikit-learn pandas datasets joblib matplotlib seaborn


Collecting streamlit
  Using cached streamlit-1.44.1-py3-none-any.whl (9.8 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Collecting joblib
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Using cached GitPython-3.1.44-py3-none-any.whl (207 kB)
Collecting altair<6,>=4.0
  Using cached altair-5.5.0-py3-none-any.whl (731 kB)
Collecting protobuf<6,>=3.20
  Using cached protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Collecting tenacity<10,>=8.1.0
  Using cached tenacity-9.1.2-py3-none-any.whl (28 kB)
Collecting toml<2,>=0.10.1
  Using cached toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting pydeck<1,>=0.8.0b4
  Using cached pydeck-0.9.1-

In [2]:
!pip install googletrans==4.0.0-rc1 datasets

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m530.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting hstspreload
  Downloading hstspreload-2025.1.1-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m555.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting idna==2.*
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m809.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting chardet==3.*
  Downloading chardet-

### Importing modules


In [9]:
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


##### Getting data from Hugging Face

In [10]:
# Download the news-sentiment dataset from the Hugging Face Hub,
# then materialise its 'train' split as a pandas DataFrame.
dataset = load_dataset("sara-nabhani/ML-news-sentiment")
train_split = dataset['train']
df = train_split.to_pandas()

sentiment_train.csv:   0%|          | 0.00/627k [00:00<?, ?B/s]

sentiment_test.csv:   0%|          | 0.00/70.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4551 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/506 [00:00<?, ? examples/s]

In [12]:
# Preview the training DataFrame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,text,labels,label_name
0,Finnish airline Finnair is starting the tempor...,0,negative
1,The corresponding increase in the share capita...,1,neutral
2,In the third quarter of fiscal 2008 Efore swun...,0,negative
3,"ALEXANDRIA , Va. , Oct. 15 -- Aaron Moss of Ha...",1,neutral
4,Vaisala Oyj Stock exchange release 26.03.2010 ...,1,neutral
...,...,...,...
4546,The company 's main shareholders Lihakunta and...,1,neutral
4547,According to Nordic financial group Nordea 's ...,2,positive
4548,A 2001 agreement expired earlier this month .,1,neutral
4549,"The customer is then forwarded to the site , a...",1,neutral


In [14]:
# Separate the model input (raw text) from the target (integer class id).
# Per the label_name column, the ids map to 0: negative, 1: neutral, 2: positive.
X, y = df['text'], df['labels']


In [15]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

### TF-IDF vectorization pipeline

In [16]:
# TF-IDF feature extractor used by the model pipelines:
# unigrams and bigrams, at most 5000 features, sublinear (1 + log tf) term-frequency scaling.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    max_features=5000,
    ngram_range=(1, 2),
)

In [17]:
# Model 1: Logistic Regression
# TF-IDF features feeding a class-weighted logistic regression.
lr_pipe = Pipeline([
    ('tfidf', tfidf),
    ('clf', LogisticRegression(
        # Re-weight classes inversely to their frequency (the classes are imbalanced).
        class_weight='balanced',
        solver='saga',
        # NOTE: multi_class='multinomial' is intentionally not passed. The parameter is
        # deprecated since scikit-learn 1.5, and a multinomial model is already what
        # this solver fits for a target with three classes.
        # saga shuffles the data, so seed it to make training reproducible.
        random_state=42,
        max_iter=1000))
])

In [18]:

# Model 2: Support Vector Machine
# TF-IDF features feeding a class-weighted linear-kernel SVM.
svm_pipe = Pipeline([
    # Pipeline does not copy its steps, so passing `tfidf` itself would make this
    # pipeline fit (and overwrite) the very same vectorizer object used elsewhere.
    # clone() yields an unfitted copy with identical hyper-parameters.
    ('tfidf', clone(tfidf)),
    ('clf', SVC(
        # Re-weight classes inversely to their frequency (the classes are imbalanced).
        class_weight='balanced',
        kernel='linear',
        # Enables predict_proba; the probability calibration uses randomized
        # internal cross-validation, hence the fixed seed below.
        probability=True,
        random_state=42))
])

In [19]:
# Train and save models, then report metrics and save a confusion-matrix plot for each.

# Class ids in the order used for the confusion-matrix rows/columns, with display names.
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

for name, model in [('Logistic Regression', lr_pipe),
                    ('SVM', svm_pipe)]:
    # File-name stem shared by the saved model and its plot,
    # e.g. 'Logistic Regression' -> 'logistic_regression'.
    slug = name.lower().replace(" ", "_")

    print(f"Training {name}...")
    model.fit(X_train, y_train)
    joblib.dump(model, f'{slug}_model.joblib')

    # Generate metrics on the held-out split
    y_pred = model.predict(X_test)
    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

    # Save confusion matrix plot.
    # labels= pins the row/column order so the matrix always lines up with the
    # tick labels, even if some class never occurs in y_test or y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f'{slug}_cm.png')
    # Close this figure explicitly so figures do not accumulate across iterations.
    plt.close(fig)

Training Logistic Regression...





Logistic Regression Performance:
Accuracy: 0.7618
              precision    recall  f1-score   support

           0       0.60      0.72      0.66       120
           1       0.82      0.83      0.83       536
           2       0.71      0.63      0.67       255

    accuracy                           0.76       911
   macro avg       0.71      0.73      0.72       911
weighted avg       0.76      0.76      0.76       911

Training SVM...

SVM Performance:
Accuracy: 0.7508
              precision    recall  f1-score   support

           0       0.55      0.67      0.60       120
           1       0.83      0.82      0.83       536
           2       0.70      0.64      0.67       255

    accuracy                           0.75       911
   macro avg       0.69      0.71      0.70       911
weighted avg       0.76      0.75      0.75       911

