In [4]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [5]:
from datasets import load_dataset


dataset = load_dataset("mteb/tweet_sentiment_extraction")


README.md:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/3.63M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/465k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27481 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [6]:

print(dataset)
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})
{'id': 'cb774db0d1', 'text': ' I`d have responded, if I were going', 'label': 1, 'label_text': 'neutral'}


In [8]:
train_labels = set(dataset['train']['label'])
train_label_texts = set(dataset['train']['label_text'])

print(f"Train set unique labels: {train_labels}")
print(f"Train set unique label texts: {train_label_texts}")


Train set unique labels: {0, 1, 2}
Train set unique label texts: {'negative', 'positive', 'neutral'}


In [9]:
pip install scikit-learn



In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from datasets import load_dataset
import numpy as np

dataset = load_dataset("mteb/tweet_sentiment_extraction")


train_texts = dataset['train']['text']
train_labels = dataset['train']['label']


vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)


X_train, X_val, y_train, y_val = train_test_split(X_train, train_labels, test_size=0.2, random_state=42)


models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": MultinomialNB()
}


results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    report = classification_report(y_val, y_pred, output_dict=True)
    results[model_name] = report
    print(f"{model_name} Report:")
    print(classification_report(y_val, y_pred))


best_model = None
best_f1 = 0

for model_name, report in results.items():
    f1_positive = report['2']['f1-score']
        best_f1 = f1_positive
        best_model = model_name

print(f"\nBest Model: {best_model} with F1-score: {best_f1}")


Training Logistic Regression...
Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.73      0.61      0.67      1562
           1       0.63      0.76      0.69      2230
           2       0.78      0.70      0.74      1705

    accuracy                           0.70      5497
   macro avg       0.72      0.69      0.70      5497
weighted avg       0.71      0.70      0.70      5497

Training SVM...
SVM Report:
              precision    recall  f1-score   support

           0       0.77      0.57      0.65      1562
           1       0.61      0.81      0.70      2230
           2       0.80      0.67      0.73      1705

    accuracy                           0.70      5497
   macro avg       0.73      0.68      0.69      5497
weighted avg       0.72      0.70      0.70      5497

Training Random Forest...
Random Forest Report:
              precision    recall  f1-score   support

           0       0.75      0.53      0.62    

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l1', 'l2']
}


grid_search = GridSearchCV(estimator=LogisticRegression(max_iter=1000),
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)


print("Training Logistic Regression with Grid Search...")
grid_search.fit(X_train, y_train)


best_logistic_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)


y_pred = best_logistic_model.predict(X_val)
report = classification_report(y_val, y_pred, output_dict=True)
print(f"Logistic Regression Report with Best Parameters:")
print(classification_report(y_val, y_pred))


f1_positive = report['2']['f1-score']
print(f"\nBest Logistic Model F1-score for Positive Class: {f1_positive}")

Training Logistic Regression with Grid Search...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 56, in _check_solver
    

Best parameters found:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression Report with Best Parameters:
              precision    recall  f1-score   support

           0       0.77      0.60      0.67      1562
           1       0.65      0.78      0.71      2230
           2       0.78      0.73      0.75      1705

    accuracy                           0.71      5497
   macro avg       0.73      0.70      0.71      5497
weighted avg       0.72      0.71      0.71      5497


Best Logistic Model F1-score for Positive Class: 0.7544656372994247
