In [1]:
from datasets import load_dataset

# https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset/tree/main
train = load_dataset("csv", data_files="train_df.csv", split="train")
# train = load_dataset("csv", data_files="train_df.csv")
test = load_dataset("csv", data_files="test_df.csv", split="train")
val = load_dataset("csv", data_files="val_df.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
train

Dataset({
    features: ['id', 'text', 'label', 'sentiment'],
    num_rows: 31232
})

In [3]:
test

Dataset({
    features: ['id', 'text', 'label', 'sentiment'],
    num_rows: 5206
})

In [4]:
val

Dataset({
    features: ['id', 'text', 'label', 'sentiment'],
    num_rows: 5205
})

In [29]:
[(t, l) for t, l in zip(train['text'][:5], train['label'][:5])]

[('Cooking microwave pizzas, yummy', 2),
 ('Any plans of allowing sub tasks to show up in the widget?', 1),
 (" I love the humor, I just reworded it. Like saying 'group therapy' instead`a 'gang banging'. Keeps my moms off my back.   Hahaha",
  2),
 (' naw idk what ur talkin about', 1),
 (' That sucks to hear. I hate days like that', 0)]

## 0: Negative

## 1: Neutral

## 2: Positive

In [27]:
from collections import Counter

print(f"{Counter(train['label'])=}")
Counter(test['label'])

Counter(train['label'])=Counter({1: 11649, 2: 10478, 0: 9105})


Counter({1: 1930, 2: 1730, 0: 1546})

In [10]:
import string
import nltk
from nltk.corpus import words

nltk.download('words')
eng_words = set(words.words())

print(string.digits + string.punctuation)

def text_preprocessing(text):
    if not text:
        return ""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation + string.digits))
    tokens = [word for word in text.split() if word.lower() in eng_words]
    return tokens

0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package words to /home/nand-ml/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
vocab = set()

for sentence in train['text']:
    tokens = text_preprocessing(sentence)
    vocab.update(tokens)


vocab = sorted(list(vocab))
print(len(vocab))
vocab[:10]

9725


['a',
 'aa',
 'abalone',
 'abandon',
 'abandoned',
 'abandonment',
 'abbey',
 'abbreviation',
 'abiding',
 'ability']

In [12]:
def create_bow_vec(sentence, vocab):
    vec = [0] * len(vocab)
    for word in text_preprocessing(sentence):
        if word in vocab:
            idx = vocab.index(word)
            vec[idx] += 1
    
    return vec

a = create_bow_vec("who are you ?", vocab)
print(len(a))
print([id for id, ele in enumerate(a) if ele == 1])

9725
[424, 9489, 9693]


In [13]:
bow_vectors = [create_bow_vec(sentence, vocab) for sentence in train['text']]

In [17]:
bow_vec_test = [create_bow_vec(sentence, vocab) for sentence in test['text']]

In [31]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=16, max_iter=50_000)
logreg.fit(bow_vectors[:1000], train['label'][:1000])

In [35]:
from sklearn import metrics

y_pred = logreg.predict(bow_vec_test)

conf_matrix = metrics.confusion_matrix(y_pred, test['label'])
print(f"{conf_matrix=}")

f1 = metrics.f1_score(y_pred=y_pred, y_true=test['label'], average='macro')
print(f"{f1=}")

conf_matrix=array([[ 801,  562,  198],
       [ 562, 1004,  568],
       [ 183,  364,  964]])
f1=0.5348608417747402


In [39]:
# 0: Negative
# 1: Neutral
# 2: Positive


test['text'][:10], test['label'][:10], y_pred[:10]

(['getting cds ready for tour',
  ' MC, happy mother`s day to your mom ;).. love yah',
  'A year from now is graduation....i am pretty sure i`m not ready for it!?!?!?',
  ' because you had chips and sale w/o me',
  'Great for organising my work life balance',
  ' its my going away partyyy  `s.  you should come!',
  'Is Watching Britains Got Talent, & Is biting her nails. Please don`t come off black nail varnish',
  ' remember the guy who 1st #tweetbud you! ~> _2890  help him get 900 flwrs & make him smile!',
  ' She! Maybe that was our first mistake.  Not everyone is as cool as   (brown nose moment)',
  'today i was so happy got i got off school early, but now i`m bored!!'],
 [1, 2, 0, 1, 2, 1, 1, 2, 0, 1],
 array([1, 2, 1, 1, 2, 1, 1, 0, 0, 1]))

In [42]:
for i in range(50):
    if (test['label'][i], y_pred[i]) in [(0, 2), (2, 0)]:
        sentiment = "+ve" if test['label'][i] == 2 else "-ve"
        predicted = "+ve" if y_pred[i] == 2 else "-ve"
        print(f"Sentiment: {sentiment} | Predicted: {predicted} | Text: {test['text'][i]}")

Sentiment: +ve | Predicted: -ve | Text:  remember the guy who 1st #tweetbud you! ~> _2890  help him get 900 flwrs & make him smile!
Sentiment: +ve | Predicted: -ve | Text: I loved this app. Worked so great I purchased it. Now it has logged me out and says I don't exist.
Sentiment: -ve | Predicted: +ve | Text: Widget is mostly useless - it constantly displays no items while having a list selected that is well populated. Alternately, the widget becomes logged out somehow. Hope this gets fixed.
Sentiment: +ve | Predicted: -ve | Text:  Oh I believe you...   sooo  thinks she belongs elsewhere, I say I dont think so...
Sentiment: -ve | Predicted: +ve | Text:  I was never a good stereotypical teen
Sentiment: +ve | Predicted: -ve | Text: Forest is cute, unobtrusive, and has genuinely helped me in the transition from checking my phone to doing work. A neat option is the ability to allow certain apps (eg, Spotify to change music)
Sentiment: -ve | Predicted: +ve | Text:  yay asap lol awwh I miss 