In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:

#data inmport and column define
data = pd.DataFrame([("i love spending time with my friends and family", "positive"),
                     ("that was the best meal i've ever had in my life", "positive"),
                     ("i feel so grateful for everything i have in my life", "positive"),
                     ("i received a promotion at work and i couldn't be happier", "positive"),
                     ("watching a beautiful sunset always fills me with joy", "positive"),
                     ("my partner surprised me with a thoughtful gift and it made my day", "positive"),
                     ("i am so proud of my daughter for graduating with honors", "positive"),
                     ("listening to my favorite music always puts me in a good mood", "positive"),
                     ("i love the feeling of accomplishment after completing a challenging task", "positive"),
                     ("i am excited to go on vacation next week", "positive"),
                     ("i feel so overwhelmed with work and responsibilities", "negative"),
                     ("the traffic during my commute is always so frustrating", "negative"),
                     ("i received a parking ticket and it ruined my day", "negative"),
                     ("i got into an argument with my partner and we're not speaking", "negative"),
                     ("i have a headache and i feel terrible", "negative"),
                     ("i received a rejection letter for the job i really wanted", "negative"),
                     ("my car broke down and it's going to be expensive to fix", "negative"),
                     ("i'm feeling sad because i miss my friends who live far away", "negative"),
                     ("i'm frustrated because i can't seem to make progress on my project", "negative"),
                     ("i'm disappointed because my team lost the game", "negative")
                    ],
                    columns=['text', 'sentiment'])

In [3]:
data.head()

Unnamed: 0,text,sentiment
0,i love spending time with my friends and family,positive
1,that was the best meal i've ever had in my life,positive
2,i feel so grateful for everything i have in my...,positive
3,i received a promotion at work and i couldn't ...,positive
4,watching a beautiful sunset always fills me wi...,positive


In [4]:
# Shuffle the entire dataset randomly
# Reset the index after shuffling (drop=True removes the old index)
# frac = fraction of rows to return
# frac=1 → return 100% of the rows, but in a random order

# Shuffle all rows (frac=1) and reset index
data = data.sample(frac=1).reset_index(drop=True)

In [7]:
X=data['text']
Y=data['sentiment']

In [8]:
# text vectorization to bow - CountVectorizer

# Create a CountVectorizer object (this converts text into a bag-of-words matrix)
countvec = CountVectorizer()

# Learn the vocabulary from X and transform the text into a numeric matrix
# X is your text column / dataset
countvec_fit = countvec.fit_transform(X)

# Convert the sparse matrix into a DataFrame so it's easy to see
# toarray() → converts matrix to normal array
# get_feature_names_out() → returns the list of words (columns)
bag_of_words = pd.DataFrame(countvec_fit.toarray(), columns = countvec.get_feature_names_out())

In [9]:
bag_of_words

Unnamed: 0,accomplishment,after,always,am,an,and,argument,at,away,be,...,vacation,ve,wanted,was,watching,we,week,who,with,work
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Split the dataset into training and testing sets
# bag_of_words → input features
# y → target/output labels
# test_size=0.3 → 30% of data goes to testing, 70% goes to training
# random_state=7 → ensures the split is the same every time (for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(bag_of_words,y,test_size=0.3,random_state=7)


Logistic Regresion

In [11]:
lr = LogisticRegression(random_state=1).fit(X_train, y_train)

In [12]:
y_pred_lr = lr.predict(X_test)

In [13]:
accuracy_score(y_pred_lr, y_test)


0.6666666666666666

In [14]:
y_pred_lr

array(['negative', 'positive', 'positive', 'negative', 'negative',
       'positive'], dtype=object)

In [15]:
Y_test

NameError: name 'Y_test' is not defined

In [16]:
y_test


Unnamed: 0,sentiment
1,positive
17,positive
2,positive
5,negative
11,negative
0,negative


In [17]:
print(classification_report(y_test, y_pred_lr, zero_division=0))

              precision    recall  f1-score   support

    negative       0.67      0.67      0.67         3
    positive       0.67      0.67      0.67         3

    accuracy                           0.67         6
   macro avg       0.67      0.67      0.67         6
weighted avg       0.67      0.67      0.67         6



Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
nb = MultinomialNB().fit(X_train, y_train)

In [20]:
y_pred_nb = nb.predict(X_test)

In [21]:
accuracy_score(y_pred_nb, y_test)


0.5

In [22]:
print(classification_report(y_test, y_pred_nb, zero_division=0))


              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
    positive       0.50      0.67      0.57         3

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.50      0.50      0.49         6



Linear Support Vector Machine

In [23]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [24]:
svm = SGDClassifier().fit(X_train, y_train)
# possible hyper params, loss function, regularization

In [25]:
y_pred_svm = svm.predict(X_test)

In [26]:
accuracy_score(y_pred_svm, y_test)

0.8333333333333334

In [27]:
print(classification_report(y_test, y_pred_svm, zero_division=0))

              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         3
    positive       1.00      0.67      0.80         3

    accuracy                           0.83         6
   macro avg       0.88      0.83      0.83         6
weighted avg       0.88      0.83      0.83         6

