# Logistic regression
- predict categories (classes)
- linear regression predicts value

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
data = pd.DataFrame([("i love spending time with my friends and family", "positive"),
("that was the best meal i've ever had in my life", "positive"),
("i feel so grateful for everything i have in my life", "positive"),
("i received a promotion at work and i couldn't be happier", "positive"),
("watching a beautiful sunset always fills me with joy", "positive"),
("my partner surprised me with a thoughtful gift and it made my day", "positive"),
("i am so proud of my daughter for graduating with honors", "positive"),
("listening to my favorite music always puts me in a good mood", "positive"),
("i love the feeling of accomplishment after completing a challenging task", "positive"),
("i am excited to go on vacation next week", "positive"),
("i feel so overwhelmed with work and responsibilities", "negative"),
("the traffic during my commute is always so frustrating", "negative"),
("i received a parking ticket and it ruined my day", "negative"),
("i got into an argument with my partner and we're not speaking", "negative"),
("i have a headache and i feel terrible", "negative"),
("i received a rejection letter for the job i really wanted", "negative"),
("my car broke down and it's going to be expensive to fix", "negative"),
("i'm feeling sad because i miss my friends who live far away", "negative"),
("i'm frustrated because i can't seem to make progress on my project", "negative"),
("i'm disappointed because my team lost the game", "negative")], columns = ["text", "sentiment"])

In [37]:
data = data.sample(frac = 1).reset_index(drop=True) #100% of data in randomized order

In [12]:
x = data["text"]
y = data["sentiment"]

In [15]:
# each word = column, each sentence = row, entries = word count
countVec = CountVectorizer()
countVecFit = countVec.fit_transform(x)

In [19]:
bagOfWords = pd.DataFrame(countVecFit.toarray(), columns = countVec.get_feature_names_out())
bagOfWords

Unnamed: 0,accomplishment,after,always,am,an,and,argument,at,away,be,...,vacation,ve,wanted,was,watching,we,week,who,with,work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Split data into traning and testing sets
# features = input data x
# label = output data y
xTrain, xTest, yTrain, yTest = train_test_split(bagOfWords, y, test_size = 0.3, random_state = 7) # 70% training data, random state same train/test split each time the code runs

In [24]:
lr = LogisticRegression(random_state = 1).fit(xTrain, yTrain) # takes traning data and teaches the model to find patterns

In [26]:
yPredictLr = lr.predict(xTest)

In [28]:
accuracy_score(yPredictLr, yTest)

0.3333333333333333

- precision: out of all sentences model predicted positive or negative, what proportion are correct
- recall: out of all sentences, what proportion did the model correctly find
- f1Score: combines precision and recall, currently poor 

In [31]:
print(classification_report(yTest, yPredictLr, zero_division = 0)) #more detailed info about performance

              precision    recall  f1-score   support

    negative       0.50      0.25      0.33         4
    positive       0.25      0.50      0.33         2

    accuracy                           0.33         6
   macro avg       0.38      0.38      0.33         6
weighted avg       0.42      0.33      0.33         6



# Naive bayes
- classification alg. that works using probabilities
- each word independent of the others
- used for spam detection or sentiment analysis

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [34]:
nb = MultinomialNB().fit(xTrain, yTrain)

In [35]:
yPredictNb = nb.predict(xTest)

In [36]:
accuracy_score(yPredictNb, yTest)

0.3333333333333333

# Linear support vector machine (SVM)
- finds the best possible boundary that separates the classes

In [38]:
from sklearn.linear_model import SGDClassifier

In [39]:
svm = SGDClassifier().fit(xTrain, yTrain)

In [40]:
yPredictSvm = svm.predict(xTest)

In [41]:
accuracy_score(yPredictSvm, yTest)

0.3333333333333333