In [306]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,f1_score,recall_score,precision_score
import nltk, pickle, re
from textblob import Word
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

# loading data

In [307]:
df = pd.read_csv("sentimentdataset.csv")
df.head()

Unnamed: 0,ID,Text,Sentiment (Label),Timestamp,User,Source,Topic,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,Enjoying a beautiful day at the park! ...,Positive,1/15/2023 12:30,User123,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,1,Traffic was terrible this morning. ...,Negative,1/15/2023 8:45,CommuterX,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,2,Just finished an amazing workout! 💪 ...,Positive,1/15/2023 15:45,FitnessFan,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,3,Excited about the upcoming weekend getaway! ...,Positive,1/15/2023 18:20,AdventureX,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,4,Trying out a new recipe for dinner tonight. ...,Neutral,1/15/2023 19:55,ChefCook,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19


In [308]:
df.shape

(732, 14)

# methods

In [309]:
stop_words = set(stopwords.words('english'))
def preprocessText(str):
    words = word_tokenize(str)
    words = [re.sub(r'\W+', '', word) for word in words]
    words = [word.lower() for word in words]
    words = [Word(word).lemmatize() for word in words]
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)


In [310]:
def labelEncoding(data, cols, dict):
    for c in cols:
        lbl = LabelEncoder()
        lbl.fit(list(data[c].values)) 
        data[c] = lbl.transform(data[c])
        dict[c] = lbl
    return data, dict


In [311]:
def inverseEncoding(y, lbl):
    return lbl.inverse_transform(y)

In [312]:
def convert_sentiment(text):
    blob = TextBlob(text.strip())
    polarity = blob.sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# null checking - dropping duplicates

In [313]:
df.isna().sum()

ID                   0
Text                 0
Sentiment (Label)    0
Timestamp            0
User                 0
Source               0
Topic                0
Retweets             0
Likes                0
Country              0
Year                 0
Month                0
Day                  0
Hour                 0
dtype: int64

In [314]:
print("shape of data BEFORE dropping duplicates: ", df.shape)
df.drop_duplicates(subset="Text", inplace=True)
print("shape of data AFTER dropping duplicates: ", df.shape)

shape of data BEFORE dropping duplicates:  (732, 14)
shape of data AFTER dropping duplicates:  (707, 14)


# pre-processing

In [315]:
df.rename(columns = {'Sentiment (Label)':'Label'}, inplace = True)

df['Text'] = df['Text'].apply(preprocessText)  
df['Label'] = df['Label'].apply(convert_sentiment)  

df['Topic'] = df['Topic'].str.replace('#', '')
newColumns = df['Topic'].str.split(' ', expand=True)
df['Topic 1'] = newColumns[1]
df['Topic 2'] = newColumns[2]
df.drop(columns=['User', 'Timestamp','ID', 'Topic'], inplace=True)

df.head(10)

Unnamed: 0,Text,Label,Source,Retweets,Likes,Country,Year,Month,Day,Hour,Topic 1,Topic 2
0,enjoying beautiful day park,Positive,Twitter,15,30,USA,2023,1,15,12,Nature,Park
1,traffic wa terrible morning,Negative,Twitter,5,10,Canada,2023,1,15,8,Traffic,Morning
2,finished amazing workout,Positive,Instagram,20,40,USA,2023,1,15,15,Fitness,Workout
3,excited upcoming weekend getaway,Positive,Facebook,8,15,UK,2023,1,15,18,Travel,Adventure
4,trying new recipe dinner tonight,Neutral,Instagram,12,25,Australia,2023,1,15,19,Cooking,Food
5,feeling grateful little thing life,Positive,Twitter,25,50,India,2023,1,16,9,Gratitude,PositiveVibes
6,rainy day call cozy blanket hot cocoa,Positive,Facebook,10,20,Canada,2023,1,16,14,RainyDays,Cozy
7,new movie release mustwatch,Positive,Instagram,15,30,USA,2023,1,16,19,MovieNight,MustWatch
8,political discussion heating timeline,Negative,Twitter,30,60,USA,2023,1,17,8,Politics,Debate
9,missing summer vibe beach day,Neutral,Facebook,18,35,Australia,2023,1,17,12,Summer,BeachDays


# Encoding

In [316]:
cols = ('Label','Topic 1', 'Topic 2', 'Country', 'Source')
lbl = {}
df, lbl = labelEncoding(df, cols, lbl)
df.head(10)

Unnamed: 0,Text,Label,Source,Retweets,Likes,Country,Year,Month,Day,Hour,Topic 1,Topic 2
0,enjoying beautiful day park,2,3,15,30,106,2023,1,15,12,242,401
1,traffic wa terrible morning,0,3,5,10,20,2023,1,15,8,354,361
2,finished amazing workout,2,1,20,40,108,2023,1,15,15,151,628
3,excited upcoming weekend getaway,2,0,8,15,91,2023,1,15,18,356,7
4,trying new recipe dinner tonight,1,1,12,25,0,2023,1,15,19,87,168
5,feeling grateful little thing life,2,3,25,50,49,2023,1,16,9,172,417
6,rainy day call cozy blanket hot cocoa,2,0,10,20,20,2023,1,16,14,282,93
7,new movie release mustwatch,2,1,15,30,104,2023,1,16,19,236,369
8,political discussion heating timeline,0,3,30,60,106,2023,1,17,8,271,113
9,missing summer vibe beach day,1,0,18,35,0,2023,1,17,12,330,30


# Vectorization

### BOW

In [317]:
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['Text'])

### tf-idf

In [318]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3))
X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])

#target labels 
y = df['Label']

# splitting data

In [319]:
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Models

### SVM and logistic regression


In [320]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)

### evaluation

##### SVC

In [321]:
# Predictions for SVC
y_pred_svc_train = svc_model.predict(X_train)
y_pred_svc_test = svc_model.predict(X_test)

# Accuracy on train and test sets for SVC
print("SVC Train Accuracy:", accuracy_score(y_train, y_pred_svc_train))
print("SVC Test Accuracy:", accuracy_score(y_test, y_pred_svc_test))

print("precision : " , precision_score(y_test, y_pred_svc_test, average ='weighted'))
print("recall : ", recall_score(y_test, y_pred_svc_test, average ='weighted'))
print("f1 score : ", f1_score(y_test, y_pred_svc_test, average ='weighted'))

SVC Train Accuracy: 1.0
SVC Test Accuracy: 0.7676056338028169
precision :  0.7648599932930918
recall :  0.7676056338028169
f1 score :  0.7661864846993557


##### logistic

In [322]:
# Predictions for Logistic Regression
y_pred_logistic_train = logistic_model.predict(X_train)
y_pred_logistic_test = logistic_model.predict(X_test)

# Accuracy on train and test sets for Logistic Regression
print("Logistic Regression Train Accuracy:",  accuracy_score(y_train, y_pred_logistic_train))
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_logistic_test))

print("precision : " , precision_score(y_test, y_pred_logistic_test, average ='weighted'))
print("recall : ", recall_score(y_test, y_pred_logistic_test, average ='weighted'))
print("f1 score : ", f1_score(y_test, y_pred_logistic_test, average ='weighted'))

Logistic Regression Train Accuracy: 0.9929203539823008
Logistic Regression Test Accuracy: 0.7676056338028169
precision :  0.7389837825027086
recall :  0.7676056338028169
f1 score :  0.7321610792258505
