In [981]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,f1_score,recall_score,precision_score
import nltk, pickle, re
from textblob import Word
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /home/mostafa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mostafa/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# loading data

In [982]:
df = pd.read_csv("sentimentdataset.csv")
df.head()

Unnamed: 0,ID,Text,Sentiment (Label),Timestamp,User,Source,Topic,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,Enjoying a beautiful day at the park! ...,Positive,1/15/2023 12:30,User123,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,1,Traffic was terrible this morning. ...,Negative,1/15/2023 8:45,CommuterX,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,2,Just finished an amazing workout! 💪 ...,Positive,1/15/2023 15:45,FitnessFan,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,3,Excited about the upcoming weekend getaway! ...,Positive,1/15/2023 18:20,AdventureX,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,4,Trying out a new recipe for dinner tonight. ...,Neutral,1/15/2023 19:55,ChefCook,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19


In [983]:
df.shape

(732, 14)

# methods

In [984]:
stop_words = set(stopwords.words('english'))
def preprocessText(str):
    words = word_tokenize(str)
    words = [re.sub(r'\W+', '', word) for word in words]
    words = [word.lower() for word in words]
    words = [Word(word).lemmatize() for word in words]
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)


In [985]:
def labelEncoding(data, cols, dict):
    for c in cols:
        lbl = LabelEncoder()
        lbl.fit(list(data[c].values)) 
        data[c] = lbl.transform(data[c])
        dict[c] = lbl
    return data, dict


In [986]:
def inverseEncoding(y, dict):
    reversed = {v: k for k, v in dict.items()}
    return reversed[y]

In [987]:
def convert_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    polarity = analyzer.polarity_scores(text)['compound']
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [988]:
def remove_emojis(text):
    # Define a regular expression pattern to match emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    # Remove emojis from the text using the pattern
    return emoji_pattern.sub(r'', text)

In [989]:
def remove_special_characters(text):
    # Define the pattern to match special characters
    pattern = r'[$#]+|\.{3}'
    
    # Replace the matched pattern with an empty string
    cleaned_text = re.sub(pattern, '', text)
    
    return cleaned_text

# null checking - dropping duplicates

In [990]:
df.isna().sum()

ID                   0
Text                 0
Sentiment (Label)    0
Timestamp            0
User                 0
Source               0
Topic                0
Retweets             0
Likes                0
Country              0
Year                 0
Month                0
Day                  0
Hour                 0
dtype: int64

In [991]:
print("shape of data BEFORE dropping duplicates: ", df.shape)
df.drop_duplicates(subset="Text", inplace=True)
print("shape of data AFTER dropping duplicates: ", df.shape)

shape of data BEFORE dropping duplicates:  (732, 14)
shape of data AFTER dropping duplicates:  (707, 14)


# pre-processing

In [992]:
df.rename(columns = {'Sentiment (Label)':'Label'}, inplace = True)

df['Text'] = df['Text'].apply(lambda x: preprocessText(x))  
df['Text'] = df['Text'].apply(remove_emojis)
df['Text'] = df['Text'].apply(remove_special_characters) #to remove hashtags
df['Label'] = df['Label'].apply(lambda x: convert_sentiment(x))  

df['Topic'] = df['Topic'].str.replace('#', '')
newColumns = df['Topic'].str.split(' ', expand=True)
df['Topic 1'] = newColumns[1]
df['Topic 2'] = newColumns[2]
df.drop(columns=['User', 'Timestamp','ID', 'Topic'], inplace=True)

df.head(10)
df.to_csv("afterEncoding.csv", sep=',', index=False, encoding='utf-8')

# Encoding

In [993]:
cols = ('Label','Topic 1', 'Topic 2', 'Country', 'Source')
lbl = {}
df, lbl = labelEncoding(df, cols, lbl)
df.head(10)

Unnamed: 0,Text,Label,Source,Retweets,Likes,Country,Year,Month,Day,Hour,Topic 1,Topic 2
0,enjoying beautiful day park,2,3,15,30,106,2023,1,15,12,242,401
1,traffic wa terrible morning,0,3,5,10,20,2023,1,15,8,354,361
2,finished amazing workout,2,1,20,40,108,2023,1,15,15,151,628
3,excited upcoming weekend getaway,2,0,8,15,91,2023,1,15,18,356,7
4,trying new recipe dinner tonight,1,1,12,25,0,2023,1,15,19,87,168
5,feeling grateful little thing life,2,3,25,50,49,2023,1,16,9,172,417
6,rainy day call cozy blanket hot cocoa,2,0,10,20,20,2023,1,16,14,282,93
7,new movie release mustwatch,2,1,15,30,104,2023,1,16,19,236,369
8,political discussion heating timeline,0,3,30,60,106,2023,1,17,8,271,113
9,missing summer vibe beach day,1,0,18,35,0,2023,1,17,12,330,30


# Reducing Data Biases

### Train-Test Split

In [994]:
df = df.sample(frac=1).reset_index(drop=True) #shuffle
# separating the data based on value of Label
type1 = df[df['Label'] == 0]
type2 = df[df['Label'] == 1] 
type3 = df[df['Label'] == 2]

type1Train = type1.iloc[0:int(len(type1)*0.8)]
type2Train = type2.iloc[0:int(len(type2)*0.8)]
type3Train = type3.iloc[0:int(len(type3)*0.8)]

type1Test = type1.iloc[int(len(type1)*0.8):int(len(type1))+1]
type2Test = type2.iloc[int(len(type2)*0.8):int(len(type2))+1]
type3Test = type3.iloc[int(len(type3)*0.8):int(len(type3))+1]

# print(type1Train.shape)
# print(type2Train.shape)
# print(type3Train.shape)
dfTrain = pd.concat([type1Train, type2Train, type3Train], ignore_index=True)
dfTest = pd.concat([type1Test, type2Test, type3Test])
# print(dfTrain.shape)
# print(dfTest.shape)
# print(df.shape)



### Applying SMOTE 

In [995]:
# respresenting as a TF-IDF vector
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(dfTrain['Text'])
y_train = dfTrain['Label']
x_test = vectorizer.transform(dfTest['Text'])
y_test = dfTest['Label']

# using SMOTE to reduce data imbalances
overSample = SMOTE(sampling_strategy='not majority')
x_train_over, y_train_over = overSample.fit_resample(x_train, y_train)
tfidf_vectorizer = vectorizer 
print(dfTrain['Label'].value_counts())
print(y_train_over.value_counts())

Label
2    286
0    144
1    134
Name: count, dtype: int64
Label
0    286
1    286
2    286
Name: count, dtype: int64


# Models

### SVM and logistic regression


In [996]:
svc_model = SVC(kernel='linear')
svc_model.fit(x_train_over, y_train_over)

logistic_model = LogisticRegression(max_iter=1000,random_state=42)
logistic_model.fit(x_train_over, y_train_over)

### evaluation

##### SVC

In [997]:
# Predictions for SVC
y_pred_svc_train = svc_model.predict(x_train)
y_pred_svc_test = svc_model.predict(x_test)

# Accuracy on train and test sets for SVC
print("SVC Train Accuracy:", accuracy_score(y_train, y_pred_svc_train))
print("SVC Test Accuracy:", accuracy_score(y_test, y_pred_svc_test))
print("precision : " , precision_score(y_test, y_pred_svc_test, average ='weighted'))
print("recall : ", recall_score(y_test, y_pred_svc_test, average ='weighted'))
print("f1 score : ", f1_score(y_test, y_pred_svc_test, average ='weighted'))

SVC Train Accuracy: 0.9911347517730497
SVC Test Accuracy: 0.7342657342657343
precision :  0.7391708741034079
recall :  0.7342657342657343
f1 score :  0.7232176439914133


##### logistic

In [998]:
# Predictions for Logistic Regression
y_pred_logistic_train = logistic_model.predict(x_train)
y_pred_logistic_test = logistic_model.predict(x_test)

# Accuracy on train and test sets for Logistic Regression
print("Logistic Regression Train Accuracy:",  accuracy_score(y_train, y_pred_logistic_train))
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_logistic_test))

print("precision : " , precision_score(y_test, y_pred_logistic_test, average ='weighted'))
print("recall : ", recall_score(y_test, y_pred_logistic_test, average ='weighted'))
print("f1 score : ", f1_score(y_test, y_pred_logistic_test, average ='weighted', zero_division=0))

Logistic Regression Train Accuracy: 0.9858156028368794
Logistic Regression Test Accuracy: 0.7202797202797203
precision :  0.717295291917385
recall :  0.7202797202797203
f1 score :  0.7115986103701


In [999]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=400)
rf = clf.fit(x_train_over, y_train_over)

In [1000]:

# Predictions for Logistic Regression
y_pred_rf_train = rf.predict(x_train)
y_pred_rf_test = rf.predict(x_test)

# Accuracy on train and test sets for Logistic Regression
print("rf train Accuracy:",  accuracy_score(y_train, y_pred_rf_train))
print("rf Test Accuracy:", accuracy_score(y_test, y_pred_rf_test))

print("precision : " , precision_score(y_test, y_pred_rf_test, average ='weighted'))
print("recall : ", recall_score(y_test, y_pred_rf_test, average ='weighted'))
print("f1 score : ", f1_score(y_test, y_pred_rf_test, average ='weighted', zero_division=0))

rf train Accuracy: 1.0
rf Test Accuracy: 0.6853146853146853
precision :  0.7633158508158508
recall :  0.6853146853146853
f1 score :  0.6557120367034818


In [1001]:
pickle.dump(svc_model, open('trained_model.sav', 'wb'))
pickle.dump(tfidf_vectorizer, open('vectorized_model.sav', 'wb'))
pickle.dump(lbl, open('label_encoder.sav', 'wb'))