###  Load Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the sentiment dataset from the working directory. Rows that pandas
# cannot parse are skipped instead of aborting the whole read.
df = pd.read_csv(
    filepath_or_buffer='./Sentiment_Analysis_Dataset.csv',
    on_bad_lines='skip',
)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


### Preprocess the data

In [4]:
# Convert the sentiment text to lowercase; this overwrites the
# 'SentimentText' column of df in place.
# NOTE(review): TfidfVectorizer lowercases its input by default, so this step
# may be redundant — confirm before removing.
df['SentimentText'] = df['SentimentText'].str.lower()

### Split dataset

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Pull the model inputs (text) and targets (sentiment label) out of the frame
# as plain NumPy arrays. .to_numpy() is used instead of .values because it
# always returns an ndarray, whereas .values may return an ExtensionArray
# depending on the column dtype.
texts = df['SentimentText'].to_numpy()
labels = df['Sentiment'].to_numpy()

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

### Feature extraction

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF features capped at 10,000 terms. The vectorizer is fitted on the
# training texts only; the test texts are transformed with that fitted
# vectorizer so no test-set information leaks into the features.
vectorizer = TfidfVectorizer(max_features=10_000)
train_features = vectorizer.fit_transform(raw_documents=train_texts)
test_features = vectorizer.transform(raw_documents=test_texts)

### Train a model

In [10]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

* ##### SVM

In [11]:
# Linear SVM classifier. random_state is pinned to 42, the same seed already
# used for the train/test split and for LogisticRegression, so that solver
# configurations which shuffle the data internally produce the same model on
# every Restart & Run All.
SVC_model = LinearSVC(random_state=42)
SVC_model.fit(train_features, train_labels)  # train classifier



In [12]:
# Predict on the held-out features with the fitted SVM.
predictions = SVC_model.predict(test_features)

# Each metric compares the held-out labels with the SVM predictions; the
# svm_* names are reused later when the comparison table is built.
svm_p_score = precision_score(y_true=test_labels, y_pred=predictions)
svm_r_score = recall_score(y_true=test_labels, y_pred=predictions)
svm_f_score = f1_score(y_true=test_labels, y_pred=predictions)
svm_a_score = accuracy_score(y_true=test_labels, y_pred=predictions)

print("Precision:", svm_p_score)
print("Recall:", svm_r_score)
print("F1-Score:", svm_f_score)
print("Accuracy:", svm_a_score)

Precision: 0.7866096953674956
Recall: 0.8095132645378449
F1-Score: 0.7978971525499832
Accuracy: 0.7947061189713768


* ##### Naive-Bayes

In [13]:
# Multinomial Naive Bayes over the TF-IDF training features.
nb_model = MultinomialNB()

# Fit on the training split; fit() is left as the final expression so the
# cell still displays the fitted estimator.
nb_model.fit(X=train_features, y=train_labels)

In [14]:
# Predict on the held-out features with the fitted Naive Bayes model.
predictions = nb_model.predict(test_features)

# Each metric compares the held-out labels with the Naive Bayes predictions;
# the nb_* names are reused later when the comparison table is built.
nb_p_score = precision_score(y_true=test_labels, y_pred=predictions)
nb_r_score = recall_score(y_true=test_labels, y_pred=predictions)
nb_f_score = f1_score(y_true=test_labels, y_pred=predictions)
nb_a_score = accuracy_score(y_true=test_labels, y_pred=predictions)

print("Precision:", nb_p_score)
print("Recall:", nb_r_score)
print("F1-Score:", nb_f_score)
print("Accuracy:", nb_a_score)

Precision: 0.7761598785086684
Recall: 0.769608928650516
F1-Score: 0.772870522155719
Accuracy: 0.7735546665906506


* ##### Logistic Regression

In [15]:
# Logistic regression with a fixed seed and an iteration cap of 1000.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Fit on the training split; fit() is left as the final expression so the
# cell still displays the fitted estimator.
lr_model.fit(X=train_features, y=train_labels)

In [16]:
# Predict on the held-out features with the fitted logistic regression model.
predictions = lr_model.predict(test_features)

# Each metric compares the held-out labels with the logistic regression
# predictions; the lr_* names are reused later when the comparison table is built.
lr_p_score = precision_score(y_true=test_labels, y_pred=predictions)
lr_r_score = recall_score(y_true=test_labels, y_pred=predictions)
lr_f_score = f1_score(y_true=test_labels, y_pred=predictions)
lr_a_score = accuracy_score(y_true=test_labels, y_pred=predictions)

print("Precision:", lr_p_score)
print("Recall:", lr_r_score)
print("F1-Score:", lr_f_score)
print("Accuracy:", lr_a_score)

Precision: 0.7888736636792336
Recall: 0.8086338127083953
F1-Score: 0.7986315279708809
Accuracy: 0.7958621956588529


### Compare model scores

In [23]:
# Gather the scores column-wise: one list per metric, each ordered
# SVM, Naive-Bayes, Logistic Regression so it lines up with the 'Model' column.
model_metrics = {'Model': ['SVM', 'Naive-Bayes', 'Logistic Regression']}
model_metrics['Precision'] = [svm_p_score, nb_p_score, lr_p_score]
model_metrics['Recall'] = [svm_r_score, nb_r_score, lr_r_score]
model_metrics['F1-Score'] = [svm_f_score, nb_f_score, lr_f_score]
model_metrics['Accuracy'] = [svm_a_score, nb_a_score, lr_a_score]

# One row per model, one column per metric.
metrics_df = pd.DataFrame(data=model_metrics)

In [24]:
# Display the comparison table built from model_metrics (one row per model).
metrics_df

Unnamed: 0,Model,Precision,Recall,F1-Score,Accuracy
0,SVM,0.78661,0.809513,0.797897,0.794706
1,Naive-Bayes,0.77616,0.769609,0.772871,0.773555
2,Logistic Regression,0.788874,0.808634,0.798632,0.795862
