<a href="https://colab.research.google.com/github/MohanZhu0623/Sentiment_Analysis/blob/main/SentimentLexicon(AFINN%26VADER)_Threeclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install afinn
!pip install vaderSentiment
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [10]:
# Fetch the NLTK resources that the preprocessing step relies on.
nltk.download('stopwords')  # word list returned by stopwords.words('english')
nltk.download('punkt')      # tokenizer models looked up by word_tokenize on older NLTK releases
# NLTK 3.8.2 and later make word_tokenize look up 'punkt_tab' instead of 'punkt';
# without it the preprocessing cell raises LookupError on a fresh runtime.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Read the labelled examples from the Excel workbook into a DataFrame
dataset_path = '/content/python_labelled_data.xlsx'
data = pd.read_excel(dataset_path)

In [11]:
# Preprocess data
def pre_process_data(dataset):
    """Return a copy of ``dataset`` whose ``text`` column has been cleaned.

    The ``text`` column goes through these steps, in order: lowercasing,
    removal of digit runs, removal of ASCII punctuation, stripping of
    leading/trailing whitespace, and finally tokenization with
    ``word_tokenize`` followed by removal of English stop words. The word
    ``'not'`` is deliberately kept. Surviving tokens are re-joined with
    single spaces.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``text`` column. The frame passed in is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` with the cleaned ``text`` column; all other
        columns are carried over unchanged.
    """
    import re  # local import: only needed to escape the punctuation set below

    # Work on a copy so the caller's frame is not mutated as a side effect.
    dataset = dataset.copy()

    # Escape the punctuation before placing it in a character class. Unescaped,
    # the "\]" inside string.punctuation is read as an escaped ']' and
    # backslashes are never removed.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    text = dataset['text'].str.lower()                            # convert to lowercase
    text = text.str.replace(r'\d+', '', regex=True)               # remove numbers
    text = text.str.replace(punctuation_pattern, '', regex=True)  # remove punctuation
    text = text.str.strip()                                       # remove surrounding whitespace

    # Remove stopwords except 'not'; discard() does not raise if the word
    # is absent from the stop-word list.
    stop_words = set(stopwords.words('english'))
    stop_words.discard('not')
    dataset['text'] = text.apply(
        lambda x: ' '.join(word for word in word_tokenize(x) if word not in stop_words)
    )
    return dataset

# Run the cleaning step on the loaded frame, then print its first rows
data = pre_process_data(data)
preview_rows = data.head()
print(preview_rows)

                                                text  class
0  project designed help protect environment usin...      1
1  help us built sustainable studio eliminate cla...      1
2  paint something dont want explain isbob ross b...      0
3  free app allow pool reservations others get gr...      1
4  prohibition themed gastro pub dark silent head...      1


In [17]:
# Hold out 20% of the labeled rows as a test set and keep the other 80% for
# training; the split is stratified on the 'class' column with a fixed seed.
partition = 0.8
holdout_fraction = 1-partition
class_labels = data['class']
train_data, test_data = train_test_split(
    data,
    test_size=holdout_fraction,
    random_state=128,
    stratify=class_labels,
)
print(f"Training set size: {train_data.shape}")
print(f"Test set size: {test_data.shape}")

Training set size: (1815, 2)
Test set size: (454, 2)


In [14]:
# AFINN-based classifier: one shared scorer plus a sign-based labelling function
afinn = Afinn()

def classify_afinn(text):
    """Label ``text`` by the sign of its AFINN score.

    Returns 1 when ``afinn.score(text)`` is above zero, -1 when it is below
    zero, and 0 otherwise.
    """
    polarity = afinn.score(text)
    if polarity > 0:
        return 1
    return -1 if polarity < 0 else 0

In [15]:
# Function to classify text using VADER
analyzer = SentimentIntensityAnalyzer()

def classify_vader(text, threshold=0.05):
    """Label ``text`` as 1, -1 or 0 from VADER's compound score.

    Parameters
    ----------
    text : str
        Text passed to ``analyzer.polarity_scores``.
    threshold : float, default 0.05
        Positive cut-off applied symmetrically to the compound score.

    Returns
    -------
    int
        1 if the compound score is ``>= threshold``, -1 if it is
        ``<= -threshold``, and 0 for anything strictly in between.
    """
    compound = analyzer.polarity_scores(text)['compound']
    # Boundaries are inclusive, matching the cut-offs listed in the VADER
    # README (>= 0.05 positive, <= -0.05 negative); strict comparisons would
    # send scores of exactly +/-0.05 to the neutral class.
    if compound >= threshold:
        return 1
    if compound <= -threshold:
        return -1
    return 0

In [18]:
# Label every held-out text with both lexicon classifiers and store each set
# of predictions in its own column of the test frame.
held_out_text = test_data['text']
test_data['afinn_predicted'] = held_out_text.apply(classify_afinn)
test_data['vader_predicted'] = held_out_text.apply(classify_vader)

In [19]:
def _weighted_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for ``y_pred`` against ``y_true``.

    Precision, recall and F1 are computed with ``average='weighted'``.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )


def _print_scores(header, scores):
    """Print ``header`` followed by one line per metric in ``scores``.

    ``scores`` is an (accuracy, precision, recall, F1) tuple.
    """
    accuracy, precision, recall, f1 = scores
    print(header)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")


true_labels = test_data['class']

# Calculate metrics for AFINN
afinn_accuracy, afinn_precision, afinn_recall, afinn_f1 = _weighted_scores(
    true_labels, test_data['afinn_predicted']
)

# Calculate metrics for VADER
vader_accuracy, vader_precision, vader_recall, vader_f1 = _weighted_scores(
    true_labels, test_data['vader_predicted']
)

# Print the results
_print_scores("AFINN Metrics:", (afinn_accuracy, afinn_precision, afinn_recall, afinn_f1))
_print_scores("\nVADER Metrics:", (vader_accuracy, vader_precision, vader_recall, vader_f1))

AFINN Metrics:
Accuracy: 0.6013215859030837
Precision: 0.6093346123584772
Recall: 0.6013215859030837
F1 Score: 0.5909687502472332

VADER Metrics:
Accuracy: 0.6167400881057269
Precision: 0.6449829758707675
Recall: 0.6167400881057269
F1 Score: 0.5923037072296562
