### Test

In [1]:
import aspect_based_sentiment_analysis as absa

nlp = absa.load()
text = ("We are great fans of Slack, but we wish the subscriptions "
        "were more accessible to small startups.")

slack, price = nlp(text, aspects=['slack', 'price'])
assert price.sentiment == absa.Sentiment.negative
assert slack.sentiment == absa.Sentiment.positive


  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 1.08k/1.08k [00:00<?, ?B/s]
Downloading: 100%|██████████| 438M/438M [00:19<00:00, 23.0MB/s] 
Some layers from the model checkpoint at absa/classifier-rest-0.2 were not used when initializing BertABSClassifier: ['dropout_379']
- This IS expected if you are initializing BertABSClassifier from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertABSClassifier from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of BertABSClassifier were not initialized from the model checkpoint at absa/classifier-rest-0.2 and are newly initialized: ['dropout_37']
You should probably TRAIN this model on a down-stream task to be able to us

In [3]:
print(f"Aspect: Price - Sentiment: {price.sentiment}")
print(f"Aspect: Slack - Sentiment: {slack.sentiment}")

Aspect: Price - Sentiment: 1
Aspect: Slack - Sentiment: 2


## Trial on twitter_sentiment_data.csv

In [5]:
pip install pandas

Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-win_amd64.whl (10.0 MB)
     --------------------------------------- 10.0/10.0 MB 29.0 MB/s eta 0:00:00
Installing collected packages: pandas
Successfully installed pandas-1.3.5
Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install textblob

Collecting textblobNote: you may need to restart the kernel to use updated packages.

  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     -------------------------------------- 636.8/636.8 kB 7.9 MB/s eta 0:00:00
Collecting nltk>=3.1
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 19.2 MB/s eta 0:00:00
Installing collected packages: nltk, textblob
Successfully installed nltk-3.8.1 textblob-0.17.1


### Read and pre-process data to remove stopwords and less informative tokens found by observation

In [None]:
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Read CSV file
df = pd.read_csv('twitter_sentiment_data.csv',nrows=1000)

# Custom list of words to remove
custom_stopwords = ["climate", "change", "rt", "amp"]


# Preprocess tweets
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and remove stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and token.lemma_ not in custom_stopwords]
    
    return " ".join(tokens)

df['processed_text'] = df['message'].apply(preprocess)



### Trial 1.1 Extract aspects with LDA

In [15]:
# Extract aspects using LDA
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(df['processed_text'])
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(doc_term_matrix)

# Get top words for each topic
def get_top_words(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(top_words)
    return topics

feature_names = vectorizer.get_feature_names()
topics = get_top_words(lda, feature_names, 10)
print (topics)

[['global', 'warming', 'trump', 'china', 'plan', 'exit', 'pact', 'rare', 'criticize', 'nt'], ['warming', 'global', 'documentary', 'leonardo', 'watch', 'dicaprio', 'cover', 'dicaprios', 'free', 'issue'], ['nt', 'believe', 'need', 'real', 'november', 'people', 'global', 'winter', 'shift', 'warming'], ['right', 'world', 'htt', 'watch', 'tackle', 'travel', 'mention', 'fight', 'cable', 'protect'], ['global', 'warming', 'hoax', 'help', 'trump', 'clinton', 'chinese', 'like', 'combat', 'email']]


### use the sentiment score provided in the database

- 2(News): the tweet links to factual news about climate change
- 1(Pro): the tweet supports the belief of man-made climate change
- 0(Neutral: the tweet neither supports nor refutes the belief of man-made climate change
- -1(Anti): the tweet does not believe in man-made climate change

In [16]:

# Perform sentiment analysis
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

df['sentiment_score'] = df['message'].apply(get_sentiment)

# Assign topics to tweets
doc_topics = lda.transform(doc_term_matrix)
df['topic'] = doc_topics.argmax(axis=1)

# Perform ABSA
results = []
for topic_idx, topic_words in enumerate(topics):
    topic_tweets = df[df['topic'] == topic_idx]
    avg_sentiment = topic_tweets['sentiment_score'].mean()
    results.append({
        'aspect': ', '.join(topic_words[:5]),
        'sentiment': avg_sentiment,
        'tweet_count': len(topic_tweets)
    })

# Display results
for result in results:
    print(f"Aspect: {result['aspect']}")
    print(f"Average Sentiment: {result['sentiment']:.2f}")
    print(f"Tweet Count: {result['tweet_count']}")
    print()


Aspect: global, warming, trump, china, plan
Average Sentiment: 0.08
Tweet Count: 222

Aspect: warming, global, documentary, leonardo, watch
Average Sentiment: 0.08
Tweet Count: 227

Aspect: nt, believe, need, real, november
Average Sentiment: 0.04
Tweet Count: 183

Aspect: right, world, htt, watch, tackle
Average Sentiment: 0.20
Tweet Count: 187

Aspect: global, warming, hoax, help, trump
Average Sentiment: 0.05
Tweet Count: 181



### Trial 1.2 Manually define aspects and use VADER sentiment library 

In [18]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
     -------------------------------------- 126.0/126.0 kB 7.2 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load your data
df = pd.read_csv('twitter_sentiment_data.csv', nrows=1000)  # Using top 1000 rows as an example

# Define aspects and associated keywords (as in the previous example)
aspects = {
    "Politics": ["policy", "government", "legislation", "trump", "election"],
    "Science": ["research", "study", "scientist", "evidence", "data"],
    "Economy": ["business", "industry", "economy", "cost", "investment"],
    "Environment": ["ecosystem", "biodiversity", "conservation", "pollution"],
    "Energy": ["renewable", "fossil fuels", "solar", "wind", "carbon"],
    "Activism": ["protest", "campaign", "awareness", "action", "movement"],
    "Technology": ["innovation", "solution", "technology", "development"],
    "Health": ["public health", "disease", "impact", "risk", "adaptation"],
    "Education": ["awareness", "curriculum", "education", "inform", "learn"]
}

def assign_aspect(tweet):
    tweet_lower = tweet.lower()
    for aspect, keywords in aspects.items():
        if any(keyword in tweet_lower for keyword in keywords):
            return aspect
    return "Other"

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Function to get sentiment scores
def get_sentiment_scores(text):
    return sid.polarity_scores(text)

# Apply aspect assignment and sentiment analysis
df['aspect'] = df['message'].apply(assign_aspect)
df['sentiment_scores'] = df['message'].apply(get_sentiment_scores)

# Extract compound sentiment score
df['sentiment'] = df['sentiment_scores'].apply(lambda x: x['compound'])

# Calculate average sentiment for each aspect
aspect_results = df.groupby('aspect').agg({
    'sentiment': 'mean',
    'message': 'count'
}).reset_index()

# Display results
for _, row in aspect_results.iterrows():
    print(f"Aspect: {row['aspect']}")
    print(f"Average Sentiment: {row['sentiment']:.2f}")
    print(f"Tweet Count: {row['message']}")
    print()


Aspect: Activism
Average Sentiment: 0.07
Tweet Count: 28

Aspect: Economy
Average Sentiment: -0.13
Tweet Count: 19

Aspect: Education
Average Sentiment: 0.34
Tweet Count: 3

Aspect: Energy
Average Sentiment: 0.26
Tweet Count: 8

Aspect: Environment
Average Sentiment: -0.02
Tweet Count: 5

Aspect: Health
Average Sentiment: 0.02
Tweet Count: 18

Aspect: Other
Average Sentiment: 0.04
Tweet Count: 738

Aspect: Politics
Average Sentiment: -0.12
Tweet Count: 131

Aspect: Science
Average Sentiment: -0.02
Tweet Count: 47

Aspect: Technology
Average Sentiment: 0.31
Tweet Count: 3

