# PROJECT A: Lexicon-based Sentiment Analysis (domain lexicon vs. public lexicon)


In [4]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [10]:
# Ensure the NLTK resources needed later in the notebook are available locally.
# nltk.download() skips the download when the package is already up to date.
import nltk
nltk.download('punkt_tab')  # tokenizer data used by word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load the labelled tweets; 'tweets.csv' is resolved relative to the
# notebook's current working directory.
tweets_df = pd.read_csv('tweets.csv')
# Bare expression: display the frame as this cell's output.
tweets_df

Unnamed: 0,Tweet ID,Text,User,Created At,Likes,Retweets,Sentiment
0,449211727471646420,Feeling grateful for my friends and family.,werickson,2023-01-13 00:35:08,156,489,positive
1,519036665081652813,Going for a walk in the park.,jennybutler,2023-02-16 06:24:30,223,788,neutral
2,776023316169815671,I hate it when things don't go my way.,william88,2023-01-24 18:12:37,332,860,negative
3,674750468135750054,I hate it when things don't go my way.,lawrencebauer,2023-02-09 07:14:24,388,881,negative
4,859726107390311299,This is the best day ever!,gerald07,2023-02-28 06:55:54,255,567,positive
...,...,...,...,...,...,...,...
995,250464848751217010,I hate it when things don't go my way.,nhayes,2023-01-28 05:03:18,986,932,negative
996,600819966000157055,I hate it when things don't go my way.,marknixon,2023-04-21 13:27:44,458,61,negative
997,966366146192109165,I'm so upset right now.,hollyflores,2023-03-08 11:29:25,317,179,negative
998,936627265507507170,Just had lunch with a friend.,odickerson,2023-04-09 18:32:54,584,706,neutral


In [7]:
# Select the raw tweet text column (nothing is read from disk here).
tweets_text = tweets_df['Text']

In [8]:
# Define preprocessing function
def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Pipeline: lowercase -> strip URLs -> drop every character that is not an
    ASCII letter or whitespace -> NLTK word tokenization -> English stop-word
    removal -> WordNet lemmatization.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the float NaN pandas
        produces for an empty CSV cell, or None) is treated as an empty tweet.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' when nothing is left
        or when `text` is not a string.
    """
    # Guard against missing values so Series.apply does not fail on NaN/None.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove punctuation and numbers. Apostrophes are removed as well, so a
    # contraction such as "don't" reaches the tokenizer as "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _preprocessing_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)


# Cache for the objects that are identical on every preprocess_text call.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return (stop-word set, lemmatizer), building them only on first use."""
    if not _PREPROCESSING_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Populate the cache only after both objects were built successfully.
        _PREPROCESSING_CACHE['stop_words'] = stop_words
        _PREPROCESSING_CACHE['lemmatizer'] = lemmatizer
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['lemmatizer']

In [11]:
# Apply preprocessing to all tweets and store the result as a new column
# on tweets_df.
tweets_df['Processed Text'] = tweets_text.apply(preprocess_text)

# Preview the first rows to inspect the results (nothing is written to disk)
tweets_df[['Text', 'Processed Text', 'Sentiment']].head()

Unnamed: 0,Text,Processed Text,Sentiment
0,Feeling grateful for my friends and family.,feeling grateful friend family,positive
1,Going for a walk in the park.,going walk park,neutral
2,I hate it when things don't go my way.,hate thing dont go way,negative
3,I hate it when things don't go my way.,hate thing dont go way,negative
4,This is the best day ever!,best day ever,positive


In [12]:
# Create a custom lexicon from the processed text
def create_custom_lexicon(df):
    """Build a word -> polarity score lexicon from labelled, preprocessed text.

    Each word's score is the number of times it occurs in rows labelled
    'positive' minus the number of times it occurs in rows labelled
    'negative'. Occurrences in 'neutral' rows are counted but do not affect
    the score, so a word seen only in neutral rows scores 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Processed Text' column (whitespace-separated tokens)
        and a 'Sentiment' column with values 'positive', 'negative' or
        'neutral'. Rows whose 'Processed Text' is missing or not a string
        are skipped.

    Returns
    -------
    dict
        Mapping of word -> integer score (positive count - negative count).
        Words appear in first-seen order: positive rows first, then negative,
        then neutral.
    """
    sentiments = ('positive', 'negative', 'neutral')
    counts = {}

    # Walk the rows of each sentiment class directly instead of concatenating
    # all token lists with Series.sum(): that approach is quadratic and yields
    # the integer 0 (not a list) when a class has no rows.
    for sentiment in sentiments:
        texts = df.loc[df['Sentiment'] == sentiment, 'Processed Text']
        for text in texts:
            if not isinstance(text, str):
                continue
            for word in text.split():
                if word not in counts:
                    counts[word] = dict.fromkeys(sentiments, 0)
                counts[word][sentiment] += 1

    # Score = positive occurrences - negative occurrences (a count difference,
    # not a fixed +1/-1 label).
    return {
        word: per_class['positive'] - per_class['negative']
        for word, per_class in counts.items()
    }

In [13]:
# Generate the custom lexicon from the full labelled dataset
custom_lexicon = create_custom_lexicon(tweets_df)

# Display the lexicon to inspect the word scores (nothing is written to disk)
custom_lexicon

{'feeling': -36,
 'grateful': 102,
 'friend': 102,
 'family': 102,
 'best': 110,
 'day': 110,
 'ever': 110,
 'love': 120,
 'life': 120,
 'hate': -145,
 'thing': -145,
 'dont': -145,
 'go': -145,
 'way': -145,
 'disappointed': -138,
 'im': -139,
 'upset': -139,
 'right': -139,
 'going': 0,
 'walk': 0,
 'park': 0,
 'lunch': 0,
 'watching': 0,
 'movie': 0,
 'tonight': 0}

In [14]:
# Apply custom lexicon to calculate sentiment
def calculate_sentiment_custom(text, lexicon):
    """Label `text` by the sign of its summed lexicon scores.

    `text` is split on whitespace; each token contributes `lexicon[token]`,
    or 0 when the token is not in the lexicon. Returns 'positive' for a total
    above zero, 'negative' for a total below zero, and 'neutral' otherwise.
    """
    total = 0
    for token in text.split():
        total += lexicon.get(token, 0)

    if total > 0:
        return 'positive'
    if total < 0:
        return 'negative'
    return 'neutral'

In [15]:
# Apply custom lexicon to dataset: score each preprocessed tweet and store
# the predicted label in a new 'Custom Sentiment' column.
tweets_df['Custom Sentiment'] = tweets_df['Processed Text'].apply(lambda x: calculate_sentiment_custom(x, custom_lexicon))

# Compare predicted labels with the ground-truth 'Sentiment' column
tweets_df[['Text', 'Sentiment', 'Custom Sentiment']].head()

Unnamed: 0,Text,Sentiment,Custom Sentiment
0,Feeling grateful for my friends and family.,positive,positive
1,Going for a walk in the park.,neutral,neutral
2,I hate it when things don't go my way.,negative,negative
3,I hate it when things don't go my way.,negative,negative
4,This is the best day ever!,positive,positive


In [16]:
# Evaluate the accuracy of the custom lexicon: the fraction of rows whose
# predicted label equals the ground-truth label.
# NOTE(review): custom_lexicon was built from this same tweets_df, so this is
# an in-sample figure rather than a held-out estimate.
custom_accuracy = (tweets_df['Custom Sentiment'] == tweets_df['Sentiment']).mean()
print(f"Custom Lexicon Accuracy: {custom_accuracy * 100:.2f}%")

Custom Lexicon Accuracy: 92.40%


In [17]:
# Incorporate VADER (a public, general-purpose lexicon) for comparison with
# the custom lexicon. The analyzer needs the 'vader_lexicon' NLTK resource.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [18]:
# Initialize VADER once at module level; calculate_sentiment_vader reads this
# global analyzer.
vader = SentimentIntensityAnalyzer()

def calculate_sentiment_vader(text, analyzer=None, threshold=0.05):
    """Classify `text` as 'positive', 'negative' or 'neutral' using VADER.

    The label is derived from the analyzer's 'compound' score using VADER's
    documented convention, which is inclusive at the boundaries:
    compound >= threshold is positive, compound <= -threshold is negative,
    anything in between is neutral.

    Parameters
    ----------
    text : str
        Text to score. Passed to the analyzer unchanged.
    analyzer : object, optional
        Any object exposing `polarity_scores(text)` that returns a mapping
        with a 'compound' entry. Defaults to the notebook's module-level
        `vader` analyzer.
    threshold : float, optional
        Absolute compound-score cut-off separating neutral from polar labels.
        Defaults to 0.05.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.
    """
    if analyzer is None:
        # Fall back to the shared analyzer so existing single-argument calls
        # keep working.
        analyzer = vader

    compound = analyzer.polarity_scores(text)['compound']
    # Inclusive comparisons: a score of exactly +/-threshold counts as polar.
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'

# Apply VADER to dataset. VADER is run on the raw 'Text' column, not on
# 'Processed Text'.
tweets_df['VADER Sentiment'] = tweets_df['Text'].apply(calculate_sentiment_vader)

In [19]:
# Evaluate the accuracy of VADER: the fraction of rows whose VADER label
# equals the ground-truth label.
vader_accuracy = (tweets_df['VADER Sentiment'] == tweets_df['Sentiment']).mean()
print(f"VADER Accuracy: {vader_accuracy * 100:.2f}%")

# Preview both predictions next to the ground truth (nothing is written to disk)
tweets_df[['Text', 'Sentiment', 'Custom Sentiment', 'VADER Sentiment']].head()

VADER Accuracy: 92.40%


Unnamed: 0,Text,Sentiment,Custom Sentiment,VADER Sentiment
0,Feeling grateful for my friends and family.,positive,positive,positive
1,Going for a walk in the park.,neutral,neutral,neutral
2,I hate it when things don't go my way.,negative,negative,negative
3,I hate it when things don't go my way.,negative,negative,negative
4,This is the best day ever!,positive,positive,positive



NAME: Mohamed Moubarak Mohamed Misbahou Mkouboi<br>
MATRIC NO: P139575<br>