# Sentiment analysis (NLTK + weighted lexicon)

Sentiment classification of 6 student reviews into Positive/Neutral/Negative.


## Setup and imports


In [3]:
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# First-time downloads (safe to run multiple times).
nltk.download("punkt")
# Recent NLTK releases load the tokenizer models from the "punkt_tab" package
# rather than "punkt"; without it word_tokenize raises LookupError. Fetch both
# so the notebook runs on old and new installs alike.
nltk.download("punkt_tab")
nltk.download("stopwords")

# One stemmer instance shared by preprocess_text and stem_lexicon, so review
# tokens and lexicon entries are reduced to stems by the same algorithm.
stemmer = PorterStemmer()
# Stored as a set so the stopword filter is a constant-time membership test.
STOPWORDS = set(stopwords.words("english"))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gunja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gunja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data (reviews)


In [4]:
# Raw review texts to classify. Position in this list determines the 1-based
# review_id assigned when the results table is built.
reviews = [
    "The library is absolutely amazing! The staff are incredibly helpful, the Wi-Fi is fast, and there are plenty of quiet study areas. I'm so grateful to have this resource on campus.",
    "Terrible experience today. Too crowded, couldn't find a seat anywhere, and the computers are constantly freezing. Very disappointing and frustrating.",
    "The library has the books I need and the computers usually work. It's adequate for my studies but nothing exceptional. Could use better facilities.",
    "Brilliant new study pods! Perfect for group work and very quiet. The extended opening hours during exam season are fantastic. Really impressed with the improvements.",
    "Very disappointed with noise levels during peak hours. People talking loudly, not enough enforcement of quiet zones. The atmosphere is too chaotic for serious studying.",
    "Excellent resource! Best place on campus for concentrated work. The librarians are supportive and knowledgeable. Highly recommend for any student needing a productive study environment."
]


## Weighted sentiment lexicon

In [5]:
# Weighted lexicon (coursework-style)
# Each dict maps a surface word to an integer weight (1 or 2 here). The words
# are passed through stem_lexicon before scoring, so they are matched by stem
# rather than by exact spelling.
positive_words = {
    "amazing": 2, "excellent": 2, "brilliant": 2, "fantastic": 2, "helpful": 2,
    "grateful": 1, "impressed": 1, "best": 2, "love": 2, "perfect": 2,
    "wonderful": 2, "great": 1, "good": 1, "recommend": 2, "fast": 1,
    "supportive": 2, "knowledgeable": 1, "productive": 1, "adequate": 1
}

# Weights are stored as positive magnitudes; the scorer subtracts the negative
# total from the positive total.
# NOTE(review): "frustrated" (1) and "frustrating" (2) carry different weights
# but presumably reduce to the same stem -- confirm which weight is intended.
negative_words = {
    "terrible": 2, "disappointing": 2, "disappointed": 2, "frustrated": 1,
    "frustrating": 2, "crowded": 2, "freezing": 1, "chaotic": 2,
    "inadequate": 1, "poor": 1, "bad": 1, "awful": 2, "horrible": 2,
    "useless": 2, "noise": 1, "problem": 1
}


## Preprocessing (NLTK)

In [6]:
def preprocess_text(text: str):
    """Turn raw review text into a list of stemmed content words.

    The text is tokenized with NLTK's ``word_tokenize``; tokens made up only
    of letters are lowercased, English stopwords are dropped, and each
    remaining word is reduced to its stem.
    """
    kept = []
    for raw in word_tokenize(text):
        # Skip anything that is not purely alphabetic (punctuation, digits,
        # tokens containing hyphens or apostrophes).
        if not raw.isalpha():
            continue

        word = raw.lower()
        if word in STOPWORDS:
            continue

        kept.append(stemmer.stem(word))

    return kept


## Scoring + classification

In [7]:
def stem_lexicon(lexicon: dict, *, stem=None) -> dict:
    """Return a copy of ``lexicon`` keyed by word stem instead of surface word.

    Args:
        lexicon: Mapping of word -> numeric weight.
        stem: Optional callable mapping a word to its stem. Defaults to the
            module-level ``stemmer.stem`` so lexicon keys match the tokens
            produced by ``preprocess_text``.

    Returns:
        A new dict mapping each stem to a weight. When several words share a
        stem, the largest of their weights is kept, so the result does not
        depend on the order in which the words were listed.
    """
    if stem is None:
        stem = stemmer.stem

    stemmed = {}
    for word, weight in lexicon.items():
        key = stem(word)
        # A plain dict comprehension would let the last colliding entry win;
        # keep the strongest weight instead so ordering is irrelevant.
        if key not in stemmed or weight > stemmed[key]:
            stemmed[key] = weight
    return stemmed

# Stem both lexicons once, up front, so the scorer looks tokens up in the same
# stemmed form that preprocess_text produces.
POS_LEX, NEG_LEX = (
    stem_lexicon(word_weights) for word_weights in (positive_words, negative_words)
)

def classify_sentiment_weighted(text: str, *, neutral_margin: int = 0):
    """Score ``text`` against the weighted lexicons and assign a label.

    Args:
        text: Raw review text; it is tokenized and stemmed by
            ``preprocess_text`` before lookup.
        neutral_margin: Half-width of the neutral band around zero. A total
            score within ``[-neutral_margin, neutral_margin]`` is labelled
            "Neutral". The default of 0 means only an exact tie is neutral,
            so a single weak match (total score of 1) is labelled "Positive".

    Returns:
        A tuple ``(pos_score, neg_score, total_score, label)`` where
        ``total_score`` is ``pos_score - neg_score`` and ``label`` is one of
        "Positive", "Negative" or "Neutral".

    Raises:
        ValueError: If ``neutral_margin`` is negative.
    """
    if neutral_margin < 0:
        raise ValueError("neutral_margin must be non-negative")

    tokens = preprocess_text(text)

    # Tokens missing from a lexicon contribute 0 to that side's score.
    pos_score = sum(POS_LEX.get(tok, 0) for tok in tokens)
    neg_score = sum(NEG_LEX.get(tok, 0) for tok in tokens)

    total_score = pos_score - neg_score

    # Simple rule-based label with a configurable neutral band.
    if total_score > neutral_margin:
        label = "Positive"
    elif total_score < -neutral_margin:
        label = "Negative"
    else:
        label = "Neutral"

    return pos_score, neg_score, total_score, label


## Results and summary statistics


In [8]:
# Build one record per review: its 1-based id, the raw text, and the four
# values returned by the classifier.
rows = []
for review_id, review_text in enumerate(reviews, start=1):
    scores = classify_sentiment_weighted(review_text)
    record = {"review_id": review_id, "review": review_text}
    record.update(zip(("pos_score", "neg_score", "total_score", "label"), scores))
    rows.append(record)

df = pd.DataFrame(rows)
# Display the scores without the long review text column.
df[["review_id", "pos_score", "neg_score", "total_score", "label"]]


Unnamed: 0,review_id,pos_score,neg_score,total_score,label
0,1,6,0,6,Positive
1,2,0,9,-9,Negative
2,3,1,0,1,Positive
3,4,7,0,7,Positive
4,5,0,5,-5,Negative
5,6,10,0,10,Positive


In [9]:
# Count reviews per label, then add each label's share of all reviews as a
# percentage rounded to one decimal place.
summary = (
    df["label"]
    .value_counts()
    .rename_axis("label")
    .reset_index(name="count")
    .assign(
        percent=lambda counts: (counts["count"] / counts["count"].sum() * 100).round(1)
    )
)
summary


Unnamed: 0,label,count,percent
0,Positive,4,66.7
1,Negative,2,33.3


## Notes / limitations


- This is a rule-based approach, so it depends on the coverage/quality of the lexicon.
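- Stemming simplifies matching but may lose nuance: different forms of a word (e.g. "frustrated" and "frustrating") collapse to a single stem and therefore share one weight.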
- Next step (separate ML project): TF-IDF + logistic regression baseline, then compare performance.
