In [None]:
# 📘 Task 3: NLP with spaCy – Amazon Reviews (Train + Test)
# ================================================
# Goal:
# - Perform Named Entity Recognition (NER) to extract product/brand names
# - Analyze sentiment using a simple rule-based approach
# - Handle large train/test text datasets efficiently by loading samples
# ================================================

# STEP 1: Install dependencies
%pip install spacy pandas tqdm

# Download the English NLP model (only once)
!python -m spacy download en_core_web_sm

# STEP 2: Import modules
import spacy
import pandas as pd
from tqdm import tqdm

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')




In [2]:
# STEP 3: Load partial data from train/test files
# ================================================
train_path = "Data sets/train.ft.txt"
test_path = "Data sets/test.ft.txt"

# Number of lines to sample from each file
sample_size = 8000  # Adjust (e.g., 5000, 10000) depending on RAM

def load_sample(file_path, sample_size):
    """Load only a subset of a large text file."""
    reviews = []
    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= sample_size:
                break
            if line.strip():
                reviews.append(line.strip())
    print(f"✅ Loaded {len(reviews)} lines from {file_path}")
    return reviews

train_reviews = load_sample(train_path, sample_size)
test_reviews = load_sample(test_path, int(sample_size / 2))  # smaller test sample

✅ Loaded 8000 lines from Data sets/train.ft.txt
✅ Loaded 4000 lines from Data sets/test.ft.txt


In [3]:
# STEP 4: Rule-Based Sentiment Analysis
# ================================================
positive_words = [
    "love", "great", "amazing", "good", "excellent", "happy",
    "fast", "satisfied", "recommend", "perfect", "awesome"
]
negative_words = [
    "bad", "terrible", "poor", "worst", "slow", "hate",
    "disappointed", "not good", "dies", "broken", "sad"
]

def get_sentiment(text):
    text_lower = text.lower()
    pos = sum(word in text_lower for word in positive_words)
    neg = sum(word in text_lower for word in negative_words)
    if pos > neg:
        return "Positive"
    elif neg > pos:
        return "Negative"
    else:
        return "Neutral"

In [4]:
# STEP 5: Function for NER + Sentiment
# ================================================
def analyze_reviews(reviews, dataset_name="Data"):
    results = []
    for review in tqdm(reviews, desc=f"Processing {dataset_name}"):
        doc = nlp(review)
        entities = [ent.text for ent in doc.ents if ent.label_ in ["PRODUCT", "ORG"]]
        sentiment = get_sentiment(review)
        results.append({
            "review": review,
            "entities": entities,
            "sentiment": sentiment
        })
    return results

# Run analysis on both datasets
train_results = analyze_reviews(train_reviews, "Train Set")
test_results = analyze_reviews(test_reviews, "Test Set")

Processing Train Set: 100%|██████████| 8000/8000 [05:35<00:00, 23.83it/s]
Processing Test Set: 100%|██████████| 4000/4000 [02:46<00:00, 23.98it/s]


In [5]:
# STEP 6: Display sample outputs
# ================================================
print("\n🔹 SAMPLE TRAIN RESULTS:")
for i, res in enumerate(train_results[:3]):
    print(f"\nReview {i+1}: {res['review']}")
    print(f"Entities: {res['entities']}")
    print(f"Sentiment: {res['sentiment']}")
    print("=" * 70)

print("\n🔹 SAMPLE TEST RESULTS:")
for i, res in enumerate(test_results[:3]):
    print(f"\nReview {i+1}: {res['review']}")
    print(f"Entities: {res['entities']}")
    print(f"Sentiment: {res['sentiment']}")
    print("=" * 70)



🔹 SAMPLE TRAIN RESULTS:

Review 1: __label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
Entities: ['Chrono Cross']
Sentiment: Negative

Review 2: __label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is 

In [6]:
# STEP 7: Save outputs for report
# ================================================
pd.DataFrame(train_results).to_csv("train_analysis_sample.csv", index=False, encoding="utf-8")
pd.DataFrame(test_results).to_csv("test_analysis_sample.csv", index=False, encoding="utf-8")

print("\n✅ Analysis complete! Files saved:")
print("- train_analysis_sample.csv")
print("- test_analysis_sample.csv")


✅ Analysis complete! Files saved:
- train_analysis_sample.csv
- test_analysis_sample.csv


In [None]:
# STEP 8: Ethical Considerations(Report)
# ================================================
"""
Ethical Considerations:
- The Amazon Reviews dataset may reflect bias in product reviews, such as
  regional language differences or sentiment imbalance across brands.
- A rule-based sentiment system might misinterpret sarcasm or slang.
- spaCy’s NER can misidentify entities if product names overlap with common words.
- Tools like TensorFlow Fairness Indicators can help visualize bias across
  product types (e.g., electronics vs. beauty items) and guide fair adjustments.
"""