<a href="https://colab.research.google.com/github/Iannoh-png/Week-3-Assignment-AI-Module/blob/main/Amazon_Product_Review_Analysis_with_spaCy_(NER_%26_Rule_based_Sentiment).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import pandas as pd # Import pandas for potential CSV/delimited files if file format changes
import os # For checking file existence

# Load the small English spaCy model.
# If you haven't downloaded it yet, run: python -m spacy download en_core_web_sm
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model 'en_core_web_sm' loaded successfully.")
except OSError:
    print("spaCy model 'en_core_web_sm' not found. Please run 'python -m spacy download en_core_web_sm' and try again.")
    exit()

# --- Load Review Data from Files ---
# IMPORTANT:
# Make sure 'train.ft' and 'test.ft' are in the same directory as this script,
# or provide the full paths to these files.

file_paths = {
    'train': 'train.ft',  # <<< REPLACE WITH ACTUAL PATH IF NOT IN CURRENT DIR
    'test': 'test.ft'     # <<< REPLACE WITH ACTUAL PATH IF NOT IN CURRENT DIR
}

# Create dummy files for demonstration if they don't exist
# In a real scenario, you would have your actual data files.
for key, path in file_paths.items():
    if not os.path.exists(path):
        print(f"Creating a dummy file '{path}' for demonstration purposes.")
        with open(path, 'w', encoding='utf-8') as f:
            if key == 'train':
                f.write("I absolutely love the new Apple iPhone 15! The camera is incredible, and the battery life is surprisingly good. Highly recommend this amazing product.\n")
                f.write("The Samsung Galaxy S23 Ultra has an excellent display, but the software updates are a bit slow.\n")
                f.write("This cheap headphone from Sony broke quickly. Very disappointing quality.\n")
            else: # test file
                f.write("My new Dell XPS laptop is fantastic for work. Best purchase this year.\n")
                f.write("The Amazon Echo Dot is a convenient smart speaker, but sometimes it doesn't understand me.\n")
                f.write("Google Pixel 8 camera is good, but the battery drains fast.\n")
        print(f"Dummy file '{path}' created. Please replace its content with your actual review data or provide your own file.")


all_reviews_data = {}

for split_name, path in file_paths.items():
    current_reviews = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped_line = line.strip()
                if stripped_line: # Add non-empty lines
                    current_reviews.append(stripped_line)
        print(f"Successfully loaded {len(current_reviews)} reviews from '{path}'.")
        all_reviews_data[split_name] = current_reviews

    except FileNotFoundError:
        print(f"Error: The file '{path}' was not found. Please check the file path.")
        exit()
    except Exception as e:
        print(f"An error occurred while reading the file '{path}': {e}")
        exit()

# Check if any reviews were loaded
if not all_reviews_data.get('train') and not all_reviews_data.get('test'):
    print("No reviews found in either file. Please ensure the files contain review text.")
    exit()

# --- Process reviews from both train and test files ---

overall_results = {}

for split_name, reviews_list in all_reviews_data.items():
    print(f"\n--- Analyzing Reviews from {split_name.upper()} Dataset ({len(reviews_list)} reviews): ---\n")

    current_split_results = []

    for i, review_text in enumerate(reviews_list):
        print(f"\n--- Processing Review {i+1} ({split_name.upper()}): ---\n'{review_text}'")

        # --- Named Entity Recognition (NER) ---
        doc = nlp(review_text)

        current_entities = []
        print("--- Extracted Named Entities (Products & Brands): ---")
        found_entities_for_review = False
        for ent in doc.ents:
            if ent.label_ in ["ORG", "PRODUCT", "GPE", "NORP"]: # Added NORP which can sometimes capture nationality-based brands
                print(f"  Entity: '{ent.text}' (Type: {ent.label_})")
                current_entities.append(ent.text)
                found_entities_for_review = True
        if not found_entities_for_review:
            print("  No common product/brand entities found by spaCy's default NER for this review.")

        # --- Rule-based Sentiment Analysis ---
        positive_keywords = ["love", "incredible", "good", "amazing", "recommend", "great", "excellent", "fantastic", "happy", "best"]
        negative_keywords = ["high", "expensive", "bad", "poor", "terrible", "disappointing", "hate", "problem", "slow", "broke", "cheap", "drains"]

        lower_review = review_text.lower()
        positive_score = 0
        negative_score = 0

        for keyword in positive_keywords:
            positive_score += lower_review.count(keyword)

        for keyword in negative_keywords:
            negative_score += lower_review.count(keyword)

        print("\n--- Sentiment Analysis (Rule-based): ---")
        print(f"  Positive keywords found: {positive_score}")
        print(f"  Negative keywords found: {negative_score}")

        sentiment = "Neutral"
        if positive_score > negative_score:
            sentiment = "Positive"
        elif negative_score > positive_score:
            sentiment = "Negative"

        print(f"  Overall Sentiment: {sentiment}")

        current_split_results.append({
            "review": review_text,
            "extracted_entities": current_entities,
            "sentiment": sentiment
        })
    overall_results[split_name] = current_split_results

print("\n--- Overall Analysis Summary ---")
for split_name, results_list in overall_results.items():
    print(f"\n--- Summary for {split_name.upper()} Dataset ({len(results_list)} reviews) ---")
    for i, res in enumerate(results_list):
        print(f"\nReview {i+1}: '{res['review']}'")
        print(f"  Extracted Entities: {res['extracted_entities']}")
        print(f"  Determined Sentiment: {res['sentiment']}")

spaCy model 'en_core_web_sm' loaded successfully.
Creating a dummy file 'train.ft' for demonstration purposes.
Dummy file 'train.ft' created. Please replace its content with your actual review data or provide your own file.
Creating a dummy file 'test.ft' for demonstration purposes.
Dummy file 'test.ft' created. Please replace its content with your actual review data or provide your own file.
Successfully loaded 3 reviews from 'train.ft'.
Successfully loaded 3 reviews from 'test.ft'.

--- Analyzing Reviews from TRAIN Dataset (3 reviews): ---


--- Processing Review 1 (TRAIN): ---
'I absolutely love the new Apple iPhone 15! The camera is incredible, and the battery life is surprisingly good. Highly recommend this amazing product.'
--- Extracted Named Entities (Products & Brands): ---
  Entity: 'Apple' (Type: ORG)
  Entity: 'iPhone 15' (Type: PRODUCT)

--- Sentiment Analysis (Rule-based): ---
  Positive keywords found: 5
  Negative keywords found: 1
  Overall Sentiment: Positive

--- Pro