## Task 3

In [9]:
import spacy.cli
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import pandas as pd
import spacy
from spacy import displacy
from collections import defaultdict
import random

In [10]:
# File paths (assuming files are in working directory)
train_path = "train.ft.txt"
test_path = "test.ft.txt"

# Function to read large files efficiently
def read_large_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            yield line

# Load the dataset
try:
    print("Loading training data...")
    train_lines = []
    for i, line in enumerate(read_large_file(train_path)):
        if i >= 100000:  # Limit to first 100k lines for memory
            break
        train_lines.append(line)
    
    print("Loading test data...")
    test_lines = []
    for i, line in enumerate(read_large_file(test_path)):
        if i >= 20000:  # Limit to first 20k lines
            break
        test_lines.append(line)
    
    # Convert to DataFrame
    print("Processing data...")
    train_df = pd.DataFrame([line.split(' ', 1) for line in train_lines], columns=['sentiment', 'text'])
    test_df = pd.DataFrame([line.split(' ', 1) for line in test_lines], columns=['sentiment', 'text'])
    
    # Clean the data
    train_df['text'] = train_df['text'].str.strip()
    test_df['text'] = test_df['text'].str.strip()
    
    # Convert sentiment labels (__label__1 = negative, __label__2 = positive)
    train_df['sentiment'] = train_df['sentiment'].str.replace('__label__', '').map({'1': 'negative', '2': 'positive'})
    test_df['sentiment'] = test_df['sentiment'].str.replace('__label__', '').map({'1': 'negative', '2': 'positive'})
    
except FileNotFoundError:
    print("\nERROR: Could not find the files in the working directory.")
    print("Please ensure you have both files in your current directory:")
    print("- train.ft.txt")
    print("- test.ft.txt")
    raise

Loading training data...
Loading test data...
Processing data...


In [11]:
# Take a smaller sample for demonstration
sample_size = 500
train_sample = train_df.sample(sample_size, random_state=42)
test_sample = test_df.sample(sample_size, random_state=42)

In [12]:
# Combine samples for NER analysis
reviews = pd.concat([train_sample, test_sample])['text'].tolist()

# Load the English language model
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_sm")

# Add entity ruler for product names and brands
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define patterns for common products and brands
patterns = [
    {"label": "PRODUCT", "pattern": "Kindle"},
    {"label": "PRODUCT", "pattern": "Echo"},
    {"label": "PRODUCT", "pattern": "Fire TV"},
    {"label": "PRODUCT", "pattern": "Fire Stick"},
    {"label": "PRODUCT", "pattern": "iPad"},
    {"label": "PRODUCT", "pattern": "iPhone"},
    {"label": "PRODUCT", "pattern": "Galaxy"},
    {"label": "PRODUCT", "pattern": "MacBook"},
    {"label": "PRODUCT", "pattern": "PlayStation"},
    {"label": "PRODUCT", "pattern": "Xbox"},
    {"label": "PRODUCT", "pattern": "Nintendo Switch"},
    {"label": "BRAND", "pattern": [{"LOWER": "amazon"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "apple"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "samsung"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "sony"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "microsoft"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "lg"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "dell"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "hp"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "lenovo"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "logitech"}]},
    {"label": "BRAND", "pattern": [{"LOWER": "bose"}]}
]
ruler.add_patterns(patterns)

Loading spaCy model...


#### Enhanced sentiment analysis with rule-based approach

In [13]:
def analyze_sentiment(doc):
    # Define sentiment indicators
    positive_words = ["excellent", "amazing", "great", "love", "wonderful", 
                     "awesome", "perfect", "fantastic", "recommend", "good",
                     "best", "superb", "outstanding", "pleased", "happy",
                     "impressed", "satisfied", "working", "fine", "nice"]
    
    negative_words = ["terrible", "awful", "horrible", "bad", "disappointing",
                     "poor", "waste", "broke", "broken", "return", "junk",
                     "worst", "useless", "faulty", "defective", "avoid",
                     "disgusting", "unacceptable", "failed", "damaged", "crap"]
    
    # Count positive and negative indicators
    positive_count = sum(1 for token in doc if token.text.lower() in positive_words)
    negative_count = sum(1 for token in doc if token.text.lower() in negative_words)
    
    # Check for negation patterns
    for i, token in enumerate(doc):
        if token.text.lower() in ["not", "n't", "never", "no"] and i < len(doc) - 1:
            next_token = doc[i+1].text.lower()
            if next_token in positive_words:
                negative_count += 1
                positive_count = max(0, positive_count - 1)
            elif next_token in negative_words:
                positive_count += 1
                negative_count = max(0, negative_count - 1)
    
    # Determine sentiment
    if positive_count > negative_count:
        return "positive"
    elif negative_count > positive_count:
        return "negative"
    else:
        return "neutral"

#### Process reviews and analyze

In [14]:
print("Analyzing reviews...")
results = []
for review in random.sample(reviews, 50):  # Analyze 50 random reviews for demo
    try:
        doc = nlp(review)
        
        # Extract entities (filter for products and brands)
        entities = [(ent.text, ent.label_) for ent in doc.ents 
                   if ent.label_ in ["PRODUCT", "BRAND", "ORG"]]
        
        # Analyze sentiment
        sentiment = analyze_sentiment(doc)
        
        # Store results
        results.append({
            "review": review,
            "entities": entities,
            "sentiment": sentiment
        })
    except Exception as e:
        print(f"Error processing review: {e}")
        continue

Analyzing reviews...


#### Display results

In [15]:
print("\nReview Analysis Results:\n")
for i, result in enumerate(results[:5], 1):  # Show first 5 for brevity
    print(f"Review {i}:")
    print(f"Text: {result['review'][:200]}...")  # Print first 200 chars
    print(f"Entities: {result['entities']}")
    print(f"Sentiment: {result['sentiment']}\n")


Review Analysis Results:

Review 1:
Text: Just another excuse to make money: I am sorry but after reading this book, I have to admit that the feminists are right. Men are pigs and the only reason this book was written was for the authors to m...
Entities: []
Sentiment: positive

Review 2:
Text: look, have any of you actually written a paper on this book?: well I have, and this book has got to be one of the stupidest things I have ever tried to write about. It is basically about nothing, and ...
Entities: [('Allende and Amado', 'ORG')]
Sentiment: positive

Review 3:
Text: Rubbish: I actually thought that Ellen would be funnier. This is just rambling nonsence. Just as though she has scribbled down the first thing that came to mind. Not for me, I'm affraid. I couldn't ge...
Entities: []
Sentiment: neutral

Review 4:
Text: AN ARTIST'S LIFE: IRVING STONE HAS HIS READERS RELIVE VINCENT VAN GOGH'S LIFE.AFTER READING THE BOOK YOU CAN EASILY APPRECIATE THE TUMULTOUS LIFE THIS ARTIST HAD.THE AU

#### Visualize NER for a sample review

In [None]:
sample_review = random.choice(reviews)
doc = nlp(sample_review)
print("\nSample Review NER Visualization:")
displacy.render(doc, style="ent", jupyter=True)


Sample Review NER Visualization:


#### Entity frequency analysis

In [None]:
entity_freq = defaultdict(int)
for result in results:
    for entity, label in result['entities']:
        entity_freq[(entity, label)] += 1

print("\nTop 10 Most Common Entities:")
for (entity, label), count in sorted(entity_freq.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{label}: {entity} - {count} occurrences")


Top 10 Most Common Entities:
ORG: Lennie - 4 occurrences
ORG: KoRn - 2 occurrences
BRAND: Amazon - 2 occurrences
ORG: American Idol - 2 occurrences
ORG: Allende and Amado - 1 occurrences
ORG: Surly Cross Check - 1 occurrences
ORG: Trek - 1 occurrences
ORG: WIN XP - 1 occurrences
ORG: Motorola Bluetooth - 1 occurrences
ORG: Warner Home Video - 1 occurrences


#### Sentiment distribution

In [None]:
sentiment_counts = defaultdict(int)
for result in results:
    sentiment_counts[result['sentiment']] += 1

print("\nSentiment Distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count} reviews")


Sentiment Distribution:
positive: 28 reviews
neutral: 15 reviews
negative: 7 reviews


#### Compare our sentiment analysis with ground truth

In [None]:
print("\nComparing with Ground Truth Sentiment:")
comparison_results = []
for result in results:
    # Find the review in our dataframe to get ground truth
    ground_truth = None
    for df in [train_sample, test_sample]:
        match = df[df['text'].str.contains(result['review'][:30], regex=False, na=False)]  # Match first 30 chars
        if not match.empty:
            ground_truth = match.iloc[0]['sentiment']
            break
    
    if ground_truth:
        comparison_results.append({
            "our_sentiment": result['sentiment'],
            "true_sentiment": ground_truth
        })


Comparing with Ground Truth Sentiment:


#### Calculate accuracy if we have comparisons

In [None]:
if comparison_results:
    correct = sum(1 for res in comparison_results 
                if res['our_sentiment'] == res['true_sentiment'])
    total = len(comparison_results)
    print(f"\nOur sentiment analysis accuracy: {correct/total:.2%} ({correct}/{total})")
    
    # Show some examples
    print("\nExample Comparisons:")
    for res in comparison_results[:3]:
        print(f"Predicted: {res['our_sentiment']} | Actual: {res['true_sentiment']}")
else:
    print("Could not match reviews with ground truth for comparison")


Our sentiment analysis accuracy: 48.00% (24/50)

Example Comparisons:
Predicted: positive | Actual: negative
Predicted: positive | Actual: negative
Predicted: neutral | Actual: negative
