# Week 4: Exercise Solutions - Unsupervised Learning Techniques

**Web and Social Network Analytics**

---

This notebook contains complete solutions for all exercises. **Try to solve them yourself first!**

## Setup

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
from itertools import combinations

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Machine Learning
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine

# Visualization
import matplotlib.pyplot as plt

print('All libraries imported successfully!')

---

## Exercise 1 Solution: Sentiment Analysis with VADER

In [None]:
# Step 1: Define the reviews
# Five short product reviews used as input for the VADER analyzer below.
# They mix capitalised words, repeated punctuation and an emoticon.
reviews = [
    "This product is absolutely AMAZING! Best purchase ever!!!",
    "Meh, it's okay. Nothing special.",
    "Terrible quality. Completely disappointed :(",
    "Pretty good value for the price, would recommend.",
    "DO NOT BUY! Worst experience of my life!!!"
]

# Step 2: Initialize VADER analyzer
# A single analyzer instance is created once and reused for every review.
analyzer = SentimentIntensityAnalyzer()

# Step 3: Define classification function
def classify_sentiment(compound):
    """Map a VADER compound score to a sentiment label.

    Uses the cut-offs given in the VADER documentation, which are
    inclusive at the boundaries:
    - Positive: compound >= 0.05
    - Negative: compound <= -0.05
    - Neutral:  -0.05 < compound < 0.05

    Args:
        compound: Normalized compound score, expected in [-1, 1].

    Returns:
        One of "Positive", "Negative" or "Neutral".
    """
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# Step 4: Analyze each review
print("Sentiment Analysis Results")
print("-" * 70)
print(f"{'Review':<45} {'Compound':>10} {'Class':>12}")
print("-" * 70)

pos_count, neu_count, neg_count = 0, 0, 0

for review in reviews:
    # Get VADER scores; only the normalized compound value is used here
    scores = analyzer.polarity_scores(review)
    compound = scores['compound']

    # Classify
    classification = classify_sentiment(compound)

    # Count
    if classification == "Positive":
        pos_count += 1
    elif classification == "Negative":
        neg_count += 1
    else:
        neu_count += 1

    # Truncate long reviews so they fit the 45-character column.
    # Deliberately NOT named `display`: a top-level variable with that name
    # would shadow IPython's display() helper for the rest of the session.
    review_text = review[:42] + "..." if len(review) > 45 else review
    print(f"{review_text:<45} {compound:>10.3f} {classification:>12}")

# Step 5: Print summary
print("-" * 70)
print(f"\nSummary: {pos_count} Positive, {neu_count} Neutral, {neg_count} Negative")

**Key Insights**:
- VADER correctly identifies strong positive ("AMAZING") and negative ("Terrible", "Worst") reviews
- Capitalization and punctuation (!!!) amplify sentiment scores
- Emoticons like `:(` are recognized as negative

---

## Exercise 2 Solution: Jaccard Similarity Calculation

In [None]:
# Step 1: Define user purchases
# Maps each user name to the set of products they bought. Sets are used so
# that intersection (&) and union (|) are available for the Jaccard formula.
users = {
    'Alice': {'iPhone', 'AirPods', 'MacBook', 'iPad'},
    'Bob': {'iPhone', 'AirPods', 'Galaxy Watch'},
    'Carol': {'MacBook', 'iPad', 'iMac'},
    'Dave': {'iPhone', 'AirPods', 'MacBook', 'iPad', 'iMac'}
}

# Step 2: Implement Jaccard similarity function
def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two collections of items.

    Formula: J(A,B) = |A ∩ B| / |A ∪ B|

    Args:
        set1: First collection (a set, or any iterable of hashable items).
        set2: Second collection (a set, or any iterable of hashable items).

    Returns:
        A float between 0.0 (no overlap) and 1.0 (identical sets). Two empty
        collections have an undefined ratio; 0.0 is returned in that case.
    """
    # Coerce so lists/tuples work too; duplicates are irrelevant for Jaccard
    left, right = set(set1), set(set2)
    union_size = len(left | right)
    if union_size == 0:
        return 0.0  # always a float, also in the degenerate case
    return len(left & right) / union_size

# Step 3: Calculate similarity for all pairs
def format_products(products):
    """Render a product set as a sorted, comma-separated string ('-' if empty).

    Sorting makes the output identical on every run; the raw set repr depends
    on string hashing and can change order between kernel sessions.
    """
    return ", ".join(sorted(products)) or "-"

# Column widths shared by header and rows so the table always lines up
pair_w, common_w, union_w, sim_w = 14, 32, 6, 8
table_width = pair_w + common_w + union_w + sim_w + 3  # +3 for the column gaps

print("Jaccard Similarities:")
print("-" * table_width)
print(f"{'Pair':<{pair_w}} {'Intersection':<{common_w}} {'Union':<{union_w}} {'Jaccard':>{sim_w}}")
print("-" * table_width)

similarities = {}
user_names = list(users.keys())

for user1, user2 in combinations(user_names, 2):
    set1, set2 = users[user1], users[user2]
    intersection = set1 & set2
    union = set1 | set2
    sim = jaccard_similarity(set1, set2)
    similarities[(user1, user2)] = sim

    # Pad the whole "A-B" label, not just the second name, so that names of
    # different lengths do not shift the remaining columns.
    pair_label = f"{user1}-{user2}"
    print(f"{pair_label:<{pair_w}} {format_products(intersection):<{common_w}} "
          f"{len(union):<{union_w}} {sim:>{sim_w}.3f}")

# Step 4: Find most and least similar pairs
# On ties, max()/min() return the first pair encountered.
most_similar = max(similarities, key=similarities.get)
least_similar = min(similarities, key=similarities.get)

print("\n" + "="*50)
print(f"Most similar pair: {most_similar[0]} & {most_similar[1]}")
print(f"  Jaccard similarity: {similarities[most_similar]:.3f}")
print(f"  Common products: {format_products(users[most_similar[0]] & users[most_similar[1]])}")

print(f"\nLeast similar pair: {least_similar[0]} & {least_similar[1]}")
print(f"  Jaccard similarity: {similarities[least_similar]:.3f}")
print(f"  Common products: {format_products(users[least_similar[0]] & users[least_similar[1]])}")

**Key Insights**:
- Alice and Dave are most similar (4 products in common out of 5 total = 0.8)
- Bob and Carol are least similar (no products in common out of 6 total = 0.0)
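- Jaccard only looks at products that at least one of the two users bought: products bought by just one of them lower the score, while products that neither bought are ignored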

---

## Exercise 3 Solution: Support, Confidence, and Lift Calculation

In [None]:
# Step 1: Define transactions
# Eight market baskets over the items bread, milk, eggs and butter.
transactions = [
    ['bread', 'milk', 'eggs'],
    ['bread', 'butter'],
    ['milk', 'eggs', 'butter'],
    ['bread', 'milk', 'eggs', 'butter'],
    ['bread', 'milk'],
    ['eggs', 'butter'],
    ['bread', 'milk', 'butter'],
    ['bread', 'eggs'],
]

print(f"Total transactions: {len(transactions)}")
# Label the baskets T1..Tn (1-based) for easy reference
for basket_no, basket in enumerate(transactions, start=1):
    print(f"  T{basket_no}: {basket}")

In [None]:
# Step 2: Implement support function
def support(itemset, transactions):
    """Calculate the support of an itemset.

    Support = (# transactions containing itemset) / (total # transactions)

    Args:
        itemset: A single item given as a string, or an iterable of items.
        transactions: Sequence of transactions, each an iterable of items.

    Returns:
        Fraction of transactions that contain every item of `itemset`,
        as a float in [0, 1]. Returns 0.0 when there are no transactions
        (instead of raising ZeroDivisionError).
    """
    # Handle single item as string (otherwise it would be split into characters)
    if isinstance(itemset, str):
        itemset = [itemset]

    total = len(transactions)
    if total == 0:
        return 0.0

    # Build the lookup set once rather than once per transaction
    wanted = set(itemset)
    hits = sum(1 for trans in transactions if wanted.issubset(trans))
    return hits / total

# Step 3: Calculate individual item supports
print("Individual Item Support:")
print("-" * 40)
items = ['bread', 'milk', 'eggs', 'butter']
for item in items:
    sup = support(item, transactions)
    # round(), not int(): the float product can land just below the true
    # count (e.g. 28.999999999999996), which int() would truncate.
    count = round(sup * len(transactions))
    print(f"  support({item}) = {count}/{len(transactions)} = {sup:.3f}")

In [None]:
# Step 4: Calculate support for {bread, milk}
print("\nPair Support:")
print("-" * 40)
sup_bread_milk = support(['bread', 'milk'], transactions)
print(f"  support({{bread, milk}}) = {sup_bread_milk:.3f}")
# round(), not int(): recovering the count from a float fraction must not
# truncate a product that lands just below the true integer.
print(f"  Appears in {round(sup_bread_milk * len(transactions))} out of {len(transactions)} transactions")

In [None]:
# Step 5: Calculate confidence values
print("\nConfidence:")
print("-" * 40)

# Supports needed by both rules, computed once up front
pair_support = support(['bread', 'milk'], transactions)
bread_support = support('bread', transactions)
milk_support = support('milk', transactions)

# confidence(X -> Y) = support({X, Y}) / support({X})
conf_bread_milk = pair_support / bread_support
conf_milk_bread = pair_support / milk_support

# bread -> milk
print(f"  confidence(bread -> milk) = {sup_bread_milk:.3f} / {bread_support:.3f} = {conf_bread_milk:.3f}")
print(f"  Interpretation: {conf_bread_milk*100:.0f}% of customers who buy bread also buy milk")

# milk -> bread (same numerator, different antecedent in the denominator)
print(f"\n  confidence(milk -> bread) = {sup_bread_milk:.3f} / {milk_support:.3f} = {conf_milk_bread:.3f}")
print(f"  Interpretation: {conf_milk_bread*100:.0f}% of customers who buy milk also buy bread")

In [None]:
# Step 6: Calculate lift
print("\nLift:")
print("-" * 40)

# Lift(bread -> milk) = support({bread, milk}) / (support({bread}) * support({milk}))
sup_bread = support('bread', transactions)
sup_milk = support('milk', transactions)
lift_bread_milk = sup_bread_milk / (sup_bread * sup_milk)

print(f"  lift(bread -> milk) = {sup_bread_milk:.3f} / ({sup_bread:.3f} * {sup_milk:.3f})")
print(f"                      = {sup_bread_milk:.3f} / {sup_bread * sup_milk:.3f}")
print(f"                      = {lift_bread_milk:.3f}")

# Step 7: Interpret results
print("\n" + "="*40)
print("Interpretation:")
print("="*40)
# Test for independence first and with a tolerance: lift is a ratio of
# floats, so an independent pair may come out as 0.9999999999999999 and
# would otherwise fall into the "< 1" branch.
if np.isclose(lift_bread_milk, 1.0):
    print(f"  Lift = {lift_bread_milk:.3f} = 1")
    print("  Bread and milk are INDEPENDENT")
elif lift_bread_milk > 1:
    print(f"  Lift = {lift_bread_milk:.3f} > 1")
    print("  Bread and milk are POSITIVELY ASSOCIATED")
    print(f"  Customers who buy bread are {lift_bread_milk:.1f}x more likely to buy milk")
    print("  than expected if purchases were independent.")
else:
    print(f"  Lift = {lift_bread_milk:.3f} < 1")
    print("  Bread and milk are SUBSTITUTES")

---

## Exercise 4 Solution: A-Priori Algorithm Implementation

In [None]:
# Step 1: Implement mingle function
def mingle(items, level):
    """Generate candidate itemsets of size `level` from `items`.

    For level 2, `items` holds single items which are paired up.
    For level 3+, `items` holds itemsets (iterables of items) which are
    merged pairwise.

    Args:
        items: Iterable of single items (level 2) or of itemsets (level 3+).
        level: Required size of every generated candidate.

    Returns:
        A set of frozensets, each containing exactly `level` items.
        Frozensets are used because they are hashable and can live in a set.
    """
    outcome = set()

    # combinations() yields each unordered pair once; visiting (a, b) and
    # (b, a) separately would only rebuild the same candidate.
    for first, second in combinations(items, 2):
        if first != second:
            if level > 2:
                # Merge two existing itemsets
                merged = frozenset(first) | frozenset(second)
            else:
                # Pair two single items (kept whole, never iterated)
                merged = frozenset((first, second))

            # Only keep if it's the right size
            if len(merged) == level:
                outcome.add(merged)

    return outcome

# Test mingle
assert mingle(["a","b","c"], 2) == {frozenset({'a', 'c'}), 
                                     frozenset({'b', 'c'}), 
                                     frozenset({'a', 'b'})}
print("mingle() test passed!")

In [None]:
# Step 2: Implement support function for levels
def support_level(itemset, transactions, level):
    """Calculate support of an itemset at a given level.

    Level 1: itemset is a single item (e.g. a string) looked up as a whole.
    Level 2+: itemset is a collection of items that must all be present.

    Args:
        itemset: A single item (level 1) or an iterable of items (level 2+).
        transactions: Sequence of transactions, each supporting `in` lookups.
        level: Size of the itemsets being evaluated.

    Returns:
        Fraction of transactions containing the itemset, as a float.
        Returns 0.0 when there are no transactions (instead of raising
        ZeroDivisionError).
    """
    total = len(transactions)
    if total == 0:
        return 0.0

    if level > 1:
        # Every member of the itemset must occur in the transaction
        count = sum(
            1 for trans in transactions
            if all(item in trans for item in itemset)
        )
    else:
        # Single item check
        count = sum(1 for trans in transactions if itemset in trans)

    return count / total

# Test support
test_trans = [["a","b","c"], ["a","b","d"], ["b","c"], ["a","c"]]
assert support_level("a", test_trans, 1) == 0.75
assert support_level(["a","b"], test_trans, 2) == 0.5
print("support() tests passed!")

In [None]:
# Step 3: Implement apriori function
def apriori(level, transactions, items, minsup):
    """Run the A-Priori frequent-itemset search, printing each level.

    For every level, starting at `level`:
    1. Compute the support of each candidate with support_level()
    2. Retain the candidates whose support is at least `minsup`
    3. Build the next level's candidates from the retained ones via mingle()
    4. Stop once no candidates remain (or the level bound is reached)

    Args:
        level: Size of the itemsets in `items` (1 for single items).
        transactions: Sequence of transactions to count against.
        items: Candidates for the starting level.
        minsup: Minimum support (fraction) a candidate needs to be kept.

    Returns:
        None. All results are reported through print().
    """
    banner = '=' * 50
    candidates = items

    while True:
        print(f"\n{banner}")
        print(f"Level {level}")
        print(f"{banner}")

        # Score every candidate and keep the frequent ones
        retain = set()
        for candidate in candidates:
            sup = support_level(candidate, transactions, level)
            frequent = sup >= minsup
            status = "KEEP" if frequent else "DROP"
            print(f"  {str(candidate):25} support: {sup:.2f}  [{status}]")
            if frequent:
                retain.add(candidate)

        print(f"\nRetained: {retain}")

        # Build the candidates for the next level
        level += 1
        newsets = mingle(retain, level)
        print(f"New candidates for level {level}: {newsets}")

        # Same stop rule as before: nothing new, or the level bound is hit
        if len(newsets) == 0 or not level < len(candidates) + 1:
            return

        candidates = newsets

In [None]:
# Step 4: Load baskets data and run A-Priori
# NOTE: this cell rebinds `transactions` and `items`, which Exercise 3 also
# uses. Re-run the Exercise 3 cells from the top before revisiting them.
transactions = []
items = set()

# `with` closes the file even if parsing raises part-way through
with open('data/baskets.csv', 'r') as file:
    for line in file:
        # strip() removes '\n' as well as '\r' from Windows line endings;
        # stripping each entry keeps " milk" and "milk" from being two items
        litems = [entry.strip() for entry in line.strip().split(',')]
        litems = [entry for entry in litems if entry]
        if not litems:
            continue  # blank line: not a transaction
        transactions.append(litems)
        items.update(litems)

print(f"Loaded {len(transactions)} transactions")
print(f"Unique items: {items}")
print("\nTransactions:")
for i, t in enumerate(transactions, 1):
    print(f"  T{i}: {t}")

# Run A-Priori with minSup = 60%
print("\n" + "#"*50)
print("A-PRIORI ALGORITHM (minSup = 60%)")
print("#"*50)
apriori(1, transactions, items, 0.6)

**Key Insights**:
- The A-Priori algorithm uses the "anti-monotone" property: if an itemset doesn't meet minSup, none of its supersets will
- This allows efficient pruning of the search space
- The algorithm terminates when no new candidates can be generated

---

## Exercise 5 Solution: Collaborative Filtering Recommendation

In [None]:
# Step 1: Load ratings data
# Only the first 5000 rows are kept so the later per-user loops stay fast.
ratings = pd.read_csv('data/ratings.csv').iloc[:5000]  # Sample for speed

# Number of distinct movies / users present in the sample
noMovies = len(pd.unique(ratings['movieId']))
noUsers = len(pd.unique(ratings['userId']))

print(f"Dataset: {noMovies} movies rated by {noUsers} users")
print(f"Total ratings: {len(ratings)}")
print(ratings.head())

In [None]:
# Step 2: Create utility matrix
# Rows are users, columns are movies, 0 means "not rated".
utility = np.zeros(shape=(noUsers, noMovies))

# Map movie IDs to sequential column indices
# (Movie IDs are not sequential, so we need a mapping)
movieIds = {mid: i for i, mid in enumerate(ratings['movieId'].unique())}

# Reverse mapping for later use
movieIds_reverse = {v: k for k, v in movieIds.items()}

# Map user IDs to row indices as well. Using `userId - 1` directly only works
# when the IDs are exactly 1..noUsers; any gap would raise IndexError or write
# to the wrong row. Sorting keeps row == userId - 1 whenever the IDs ARE
# contiguous and 1-based, so existing results are unchanged in that case.
userIds = {uid: i for i, uid in enumerate(sorted(ratings['userId'].unique()))}

# Populate the matrix (itertuples keeps the integer ID dtypes, unlike iterrows)
for row in ratings.itertuples(index=False):
    utility[userIds[row.userId], movieIds[row.movieId]] = row.rating

print(f"Utility matrix shape: {utility.shape}")
print(f"Non-zero entries: {(utility != 0).sum()}")
print(f"Sparsity: {(utility == 0).sum() / utility.size * 100:.1f}% empty")

In [None]:
# Step 3: Implement findSimilarUsers
def findSimilarUsers(person_number, utility_matrix, minCos=0.5):
    """Return the users whose rating vector is close to the target user's.

    Closeness is cosine similarity between rows of the utility matrix.

    Args:
        person_number: Row index of the target user.
        utility_matrix: User-item rating matrix (one row per user).
        minCos: Similarity a user must exceed (strictly) to be returned.

    Returns:
        List of (row_index, similarity) tuples, highest similarity first.
        Users without any rating are never returned, and an unrated target
        yields an empty list.
    """
    target_ratings = utility_matrix[person_number]
    matches = []

    # An all-zero target has no direction, so nobody can be similar to it
    if not np.any(target_ratings):
        return matches

    for candidate in range(len(utility_matrix)):
        if candidate == person_number:
            continue

        candidate_ratings = utility_matrix[candidate]
        # All-zero rows would make the cosine undefined (division by zero)
        if not np.any(candidate_ratings):
            continue

        # scipy's cosine() is a DISTANCE; similarity is its complement
        cos_sim = 1 - cosine(target_ratings, candidate_ratings)
        if cos_sim > minCos:
            matches.append((candidate, cos_sim))

    # Stable descending sort: ties keep their row order
    matches.sort(key=lambda match: match[1], reverse=True)
    return matches

# Test with user 0
similar = findSimilarUsers(0, utility, minCos=0.3)
print(f"Found {len(similar)} similar users for User 0")

# Show only the five closest matches (the list is already sorted)
print("\nTop 5 similar users:")
top_matches = similar[:5]
for uid, sim in top_matches:
    print(f"  User {uid}: similarity = {sim:.3f}")

In [None]:
# Step 4: Implement findNewProducts
def findNewProducts(similar_users, person_number, utility_matrix, minScore=2.0):
    """Suggest unrated movies that the target's similar users rated highly.

    A movie qualifies when the target user has no rating for it (entry is 0)
    and the plain average of the positive ratings given by the similar users
    is strictly greater than `minScore`. The similarity values themselves are
    not used as weights.

    Args:
        similar_users: Iterable of (row_index, similarity) tuples.
        person_number: Row index of the target user.
        utility_matrix: User-item rating matrix (0 means "not rated").
        minScore: Average rating a movie must exceed to be recommended.

    Returns:
        List of (movie_index, predicted_score) tuples, best score first.
    """
    picks = []

    for movie in range(utility_matrix.shape[1]):
        # Movies the user already rated are not candidates
        if utility_matrix[person_number, movie] != 0:
            continue

        # Ratings for this movie from similar users who actually rated it
        neighbour_ratings = [utility_matrix[user_id, movie] for user_id, _ in similar_users]
        scores = [rating for rating in neighbour_ratings if rating > 0]
        if not scores:
            continue

        avg_score = sum(scores) / len(scores)
        if avg_score > minScore:
            picks.append((movie, avg_score))

    # Stable descending sort: ties keep their column order
    picks.sort(key=lambda pick: pick[1], reverse=True)
    return picks

# Generate recommendations for user 0
recs = findNewProducts(similar, 0, utility, minScore=3.5)

divider = "-" * 50
print("\nTop 10 recommendations for User 0 (minScore=3.5):")
print(divider)
print(f"{'Movie Index':<15} {'Predicted Score':>15}")
print(divider)

# recs is already sorted best-first, so the first ten are the top ten
top_recs = recs[:10]
for movie_idx, score in top_recs:
    print(f"{movie_idx:<15} {score:>15.2f}")

In [None]:
# Bonus: Run for multiple users
print("\nRecommendation Summary:")
print("="*50)

# Cover the first five users, or all of them if there are fewer
users_to_show = min(5, noUsers)

for user_id in range(users_to_show):
    similar = findSimilarUsers(user_id, utility, minCos=0.3)
    recs = findNewProducts(similar, user_id, utility, minScore=3.5)

    print(f"User {user_id}: {len(similar)} similar users, {len(recs)} recommendations")
    if not recs:
        continue

    # recs is sorted best-first, so the head is the top recommendation
    top_movie, top_score = recs[0]
    print(f"  Top recommendation: Movie {top_movie} (score: {top_score:.2f})")

**Key Insights**:
- User-based collaborative filtering finds users with similar taste
- The utility matrix is very sparse (most users haven't rated most movies)
- Cosine similarity measures angle between rating vectors, ignoring magnitude
- Higher similarity thresholds give fewer but more reliable recommendations

---

## Bonus Solution: K-Means Clustering on Starbucks Data

In [None]:
# Step 1: Load and filter Starbucks data
# Rows with any missing value are dropped right after loading.
data = pd.read_csv("data/starbucks_locations.csv", index_col=0).dropna()

# Filter to Middle East region for faster processing
# (between() is inclusive on both ends)
lat_in_range = data["Latitude"].between(24, 27)
lon_in_range = data["Longitude"].between(49, 56)
filtered = data[lat_in_range & lon_in_range]

print(f"Total locations: {len(data)}")
print(f"Filtered locations: {len(filtered)}")

In [None]:
# Step 2: Try different values of K
# Cluster on the two coordinate columns only, in a fixed order. Fitting on
# the whole frame would pull in any other column the CSV has, and the
# centroid plot below relies on column 0 = Longitude, column 1 = Latitude.
coords = filtered[['Longitude', 'Latitude']]

k_values = [3, 5, 10]
fig, axes = plt.subplots(1, len(k_values), figsize=(15, 5))

for ax, k in zip(axes, k_values):
    # Fixed random_state so the panels are reproducible across runs
    kmeans = KMeans(n_clusters=k, max_iter=500, random_state=42)
    kmeans.fit(coords)

    # Stores coloured by cluster label
    ax.scatter(coords['Longitude'], coords['Latitude'],
               c=kmeans.labels_, cmap='tab10', s=50, alpha=0.7)
    # Centroids: columns follow the order of `coords`
    ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
               c='red', marker='X', s=200, label='Centroids')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title(f'K-Means (K={k})')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Step 3: Elbow method to find optimal K
# Fit on the coordinate columns only, so inertia measures geographic spread
# and is not affected by any other column present in `filtered`.
coords = filtered[['Longitude', 'Latitude']]

k_range = range(2, 15)
inertias = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, max_iter=500, random_state=42)
    kmeans.fit(coords)
    # inertia_ = within-cluster sum of squared distances for this K
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_range, inertias, 'bo-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (Within-cluster sum of squares)')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.show()

print("\nDiscussion: How to choose optimal K?")
print("="*50)
print("1. Elbow Method: Look for the 'elbow' where adding more")
print("   clusters doesn't significantly reduce inertia.")
print("2. Silhouette Score: Measures how similar points are to")
print("   their own cluster vs other clusters.")
print("3. Domain Knowledge: How many meaningful groups exist?")
print("4. Business Requirements: What's practical to manage?")

---

## Summary

### Exercise 1: Sentiment Analysis
- VADER is effective for social media text
- Compound score ranges from -1 (negative) to +1 (positive)
- Standard thresholds: >= 0.05 positive, <= -0.05 negative, anything in between neutral

### Exercise 2: Jaccard Similarity
- Measures overlap between sets
- J(A,B) = |A intersection B| / |A union B|
- Range: 0 (no overlap) to 1 (identical)

### Exercise 3: Association Rules
- Support: How often does the itemset appear?
- Confidence: How often is the rule correct?
- Lift > 1 means positive association

### Exercise 4: A-Priori Algorithm
- Efficiently finds frequent itemsets
- Uses anti-monotone property for pruning
- minSup controls the threshold

### Exercise 5: Collaborative Filtering
- Finds similar users based on rating patterns
- Recommends items that similar users liked
- Cosine similarity is common for rating vectors