# 📊 Social Media ROI Attribution - Data Generation

This notebook generates synthetic, unbiased data for the Social Media ROI Attribution & Influencer Performance Analyzer project.

**Datasets Generated:**
- 1,500 Influencers
- 50,000 Posts
- 25 Brands
- 30,000 Conversions
- 100,000 Touchpoints

## 1. Setup & Dependencies

In [None]:
# Install required packages (-q keeps pip's output quiet)
# NOTE(review): scipy and faker are installed here but never imported in the
# cells shown in this notebook -- confirm they are still needed.
!pip install pandas numpy scipy faker matplotlib seaborn -q

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from uuid import uuid4
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category for cleaner notebook output
# (this also hides deprecation notices from the plotting libraries).
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# NOTE: this seeds NumPy's global generator only. Identifiers created with
# uuid4() are not controlled by this seed and will differ between runs.
np.random.seed(42)

# Set style (matplotlib style sheet + seaborn default colour palette)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("‚úÖ Libraries loaded successfully!")

## 2. Configuration & Industry Benchmarks

In [None]:
# ============================================
# CONFIGURATION - Industry Benchmarks
# Sources: Sprout Social, Later, HubSpot, Influencer Marketing Hub
# ============================================

# Number of rows to generate for each table
N_INFLUENCERS = 1_500
N_POSTS = 50_000
N_BRANDS = 25
N_CONVERSIONS = 30_000
N_TOUCHPOINTS = 100_000

# Simulation window, as YYYY-MM-DD strings
DATE_START = "2024-02-01"
DATE_END = "2025-01-31"

# Share of influencers per platform (fashion industry)
PLATFORM_DIST = {
    "Instagram": 0.45,
    "TikTok": 0.35,
    "YouTube": 0.12,
    "Twitter": 0.08,
}

# Share of influencers per tier (industry standard)
TIER_DIST = {
    "nano": 0.40,
    "micro": 0.35,
    "mid": 0.15,
    "macro": 0.07,
    "mega": 0.03,
}

# (min, max) follower count for each tier
TIER_FOLLOWERS = {
    "nano": (1_000, 10_000),
    "micro": (10_000, 100_000),
    "mid": (100_000, 500_000),
    "macro": (500_000, 1_000_000),
    "mega": (1_000_000, 10_000_000),
}

# (mean, std) engagement rate in percent for each tier;
# larger tiers get lower rates
TIER_ENGAGEMENT = {
    "nano": (6.0, 1.5),
    "micro": (3.5, 1.0),
    "mid": (2.2, 0.6),
    "macro": (1.5, 0.4),
    "mega": (1.0, 0.3),
}

# (mean, std) audience authenticity score for each tier
TIER_AUTHENTICITY = {
    "nano": (0.92, 0.05),
    "micro": (0.88, 0.06),
    "mid": (0.82, 0.08),
    "macro": (0.75, 0.10),
    "mega": (0.70, 0.12),
}

# (min, max) cost per post in USD for each tier
TIER_COST = {
    "nano": (50, 150),
    "micro": (150, 1_000),
    "mid": (1_000, 5_000),
    "macro": (5_000, 15_000),
    "mega": (15_000, 100_000),
}

# Influencer gender shares (balanced)
GENDER_DIST = {
    "Female": 0.48,
    "Male": 0.45,
    "Non-binary": 0.05,
    "Unknown": 0.02,
}

# Influencer country shares (chosen to avoid US-centric bias)
COUNTRY_DIST = {
    "United States": 0.30,
    "United Kingdom": 0.12,
    "Germany": 0.08,
    "France": 0.07,
    "Italy": 0.05,
    "Spain": 0.05,
    "Australia": 0.05,
    "Canada": 0.05,
    "Japan": 0.04,
    "South Korea": 0.04,
    "Brazil": 0.04,
    "India": 0.04,
    "Mexico": 0.03,
    "Netherlands": 0.02,
    "Sweden": 0.02,
}

# Influencer age-group shares
AGE_DIST = {
    "18-24": 0.35,
    "25-34": 0.40,
    "35-44": 0.18,
    "45+": 0.07,
}

# Content categories an influencer can be assigned
CONTENT_CATEGORIES = [
    "Luxury Fashion",
    "Streetwear",
    "Sustainable Fashion",
    "Fast Fashion",
    "Accessories",
    "Footwear",
    "Activewear",
    "Vintage/Thrift",
]

# Share of posts per visual style
VISUAL_STYLES = {
    "lifestyle": 0.35,
    "product_shot": 0.30,
    "behind_scenes": 0.15,
    "user_generated": 0.12,
    "editorial": 0.08,
}

# Share of brands per brand tier
BRAND_TIERS = {
    "Luxury": 0.20,
    "Premium": 0.25,
    "Mid-market": 0.30,
    "Fast-fashion": 0.15,
    "DTC": 0.10,
}

# (low, high) average order value for each brand tier
BRAND_AOV = {
    "Luxury": (500, 2_000),
    "Premium": (150, 500),
    "Mid-market": (50, 150),
    "Fast-fashion": (25, 75),
    "DTC": (75, 200),
}

# Month number -> engagement multiplier (fashion: Q4 peak, summer dip)
SEASONALITY = {
    1: 0.85,
    2: 0.90,
    3: 0.95,
    4: 1.00,
    5: 0.95,
    6: 0.85,
    7: 0.80,
    8: 0.90,
    9: 1.05,
    10: 1.10,
    11: 1.20,
    12: 1.25,
}

# Dominant colour labels a post can be assigned
COLORS = [
    "neutral_beige",
    "cream_white",
    "classic_black",
    "navy_blue",
    "olive_green",
    "terracotta",
    "dusty_rose",
    "burgundy",
    "camel_brown",
    "sage_green",
]

print("‚úÖ Configuration loaded!")

## 3. Helper Functions

In [None]:
def sample_dist(dist, n=1):
    """Draw ``n`` labels from a ``{label: probability}`` mapping.

    Returns a NumPy array of length ``n`` (also when ``n`` is 1), sampled
    with the mapping's values as probabilities.
    """
    categories = list(dist)
    weights = [dist[category] for category in categories]
    return np.random.choice(categories, size=n, p=weights)

def gen_followers(tier):
    """Draw a follower count for ``tier``.

    The bounds come from ``TIER_FOLLOWERS[tier]``. The draw is uniform in
    log space between them (log-uniform), then truncated to ``int``.
    """
    lower, upper = TIER_FOLLOWERS[tier]
    log_draw = np.random.uniform(np.log(lower), np.log(upper))
    return int(np.exp(log_draw))

def gen_engagement(tier):
    """Draw an engagement rate for ``tier``.

    Samples a normal with the (mean, std) from ``TIER_ENGAGEMENT[tier]`` and
    clips the result to the range [0.5, 12.0].
    """
    mu, sigma = TIER_ENGAGEMENT[tier]
    raw_rate = np.random.normal(mu, sigma)
    return np.clip(raw_rate, 0.5, 12.0)

def gen_authenticity(tier):
    """Draw an audience authenticity score for ``tier``.

    Samples a normal with the (mean, std) from ``TIER_AUTHENTICITY[tier]`` and
    clips the result to the range [0.4, 0.99].
    """
    mu, sigma = TIER_AUTHENTICITY[tier]
    raw_score = np.random.normal(mu, sigma)
    return np.clip(raw_score, 0.4, 0.99)

def gen_cost(tier, followers):
    """Return a cost per post for an influencer of ``tier``.

    The cost is interpolated linearly across the tier's ``TIER_COST`` range
    according to where ``followers`` sits inside the tier's
    ``TIER_FOLLOWERS`` range, multiplied by a random factor in [0.8, 1.2),
    and rounded to 2 decimals.
    """
    min_cost, max_cost = TIER_COST[tier]
    min_followers, max_followers = TIER_FOLLOWERS[tier]

    # 0.0 at the bottom of the tier's follower range, 1.0 at the top
    fraction = (followers - min_followers) / (max_followers - min_followers)
    interpolated = min_cost + fraction * (max_cost - min_cost)

    jitter = np.random.uniform(0.8, 1.2)
    return round(interpolated * jitter, 2)

def gen_engagement_metrics(followers, eng_rate, is_viral=False):
    """Return ``(likes, comments, shares, saves)`` for a single post.

    ``eng_rate`` is a percentage of ``followers``. The total interaction
    count is scaled by random noise in [0.7, 1.3), and by a further 3-10x
    when ``is_viral`` is true. Likes are a share of that total; comments,
    shares and saves are each a small share of likes. Likes are at least 1.
    """
    draw = np.random.uniform

    multiplier = draw(0.7, 1.3)
    if is_viral:
        multiplier = multiplier * draw(3, 10)

    interactions = int(followers * (eng_rate / 100) * multiplier)
    n_likes = int(interactions * draw(0.85, 0.92))
    n_comments = int(n_likes * draw(0.03, 0.08))
    n_shares = int(n_likes * draw(0.01, 0.025))
    n_saves = int(n_likes * draw(0.02, 0.05))

    return (
        max(1, n_likes),
        max(0, n_comments),
        max(0, n_shares),
        max(0, n_saves),
    )

def gen_order_value(brand_tier):
    """Draw an order value for ``brand_tier``.

    Samples a log-normal centred on the log-midpoint of the tier's
    ``BRAND_AOV`` range, with a sigma of one quarter of the log-range. The
    value is clipped to [0.5 * low, 1.5 * high] and rounded to 2 decimals.
    """
    aov_low, aov_high = BRAND_AOV[brand_tier]
    log_aov_low = np.log(aov_low)
    log_aov_high = np.log(aov_high)

    mu = (log_aov_low + log_aov_high) / 2
    sigma = (log_aov_high - log_aov_low) / 4
    amount = np.exp(np.random.normal(mu, sigma))

    bounded = np.clip(amount, aov_low * 0.5, aov_high * 1.5)
    return round(bounded, 2)

print("‚úÖ Helper functions defined!")

## 4. Generate Datasets

### 4.1 Generate Brands

In [None]:
print("üè¢ Generating Brands...")

brand_prefixes = ["Maison", "Atelier", "Casa", "Studio", "House of", "La", "Le", "The", "Modern", "Luxe"]
brand_suffixes = ["Mode", "Style", "Vogue", "Chic", "Edit", "Label", "Collective", "Co", "Design", "Wear"]

brands = []
tiers = sample_dist(BRAND_TIERS, N_BRANDS)

for i in range(N_BRANDS):
    tier = tiers[i]
    budget_ranges = {
        "Luxury": (200000, 500000), "Premium": (100000, 250000), 
        "Mid-market": (50000, 150000), "Fast-fashion": (75000, 200000), "DTC": (25000, 100000)
    }
    low, high = budget_ranges[tier]
    
    brands.append({
        "brand_id": str(uuid4()),
        "brand_name": f"{np.random.choice(brand_prefixes)} {np.random.choice(brand_suffixes)}",
        "brand_tier": tier,
        "monthly_social_budget": round(np.random.uniform(low, high), 2),
        "primary_platform": sample_dist(PLATFORM_DIST)[0],
        "avg_product_price": gen_order_value(tier),
        "target_demographic": np.random.choice(["18-24", "25-34", "35-44", "25-44"]),
        "founded_year": np.random.randint(1990, 2022)
    })

brands_df = pd.DataFrame(brands)
print(f"‚úÖ Generated {len(brands_df)} brands")
brands_df.head()

### 4.2 Generate Influencers

In [None]:
print("üë§ Generating Influencers...")

influencers = []
tiers = sample_dist(TIER_DIST, N_INFLUENCERS)
platforms = sample_dist(PLATFORM_DIST, N_INFLUENCERS)
countries = sample_dist(COUNTRY_DIST, N_INFLUENCERS)
genders = sample_dist(GENDER_DIST, N_INFLUENCERS)
ages = sample_dist(AGE_DIST, N_INFLUENCERS)

for i in range(N_INFLUENCERS):
    tier = tiers[i]
    followers = gen_followers(tier)
    
    influencers.append({
        "influencer_id": str(uuid4()),
        "username": f"creator_{i+1:05d}",
        "platform": platforms[i],
        "tier": tier,
        "follower_count": followers,
        "engagement_rate": round(gen_engagement(tier), 2),
        "country": countries[i],
        "content_category": np.random.choice(CONTENT_CATEGORIES),
        "avg_post_frequency": round(np.clip(np.random.normal(4.2, 1.5), 1, 10), 1),
        "audience_authenticity_score": round(gen_authenticity(tier), 2),
        "avg_collaboration_cost": gen_cost(tier, followers),
        "account_age_months": np.random.randint(12, 96),
        "gender": genders[i],
        "age_group": ages[i],
        "verified": np.random.random() < (0.1 if tier in ["nano", "micro"] else 0.5),
        "active": np.random.random() < 0.95
    })

influencers_df = pd.DataFrame(influencers)
print(f"‚úÖ Generated {len(influencers_df)} influencers")
influencers_df.head()

### 4.3 Generate Posts

In [None]:
print("üì± Generating Posts...")

# Date range
start_date = datetime.strptime(DATE_START, "%Y-%m-%d")
end_date = datetime.strptime(DATE_END, "%Y-%m-%d")
date_range = (end_date - start_date).days

# Lookups
inf_ids = influencers_df["influencer_id"].tolist()
brand_ids = brands_df["brand_id"].tolist()
inf_lookup = influencers_df.set_index("influencer_id").to_dict("index")

# Content types by platform
CONTENT_TYPES = {
    "Instagram": {"photo": 0.35, "carousel": 0.25, "reel": 0.30, "story": 0.10},
    "TikTok": {"video": 0.95, "photo": 0.05},
    "YouTube": {"video": 0.85, "shorts": 0.15},
    "Twitter": {"photo": 0.50, "video": 0.25, "text": 0.25}
}

posts = []
for i in range(N_POSTS):
    inf_id = np.random.choice(inf_ids)
    inf = inf_lookup[inf_id]
    platform = inf["platform"]
    
    # Date with seasonality
    random_days = np.random.randint(0, date_range)
    post_date = start_date + timedelta(days=random_days)
    month = post_date.month
    
    # Content type
    content_types = CONTENT_TYPES.get(platform, {"photo": 1.0})
    content_type = sample_dist(content_types)[0]
    
    # Sponsored?
    is_sponsored = np.random.random() < (0.25 if inf["tier"] in ["mid", "macro", "mega"] else 0.10)
    
    # Engagement with seasonality
    is_viral = np.random.random() < 0.05
    eng_rate = inf["engagement_rate"] * SEASONALITY[month]
    likes, comments, shares, saves = gen_engagement_metrics(inf["follower_count"], eng_rate, is_viral)
    
    # Reach & impressions
    reach = int(inf["follower_count"] * np.random.uniform(0.20, 0.40))
    impressions = int(reach * np.random.uniform(1.2, 1.8))
    
    posts.append({
        "post_id": str(uuid4()),
        "influencer_id": inf_id,
        "brand_id": np.random.choice(brand_ids) if is_sponsored else None,
        "platform": platform,
        "post_date": post_date.strftime("%Y-%m-%d"),
        "post_time_hour": np.random.choice(range(6, 24), p=[0.02, 0.03, 0.05, 0.07, 0.08, 0.10, 0.12, 0.10, 0.07, 0.06, 0.05, 0.04, 0.06, 0.08, 0.10, 0.08, 0.05, 0.03]),
        "day_of_week": np.random.choice(range(7), p=[0.12, 0.16, 0.17, 0.16, 0.14, 0.13, 0.12]),
        "content_type": content_type,
        "caption_length": int(np.clip(np.random.normal(180, 80), 20, 500)),
        "hashtag_count": int(np.clip(np.random.normal(8 if platform == "Instagram" else 4, 3), 1, 30)),
        "has_cta": np.random.random() < 0.45,
        "product_count": np.random.poisson(2) if is_sponsored else 0,
        "visual_style": sample_dist(VISUAL_STYLES)[0],
        "dominant_color": np.random.choice(COLORS),
        "is_sponsored": is_sponsored,
        "discount_code_present": is_sponsored and np.random.random() < 0.30,
        "likes": likes,
        "comments": comments,
        "shares": shares,
        "saves": saves,
        "reach": reach,
        "impressions": impressions
    })
    
    if (i + 1) % 10000 == 0:
        print(f"   ... {i+1:,} posts generated")

posts_df = pd.DataFrame(posts)
print(f"‚úÖ Generated {len(posts_df)} posts")
posts_df.head()

### 4.4 Generate Conversions

In [None]:
print("üõí Generating Conversions...")

sponsored_posts = posts_df[posts_df["is_sponsored"] == True].copy()
brand_lookup = brands_df.set_index("brand_id").to_dict("index")
product_categories = ["Clothing", "Accessories", "Footwear", "Bags", "Jewelry"]

conversions = []
for i in range(N_CONVERSIONS):
    has_attribution = np.random.random() < 0.65
    
    if has_attribution and len(sponsored_posts) > 0:
        post = sponsored_posts.sample(1).iloc[0]
        post_id = post["post_id"]
        influencer_id = post["influencer_id"]
        brand_id = post["brand_id"]
        post_date = datetime.strptime(post["post_date"], "%Y-%m-%d")
        journey_length = int(np.clip(np.random.exponential(7), 1, 90))
        conversion_date = min(post_date + timedelta(days=journey_length), end_date)
    else:
        post_id = None
        influencer_id = None
        brand_id = np.random.choice(brand_ids)
        random_days = np.random.randint(0, date_range)
        conversion_date = start_date + timedelta(days=random_days)
        journey_length = int(np.clip(np.random.exponential(7), 1, 90))
    
    brand_tier = brand_lookup[brand_id]["brand_tier"] if brand_id else "Mid-market"
    
    conversions.append({
        "conversion_id": str(uuid4()),
        "customer_id": str(uuid4()),
        "post_id": post_id,
        "influencer_id": influencer_id,
        "brand_id": brand_id,
        "conversion_date": conversion_date.strftime("%Y-%m-%d"),
        "attribution_type": np.random.choice(["first_touch", "last_touch", "linear", "time_decay", "position_based"], p=[0.15, 0.25, 0.20, 0.25, 0.15]),
        "utm_source": np.random.choice(["instagram", "tiktok", "youtube", "twitter", "direct", "organic"]),
        "utm_medium": np.random.choice(["social", "influencer", "organic", "paid"]),
        "order_value": gen_order_value(brand_tier),
        "product_category": np.random.choice(product_categories),
        "discount_code_used": post_id is not None and np.random.random() < 0.40,
        "customer_journey_length": journey_length,
        "touchpoints_count": int(np.clip(np.random.geometric(0.3), 1, 15))
    })
    
    if (i + 1) % 10000 == 0:
        print(f"   ... {i+1:,} conversions generated")

conversions_df = pd.DataFrame(conversions)
print(f"‚úÖ Generated {len(conversions_df)} conversions")
conversions_df.head()

### 4.5 Generate Touchpoints

In [None]:
print("üîó Generating Touchpoints...")

conversions_with_posts = conversions_df[conversions_df["post_id"].notna()].copy()
touchpoint_types = ["view", "click", "save", "like", "comment", "website_visit", "add_to_cart"]
post_ids = posts_df["post_id"].tolist()

touchpoints = []
for i in range(N_TOUCHPOINTS):
    leads_to_conversion = np.random.random() < 0.30
    
    if leads_to_conversion and len(conversions_with_posts) > 0:
        conv = conversions_with_posts.sample(1).iloc[0]
        conversion_id = conv["conversion_id"]
        customer_id = conv["customer_id"]
        post_id = conv["post_id"]
        conv_date = datetime.strptime(conv["conversion_date"], "%Y-%m-%d")
        days_before = np.random.randint(0, max(1, conv["customer_journey_length"]))
        touchpoint_date = conv_date - timedelta(days=days_before)
    else:
        conversion_id = None
        customer_id = str(uuid4())
        post_id = np.random.choice(post_ids) if np.random.random() < 0.7 else None
        random_days = np.random.randint(0, date_range)
        touchpoint_date = start_date + timedelta(days=random_days)
    
    platform = np.random.choice(["Instagram", "TikTok", "YouTube", "Twitter", "Website"])
    
    touchpoints.append({
        "touchpoint_id": str(uuid4()),
        "customer_id": customer_id,
        "post_id": post_id,
        "touchpoint_type": np.random.choice(touchpoint_types, p=[0.35, 0.20, 0.10, 0.15, 0.05, 0.10, 0.05]),
        "touchpoint_date": touchpoint_date.strftime("%Y-%m-%d"),
        "platform": platform,
        "contributed_to_conversion": leads_to_conversion,
        "conversion_id": conversion_id,
        "attribution_weight": round(np.random.uniform(0.05, 0.40), 3) if leads_to_conversion else 0.0
    })
    
    if (i + 1) % 25000 == 0:
        print(f"   ... {i+1:,} touchpoints generated")

touchpoints_df = pd.DataFrame(touchpoints)
print(f"‚úÖ Generated {len(touchpoints_df)} touchpoints")
touchpoints_df.head()

## 5. Save Datasets

In [None]:
# Make sure the output folder exists (missing parents are created too)
data_dir = Path("../data/raw")
data_dir.mkdir(parents=True, exist_ok=True)

# File name -> DataFrame, in the order they are written and reported
datasets = {
    "brands.csv": brands_df,
    "influencers.csv": influencers_df,
    "posts.csv": posts_df,
    "conversions.csv": conversions_df,
    "touchpoints.csv": touchpoints_df,
}

# Write every table as CSV without the index column
for filename, frame in datasets.items():
    frame.to_csv(data_dir / filename, index=False)

print("üíæ Datasets saved to ../data/raw/")
print(f"\nüìä Dataset Summary:")
for filename, frame in datasets.items():
    print(f"   - {filename}: {len(frame):,} records")

## 6. Data Validation & Bias Checks

In [None]:
def _print_share_check(observed_shares, expected_dist):
    """Print one line per observed category comparing its share to the target.

    A category passes when its observed share is within 5 percentage points
    of the configured share (0 when the category is not configured).
    """
    for label, share in observed_shares.items():
        target = expected_dist.get(label, 0)
        flag = "‚úÖ" if abs(share - target) < 0.05 else "‚ö†Ô∏è"
        print(f"   {flag} {label}: {share:.1%} (expected: {target:.1%})")


print("‚öñÔ∏è BIAS ANALYSIS")
print("=" * 50)

# Gender: observed shares vs. GENDER_DIST
print("\nüîπ Gender Distribution:")
gender_dist = influencers_df["gender"].value_counts(normalize=True)
_print_share_check(gender_dist, GENDER_DIST)

# Geography: the US share must stay below 35%
print("\nüîπ Geographic Distribution:")
country_dist = influencers_df["country"].value_counts(normalize=True)
us_share = country_dist.get("United States", 0)
if us_share < 0.35:
    status = "‚úÖ"
else:
    status = "‚ö†Ô∏è"
print(f"   {status} US representation: {us_share:.1%} (target: <35%)")
print(f"   Top 5: {dict(country_dist.head(5))}")

# Tier: observed shares vs. TIER_DIST
print("\nüîπ Influencer Tier Distribution:")
tier_dist = influencers_df["tier"].value_counts(normalize=True)
_print_share_check(tier_dist, TIER_DIST)

In [None]:
print("\nüîó CORRELATION VALIDATION")
print("=" * 50)

# Followers vs Engagement (should be NEGATIVE)
corr = influencers_df["follower_count"].corr(influencers_df["engagement_rate"])
status = "‚úÖ" if corr < 0 else "‚ùå"
print(f"\n   {status} Followers ‚Üî Engagement: {corr:.3f} (expected: negative)")

# Followers vs Cost (should be POSITIVE)
corr = influencers_df["follower_count"].corr(influencers_df["avg_collaboration_cost"])
status = "‚úÖ" if corr > 0 else "‚ùå"
print(f"   {status} Followers ‚Üî Cost: {corr:.3f} (expected: positive)")

# Likes vs Comments (should be POSITIVE)
corr = posts_df["likes"].corr(posts_df["comments"])
status = "‚úÖ" if corr > 0.5 else "‚ö†Ô∏è"
print(f"   {status} Likes ‚Üî Comments: {corr:.3f} (expected: positive)")

## 7. Quick Visualizations

In [None]:
# Build a 2x3 grid of validation charts, save it as a PNG and display it.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# 1. Influencer Tier Distribution
# reindex() forces the bars into nano -> mega order instead of count order.
tier_order = ["nano", "micro", "mid", "macro", "mega"]
tier_counts = influencers_df["tier"].value_counts().reindex(tier_order)
axes[0, 0].bar(tier_counts.index, tier_counts.values, color=sns.color_palette("husl", 5))
axes[0, 0].set_title("Influencer Tier Distribution", fontweight="bold")
axes[0, 0].set_ylabel("Count")

# 2. Engagement Rate by Tier
sns.boxplot(data=influencers_df, x="tier", y="engagement_rate", order=tier_order, ax=axes[0, 1], palette="husl")
axes[0, 1].set_title("Engagement Rate by Tier", fontweight="bold")
axes[0, 1].set_ylabel("Engagement Rate (%)")

# 3. Platform Distribution
platform_counts = influencers_df["platform"].value_counts()
axes[0, 2].pie(platform_counts.values, labels=platform_counts.index, autopct="%1.1f%%", colors=sns.color_palette("husl", 4))
axes[0, 2].set_title("Platform Distribution", fontweight="bold")

# 4. Posts Over Time
# Side effect: adds a "post_month" column to posts_df.
# NOTE(review): post dates are drawn uniformly over the date range, so this
# line is not expected to show the seasonality its title mentions -- confirm.
posts_df["post_month"] = pd.to_datetime(posts_df["post_date"]).dt.to_period("M")
monthly_posts = posts_df.groupby("post_month").size()
axes[1, 0].plot(monthly_posts.index.astype(str), monthly_posts.values, marker="o", linewidth=2)
axes[1, 0].set_title("Posts Over Time (Seasonality)", fontweight="bold")
axes[1, 0].tick_params(axis="x", rotation=45)
axes[1, 0].set_ylabel("Number of Posts")

# 5. Conversion Order Values
sns.histplot(conversions_df["order_value"], bins=50, ax=axes[1, 1], color="coral")
axes[1, 1].set_title("Order Value Distribution", fontweight="bold")
axes[1, 1].set_xlabel("Order Value (USD)")

# 6. Followers vs Engagement Scatterplot
# Plots a random sample of 500 influencers, coloured by tier index (0..4);
# the x axis is logarithmic.
# NOTE(review): sample(500) presumably fails if influencers_df has fewer than
# 500 rows -- confirm before lowering N_INFLUENCERS.
sample = influencers_df.sample(500)
axes[1, 2].scatter(sample["follower_count"], sample["engagement_rate"], alpha=0.5, c=sample["tier"].map({"nano": 0, "micro": 1, "mid": 2, "macro": 3, "mega": 4}))
axes[1, 2].set_title("Followers vs Engagement (Inverse Correlation)", fontweight="bold")
axes[1, 2].set_xlabel("Follower Count")
axes[1, 2].set_ylabel("Engagement Rate (%)")
axes[1, 2].set_xscale("log")

# Write the figure to ../data before displaying it.
plt.tight_layout()
plt.savefig("../data/data_validation_charts.png", dpi=150, bbox_inches="tight")
plt.show()

print("\n‚úÖ Validation charts saved to ../data/data_validation_charts.png")

## ✅ Data Generation Complete!

**Generated Datasets:**
- `brands.csv` - 25 fashion brands
- `influencers.csv` - 1,500 influencers
- `posts.csv` - 50,000 social media posts
- `conversions.csv` - 30,000 e-commerce conversions
- `touchpoints.csv` - 100,000 customer journey touchpoints

**Next Steps:**
1. Run `02_eda.ipynb` for Exploratory Data Analysis
2. Run `03_attribution_modeling.ipynb` for multi-touch attribution
3. Run `04_influencer_scoring.ipynb` for influencer effectiveness model