# 🎯 Multi-Touch Attribution Modeling

## Social Media ROI Attribution & Influencer Performance Analyzer

This notebook builds attribution models to understand which touchpoints drive conversions:
- First-Touch Attribution
- Last-Touch Attribution
- Linear Attribution
- Time-Decay Attribution
- Position-Based Attribution
- Markov Chain (Data-Driven) Attribution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import defaultdict
import warnings
# Blanket-suppress Python warnings so library notices do not clutter cell output
warnings.filterwarnings('ignore')

# Shared plotting defaults for every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úÖ Libraries loaded!")

In [None]:
# Location of the raw CSV exports, relative to this notebook
data_dir = Path("../data/raw")

# Conversions and touchpoints have their date columns converted to datetimes
# right after loading, so journeys can later be ordered chronologically.
conversions = pd.read_csv(data_dir / "conversions.csv").assign(
    conversion_date=lambda frame: pd.to_datetime(frame['conversion_date'])
)
touchpoints = pd.read_csv(data_dir / "touchpoints.csv").assign(
    touchpoint_date=lambda frame: pd.to_datetime(frame['touchpoint_date'])
)

# Loaded as-is; no date parsing is applied to these two tables
posts = pd.read_csv(data_dir / "posts.csv")
influencers = pd.read_csv(data_dir / "influencers.csv")

print(f"üìä Loaded {len(conversions):,} conversions and {len(touchpoints):,} touchpoints")

---
## 1. Data Preparation - Build Customer Journeys

In [None]:
# Keep only the touchpoints flagged as having contributed to a conversion
converting_touchpoints = touchpoints.loc[
    touchpoints['contributed_to_conversion'].eq(True)
].copy()

# Attach the conversion's customer, order value and date to every touchpoint,
# then order the rows by customer and touchpoint time.
# NOTE(review): the sort key 'customer_id_conv' only exists when touchpoints
# also has a 'customer_id' column (the '_conv' suffix is applied to the
# right-hand duplicate) — confirm against the touchpoints schema.
journeys = (
    converting_touchpoints
    .merge(
        conversions[['conversion_id', 'customer_id', 'order_value', 'conversion_date']],
        on='conversion_id',
        suffixes=('', '_conv'),
    )
    .sort_values(['customer_id_conv', 'touchpoint_date'])
)

print(f"üìç {len(journeys):,} touchpoints in converting journeys")
print(f"üë• {journeys['conversion_id'].nunique():,} unique conversions with touchpoints")

journeys.head()

In [None]:
# Build journey paths
def build_journey_paths(df):
    """Collapse touchpoint rows into one ordered journey record per conversion.

    Args:
        df: DataFrame with the columns 'conversion_id', 'touchpoint_date',
            'platform', 'touchpoint_type' and 'order_value'.

    Returns:
        DataFrame with one row per conversion_id and the columns
        'conversion_id', 'path' (platforms ordered by touchpoint_date),
        'touchpoint_types' (same order), 'path_length' and 'order_value'
        (taken from the earliest touchpoint row of the conversion).
    """
    def _summarise(conv_id, touches):
        # Chronological order defines what counts as the first/last touch
        ordered = touches.sort_values('touchpoint_date')
        channels = list(ordered['platform'].values)
        return {
            'conversion_id': conv_id,
            'path': channels,
            'touchpoint_types': list(ordered['touchpoint_type'].values),
            'path_length': len(channels),
            'order_value': ordered['order_value'].iloc[0],
        }

    records = [_summarise(conv_id, touches) for conv_id, touches in df.groupby('conversion_id')]
    return pd.DataFrame(records)

# One row per conversion, holding its chronologically ordered channel path
journey_paths = build_journey_paths(journeys)

print(f"üìä Built {len(journey_paths):,} journey paths")
print(f"\nüìç Average path length: {journey_paths['path_length'].mean():.2f} touchpoints")

journey_paths.head()

---
## 2. Attribution Models

In [None]:
def first_touch_attribution(paths_df):
    """Credit each journey's full order value to the first channel in its path.

    Args:
        paths_df: DataFrame with a 'path' column (sequence of channels) and
            an 'order_value' column.

    Returns:
        dict mapping channel -> total attributed revenue. Journeys with an
        empty path contribute nothing.
    """
    revenue_by_channel = defaultdict(float)

    for journey in paths_df.itertuples(index=False):
        if len(journey.path) == 0:
            continue  # nothing to credit
        opener = journey.path[0]
        revenue_by_channel[opener] += journey.order_value

    return dict(revenue_by_channel)

def last_touch_attribution(paths_df):
    """Credit each journey's full order value to the final channel in its path.

    Args:
        paths_df: DataFrame with a 'path' column (sequence of channels) and
            an 'order_value' column.

    Returns:
        dict mapping channel -> total attributed revenue. Journeys with an
        empty path contribute nothing.
    """
    revenue_by_channel = defaultdict(float)

    for journey in paths_df.to_dict('records'):
        touches = journey['path']
        if len(touches) > 0:
            closer = touches[-1]
            revenue_by_channel[closer] += journey['order_value']

    return dict(revenue_by_channel)

def linear_attribution(paths_df):
    """Split each journey's order value evenly across its touchpoints.

    A channel that appears several times in one path receives one equal
    share per appearance.

    Args:
        paths_df: DataFrame with a 'path' column (sequence of channels) and
            an 'order_value' column.

    Returns:
        dict mapping channel -> total attributed revenue. Journeys with an
        empty path contribute nothing.
    """
    revenue_by_channel = defaultdict(float)

    for journey in paths_df.itertuples(index=False):
        touches = journey.path
        n_touches = len(touches)
        if n_touches == 0:
            continue
        equal_share = journey.order_value / n_touches
        for channel in touches:
            revenue_by_channel[channel] += equal_share

    return dict(revenue_by_channel)

def time_decay_attribution(paths_df, decay_rate=0.5):
    """Weight touchpoints geometrically by how close they sit to the end of the path.

    The decay is positional: the last touchpoint has weight 1 and each step
    further back multiplies the weight by ``decay_rate``. Timestamps are not
    consulted. Weights are normalised per journey so the whole order value
    is distributed.

    Args:
        paths_df: DataFrame with a 'path' column (sequence of channels) and
            an 'order_value' column.
        decay_rate: Multiplier applied per step away from the final touch.

    Returns:
        dict mapping channel -> total attributed revenue. Journeys with an
        empty path contribute nothing.
    """
    revenue_by_channel = defaultdict(float)

    for journey in paths_df.itertuples(index=False):
        touches = journey.path
        n_touches = len(touches)
        if n_touches == 0:
            continue

        # Exponent = number of steps between this touch and the final one
        raw_weights = [
            decay_rate ** steps_from_end
            for steps_from_end in range(n_touches - 1, -1, -1)
        ]
        weight_sum = sum(raw_weights)

        for channel, weight in zip(touches, raw_weights):
            revenue_by_channel[channel] += journey.order_value * (weight / weight_sum)

    return dict(revenue_by_channel)

def position_based_attribution(paths_df, first_weight=0.4, last_weight=0.4, middle_weight=0.2):
    """Position-based (U-shaped) attribution with configurable weights.

    For journeys with three or more touchpoints, the first touch receives
    ``first_weight`` of the order value, the last touch ``last_weight``, and
    the touches in between share ``middle_weight`` equally. With the default
    weights this is the 40% / 20% / 40% split.

    Shorter journeys have no middle section:
      * one touchpoint  -> it receives the full order value;
      * two touchpoints -> the order value is split between them in
        proportion to ``first_weight`` and ``last_weight`` (50/50 with the
        defaults).

    Args:
        paths_df: DataFrame with a 'path' column (sequence of channels) and
            an 'order_value' column.
        first_weight: Share of the order value given to the first touch.
        last_weight: Share of the order value given to the last touch.
        middle_weight: Share divided equally among the middle touches.

    Returns:
        dict mapping channel -> total attributed revenue. Journeys with an
        empty path contribute nothing.

    Raises:
        ValueError: If any weight is negative or the weights do not sum to 1.
    """
    weights = (first_weight, last_weight, middle_weight)
    if min(weights) < 0 or abs(sum(weights) - 1.0) > 1e-9:
        raise ValueError(
            "first_weight, last_weight and middle_weight must be non-negative and sum to 1"
        )

    # Share of a two-touch journey that goes to the first touch. Falls back to
    # an even split when both endpoint weights are zero.
    endpoint_weight = first_weight + last_weight
    first_share_of_pair = first_weight / endpoint_weight if endpoint_weight > 0 else 0.5

    attribution = defaultdict(float)

    for _, row in paths_df.iterrows():
        path = row['path']
        order_value = row['order_value']
        n_touches = len(path)

        if n_touches == 0:
            continue
        if n_touches == 1:
            attribution[path[0]] += order_value
        elif n_touches == 2:
            attribution[path[0]] += order_value * first_share_of_pair
            attribution[path[1]] += order_value * (1 - first_share_of_pair)
        else:
            attribution[path[0]] += order_value * first_weight
            attribution[path[-1]] += order_value * last_weight
            # n_touches >= 3 here, so there is always at least one middle touch
            middle_credit = order_value * middle_weight / (n_touches - 2)
            for channel in path[1:-1]:
                attribution[channel] += middle_credit

    return dict(attribution)

# Confirmation message, printed once the attribution functions above have been defined
print("‚úÖ Attribution functions defined!")

In [None]:
# Run the five rule-based models on the same set of journey paths
first_touch = first_touch_attribution(journey_paths)
last_touch = last_touch_attribution(journey_paths)
linear = linear_attribution(journey_paths)
time_decay = time_decay_attribution(journey_paths)
position_based = position_based_attribution(journey_paths)

# Channels to report on (union of the first-touch, last-touch and linear keys)
all_channels = set(first_touch).union(last_touch, linear)

# Freeze one channel ordering so every column lines up with the index
channel_order = list(all_channels)
credit_by_model = {
    'First Touch': first_touch,
    'Last Touch': last_touch,
    'Linear': linear,
    'Time Decay': time_decay,
    'Position Based': position_based,
}

# One row per channel, one column per model; a channel a model never credited gets 0
attribution_df = (
    pd.DataFrame(
        {
            model_name: [credit.get(channel, 0) for channel in channel_order]
            for model_name, credit in credit_by_model.items()
        },
        index=pd.Index(channel_order, name='Channel'),
    )
    .round(2)
    .sort_values('Linear', ascending=False)
)

print("üìä Attribution by Channel (Revenue $)")
print(attribution_df)

In [None]:
# Grouped bar chart: one cluster per channel, one bar per attribution model
models = ['First Touch', 'Last Touch', 'Linear', 'Time Decay', 'Position Based']
colors = sns.color_palette('husl', len(models))

width = 0.15
x = np.arange(len(attribution_df.index))

fig, ax = plt.subplots(figsize=(14, 8))

# Each model's bars are shifted right by one bar width relative to the previous model
for slot, (model, bar_color) in enumerate(zip(models, colors)):
    ax.bar(x + slot * width, attribution_df[model], width, label=model, color=bar_color)

ax.set_title('Channel Attribution Comparison Across Models', fontweight='bold', fontsize=16)
ax.set_ylabel('Attributed Revenue ($)', fontsize=12)

# Two bar widths to the right of the first bar is the middle of a five-bar cluster
ax.set_xticks(x + width * 2)
ax.set_xticklabels(attribution_df.index, rotation=45, ha='right')

ax.grid(axis='y', alpha=0.3)
ax.legend(title='Attribution Model')

plt.tight_layout()
plt.savefig('../data/attribution_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 3. Markov Chain Attribution (Data-Driven)

In [None]:
def build_transition_matrix(paths_df):
    """Estimate first-order transition probabilities between journey states.

    Every path is wrapped as ``Start -> ...channels... -> Conversion`` and
    each consecutive pair of states is counted as one transition.

    Args:
        paths_df: DataFrame with a 'path' column holding lists of channels.

    Returns:
        Nested dict ``{from_state: {to_state: probability}}`` in which the
        probabilities leaving each from_state sum to 1.
    """
    step_counts = defaultdict(lambda: defaultdict(int))

    for journey in paths_df.itertuples(index=False):
        states = ['Start'] + journey.path + ['Conversion']
        # Pair each state with its successor
        for origin, destination in zip(states, states[1:]):
            step_counts[origin][destination] += 1

    # Turn the raw counts leaving each state into probabilities
    probabilities = {}
    for origin, outgoing in step_counts.items():
        n_exits = sum(outgoing.values())
        probabilities[origin] = {
            destination: hits / n_exits for destination, hits in outgoing.items()
        }

    return probabilities

def calculate_removal_effect(paths_df, channel_to_remove):
    """Return the share of journeys that do not contain ``channel_to_remove``.

    Args:
        paths_df: DataFrame with a 'path' column holding sequences of channels.
        channel_to_remove: Channel whose journeys are excluded.

    Returns:
        Fraction of rows in ``paths_df`` whose path does not include the
        channel, or 0 when ``paths_df`` has no rows.
    """
    touches_channel = paths_df['path'].apply(
        lambda touches: channel_to_remove in touches
    )
    survivors = paths_df[~touches_channel]

    n_journeys = len(paths_df)
    if n_journeys > 0:
        return len(survivors) / n_journeys
    return 0

def markov_attribution(paths_df):
    """Attribute revenue to channels in proportion to a path-level removal effect.

    A channel's removal effect is the share of journeys that contain it, i.e.
    the share that would be lost if every journey touching that channel were
    dropped. The effects are normalised to sum to 1 and multiplied by the
    total order value.

    NOTE(review): despite the name, no transition probabilities are used here
    and ``build_transition_matrix`` is not called. Confirm whether a removal
    effect derived from the transition matrix was intended.

    Args:
        paths_df: DataFrame with a 'path' column (sequence of channels) and
            an 'order_value' column.

    Returns:
        dict mapping channel -> attributed revenue. Empty when there are no
        journeys or no channels.
    """
    total_journeys = len(paths_df)
    if total_journeys == 0:
        return {}

    # Single pass: count, for every channel, how many journeys contain it.
    # dict.fromkeys de-duplicates repeats within a path and keeps a
    # deterministic first-seen order.
    journeys_with_channel = defaultdict(int)
    for path in paths_df['path']:
        for channel in dict.fromkeys(path):
            journeys_with_channel[channel] += 1

    # Removal effect = 1 - (share of journeys that survive without the channel)
    removal_effects = {
        channel: 1 - (total_journeys - hits) / total_journeys
        for channel, hits in journeys_with_channel.items()
    }

    # Normalise so the attributed shares sum to 1
    total_effect = sum(removal_effects.values())
    if total_effect > 0:
        shares = {channel: effect / total_effect for channel, effect in removal_effects.items()}
    else:
        shares = removal_effects

    total_revenue = paths_df['order_value'].sum()
    return {channel: share * total_revenue for channel, share in shares.items()}

# Add the removal-effect results as a further model column. Values are aligned
# on the channel index, and any gap left after alignment is filled with 0.
markov = markov_attribution(journey_paths)
attribution_df = (
    attribution_df
    .assign(**{'Markov (Data-Driven)': pd.Series(markov)})
    .fillna(0)
    .round(2)
)

print("üìä Attribution by Channel (Including Markov)")
print(attribution_df)

---
## 4. Attribution Insights

In [None]:
# Calculate percentage shares
attribution_pct = attribution_df.div(attribution_df.sum()) * 100

print("üìä Attribution Share (%) by Model")
print(attribution_pct.round(1))

In [None]:
# Visualize difference between models: one pie per attribution model
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

models = ['First Touch', 'Last Touch', 'Linear', 'Time Decay', 'Position Based', 'Markov (Data-Driven)']
colors = sns.color_palette('Set2', len(attribution_df))

# Give every channel one fixed colour. Each pie orders its slices by that
# model's share, so assigning colours by slice position would paint the same
# channel differently from pie to pie and make the panels impossible to compare.
# NOTE(review): seaborn's Set2 is a small qualitative palette; with many
# channels the colours presumably repeat — confirm the channel count.
channel_colors = dict(zip(attribution_df.index, colors))

for i, model in enumerate(models):
    ax = axes[i//3, i%3]
    data = attribution_pct[model].sort_values(ascending=False)
    ax.pie(
        data.values,
        labels=data.index,
        autopct='%1.1f%%',
        colors=[channel_colors[channel] for channel in data.index],
    )
    ax.set_title(model, fontweight='bold', fontsize=12)

plt.suptitle('Channel Attribution by Model', fontweight='bold', fontsize=16, y=1.02)
plt.tight_layout()
plt.savefig('../data/attribution_pies.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Headline: the top channel under a selection of the models
print("="*60)
print("üéØ ATTRIBUTION INSIGHTS")
print("="*60)

headline_models = ['First Touch', 'Last Touch', 'Linear', 'Markov (Data-Driven)']
for model in headline_models:
    model_shares = attribution_pct[model]
    top_channel, top_pct = model_shares.idxmax(), model_shares.max()
    print(f"\n{model}: {top_channel} ({top_pct:.1f}%)")

# Channels whose share moves by more than 5 percentage points between
# first-touch and last-touch: a gain marks a "Closer", a loss an "Introducer"
print("\nüìà Channel Valuation Differences:")
first_to_last_shift = attribution_pct['Last Touch'] - attribution_pct['First Touch']
for channel, diff in first_to_last_shift.items():
    if abs(diff) > 5:
        if diff > 0:
            direction = "Closer"
        else:
            direction = "Introducer"
        print(f"   {channel}: {direction} ({diff:+.1f}% shift from first to last touch)")

---
## 5. Touchpoint Type Analysis

In [None]:
# Analyze touchpoint types in the journey
touchpoint_type_analysis = journeys.groupby('touchpoint_type').agg({
    'touchpoint_id': 'count',
    'order_value': 'sum'
}).rename(columns={'touchpoint_id': 'count'})

touchpoint_type_analysis['avg_value'] = touchpoint_type_analysis['order_value'] / touchpoint_type_analysis['count']
touchpoint_type_analysis = touchpoint_type_analysis.sort_values('order_value', ascending=False)

print("üìç Touchpoint Type Analysis")
print(touchpoint_type_analysis.round(2))

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to plot, panel title, y-axis label) for the left and right panels
panels = [
    ('count', 'Touchpoint Type Frequency in Converting Journeys', 'Count'),
    ('order_value', 'Revenue by Touchpoint Type', 'Revenue ($)'),
]

for ax, (column, title, ylabel) in zip(axes, panels):
    touchpoint_type_analysis[column].plot(
        kind='bar',
        ax=ax,
        color=sns.color_palette('viridis', len(touchpoint_type_analysis)),
    )
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../data/touchpoint_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

---
## ✅ Attribution Modeling Complete!

**Key Outputs:**
- Channel attribution across 6 models
- Touchpoint type analysis
- Journey path insights

**Charts saved:**
- `attribution_comparison.png`
- `attribution_pies.png`
- `touchpoint_analysis.png`

**Next: Run `04_influencer_scoring.ipynb` to build influencer effectiveness model**