# From Posts to Polls: Likes, Time Windows, and Election Prediction

In this notebook we explore how **social media signals** (in particular, Facebook likes) can be combined with simple **machine learning-style aggregations** and **temporal weighting** to approximate the outcome of the **2024 US presidential election**.

We work with the following Facebook dataset:

- `US_2024_800k.csv`

Each row corresponds to a post and contains:

- `p_id` – unique post identifier  
- `conf` – confidence of the classifier  
- `class` – one of `['Neither', 'Anti-Kamala', 'Anti-Trump', 'Pro-Kamala', 'Pro-Trump']`  
- `statistics.like_count` – number of likes for the post  
- `creation_time` – timestamp of the post  
- `date` – date (day-level) of the post  
- `candidate_support` – aggregated candidate label (`Kamala`, `Trump`, or `NaN` for `Neither`), derived from `class`

Starting from these data, we will:

- Describe the distribution of posts, confidence, and likes across classes.
- Aggregate posts into **candidate-level signals** (share of posts, share of likes).
- Compare these signals to **national polling averages** and the **final 2024 popular vote**.
- Study how **time** matters:
  - daily and cumulative like share over time,
  - moving-average windows of different lengths,
  - **exponentially weighted windows** with different time scales (τ),
  - and how these choices affect the final election prediction.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import numpy as np

In [None]:

# Location of the classified Facebook posts (one row per post)
raw_posts_classification = '/home/ecuser/Python-Networks/US_2024_800k.csv'

import pandas as pd
import matplotlib.pyplot as plt

# Load -> parse timestamps (unparseable values become NaT) -> drop rows
# without a valid timestamp -> derive a day-level `date` column for grouping.
df = (
    pd.read_csv(raw_posts_classification)
      .assign(creation_time=lambda posts: pd.to_datetime(posts['creation_time'], errors='coerce'))
      .dropna(subset=['creation_time'])
      .assign(date=lambda posts: posts['creation_time'].dt.floor('D'))
)

#### 1. Interactions and number of posts over time

In [None]:
# Per-day totals: number of distinct posts and summed likes
daily = (
    df.groupby('date')
      .agg(
          n_posts=pd.NamedAgg(column='p_id', aggfunc='nunique'),
          total_likes=pd.NamedAgg(column='statistics.like_count', aggfunc='sum'),
      )
      .reset_index()
)

display(daily.head())

# First and last day covered by the data, normalized to midnight.
# No week markers are drawn in this figure; `start`/`end` are only needed if
# you add them, e.g. plt.axvline(d, linestyle='--', alpha=0.4, color='red')
# for each d in pd.date_range(start, end, freq='W-SUN').
start, end = (daily['date'].agg(stat).normalize() for stat in ('min', 'max'))

plt.figure(figsize=(8, 4))
plt.plot(daily['date'], daily['n_posts'])
plt.title('Number of posts over time')
plt.xlabel('Date')
plt.ylabel('Number of posts')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



In [None]:
# First and last day covered by the data, normalized to midnight.
# No week markers are drawn in this figure; `start`/`end` are only needed if
# you add them, e.g. plt.axvline(d, linestyle='--', alpha=0.4, color='red')
# for each d in pd.date_range(start, end, freq='W-SUN').
start, end = (daily['date'].agg(stat).normalize() for stat in ('min', 'max'))

# Total likes received by all posts created on each day
plt.figure(figsize=(8, 4))
plt.plot(daily['date'], daily['total_likes'])
plt.title('Number of likes over time')
plt.xlabel('Date')
plt.ylabel('Number of likes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


#### 2. Overall post classification distribution

In [None]:
# Fixed display order for the five classifier labels
class_order = ['Neither', 'Anti-Kamala', 'Anti-Trump', 'Pro-Kamala', 'Pro-Trump']

# Posts per class in display order (a class with no posts becomes NaN),
# and the same counts expressed as a fraction of all counted posts
class_counts = df['class'].value_counts().reindex(index=class_order)
class_props = class_counts.div(class_counts.sum())

In [None]:
# Bar chart: number of posts assigned to each class
plt.figure(figsize=(8, 4))
class_counts.plot.bar()
plt.title('Distribution of predicted classes')
plt.xlabel('Class')
plt.ylabel('Number of posts')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Only posts that actually carry a confidence value
df_conf = df.dropna(subset=['conf'])

# (class, confidences) pairs in display order; classes without data are skipped
per_class = [
    (cls, df_conf.loc[df_conf['class'].eq(cls), 'conf'].to_numpy())
    for cls in class_order
]
non_empty = [(cls, arr) for cls, arr in per_class if arr.size > 0]
labels = [cls for cls, _ in non_empty]
data = [arr for _, arr in non_empty]

# One box per class; outliers are hidden (showfliers=False)
plt.figure(figsize=(8, 4))
plt.boxplot(data, labels=labels, showfliers=False)
plt.title('Confidence distribution by class')
plt.xlabel('Class')
plt.ylabel('Confidence')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
class_order = ['Neither', 'Anti-Kamala', 'Anti-Trump', 'Pro-Kamala', 'Pro-Trump']

# Only posts that actually carry a confidence value
df_conf = df.dropna(subset=['conf'])

# One histogram figure per class; classes without any post are skipped
for cls in class_order:
    subset = df_conf.loc[df_conf['class'] == cls, 'conf']
    if not subset.empty:
        plt.figure(figsize=(6, 4))
        plt.hist(subset, bins=20)
        plt.title(f'Confidence distribution – {cls}')
        plt.xlabel('Confidence')
        plt.ylabel('Number of posts')
        plt.tight_layout()
        plt.show()



#### 2b. Overall post classification distribution by number of likes

In [None]:
class_order = ['Neither', 'Anti-Kamala', 'Anti-Trump', 'Pro-Kamala', 'Pro-Trump']

# Store `class` as an ordered categorical so groupings follow `class_order`.
# Note: this overwrites the column on `df`, so every later cell sees a
# categorical `class` column.
df['class'] = pd.Categorical(df['class'], categories=class_order, ordered=True)

# Per-class number of distinct posts, total likes and mean likes per post.
# `observed=False` is passed explicitly: it keeps classes that have no posts
# in the result and avoids relying on the groupby default for categorical
# keys, which newer pandas versions warn about.
class_likes = (
    df.groupby('class', observed=False)
      .agg(
          n_posts=('p_id', 'nunique'),
          total_likes=('statistics.like_count', 'sum'),
          mean_likes=('statistics.like_count', 'mean')
      )
      .reindex(class_order)
)

In [None]:
# Bar chart: summed likes of all posts in each class
plt.figure(figsize=(8, 4))
plt.bar(x=class_likes.index.astype(str), height=class_likes['total_likes'])
plt.title('Total interactions (likes) per class')
plt.xlabel('Class')
plt.ylabel('Total likes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Bar chart: mean number of likes a post receives, per class
plt.figure(figsize=(8, 4))
plt.bar(x=class_likes.index.astype(str), height=class_likes['mean_likes'])
plt.title('Average interactions per post by class')
plt.xlabel('Class')
plt.ylabel('Average likes per post')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


#### 3. Number of posts over time by class (excluding "Neither")

In [None]:
# Filter out 'Neither'
df_non_neither = df[df['class'] != 'Neither']

# Group by date and class
daily_class_counts = (
    df_non_neither
      .groupby(['date', 'class'])
      .agg(n_posts=('p_id', 'nunique'))
      .reset_index()
)

In [None]:
# Wide table: one row per day, one column per class (missing combinations -> 0)
posts_by_class = (
    daily_class_counts
    .pivot(index='date', columns='class', values='n_posts')
    .fillna(0)
)

# One line per class
plt.figure(figsize=(10, 5))
for cls, series in posts_by_class.items():
    plt.plot(series.index, series, label=cls)

plt.title('Number of posts over time by class (excluding Neither)')
plt.xlabel('Date')
plt.ylabel('Number of posts')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


#### 4. Reactions (likes) over time by class

In [None]:
# Total likes per (day, class) for posts that take a side.
# `observed=True` matters because `class` was turned into a categorical in an
# earlier cell: without it the groupby also emits a row for every unused
# category, so a zero-valued 'Neither' series would reappear in the
# "excluding Neither" plot.
daily_class_likes = (
    df_non_neither
      .groupby(['date', 'class'], observed=True)
      .agg(total_likes=('statistics.like_count', 'sum'))
      .reset_index()
)


In [None]:
# Wide table: one row per day, one column per class (missing combinations -> 0)
likes_by_class = (
    daily_class_likes
    .pivot(index='date', columns='class', values='total_likes')
    .fillna(0)
)

# One line per class
plt.figure(figsize=(10, 5))
for cls, series in likes_by_class.items():
    plt.plot(series.index, series, label=cls)

plt.title('Interactions (likes) over time by class (excluding Neither)')
plt.xlabel('Date')
plt.ylabel('Total likes')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


#### 5. Aggregated Support: Kamala vs Trump (Pro + Anti-Opponent)

In [None]:
# Map detailed classes to candidate support
support_map = {
    'Pro-Kamala': 'Kamala',
    'Anti-Trump': 'Kamala',
    'Pro-Trump': 'Trump',
    'Anti-Kamala': 'Trump',
    'Neither': None  # explicitly ignore
}

# New column with aggregated support
df['candidate_support'] = df['class'].map(support_map)

# Keep only posts that clearly support one side
support_df = df[df['candidate_support'].notna()]

# Counts and normalized percentages
support_counts = support_df['candidate_support'].value_counts()
support_props = support_counts / support_counts.sum()

In [None]:
# Row order used for every per-candidate series below
candidate_order = ['Kamala', 'Trump']

# --- Real 2024 national popular vote (raw fractions: 48.3% / 49.8%) ---
harris_raw, trump_raw = 0.483, 0.498
total_two = harris_raw + trump_raw

# Two-candidate normalization: the two shares sum to 1
actual_norm = pd.Series(
    [harris_raw / total_two, trump_raw / total_two],
    index=['Kamala', 'Trump'],
    name='Election 2024 (popular vote, normalized)',
)

# --- Raw national poll numbers (as fractions): Cygnal ---
harris_poll_raw, trump_poll_raw = 0.50, 0.47
poll_total_two = harris_poll_raw + trump_poll_raw

# Normalized to just these two candidates (others/undecided ignored)
poll_norm = pd.Series(
    [harris_poll_raw / poll_total_two, trump_poll_raw / poll_total_two],
    index=['Kamala', 'Trump'],
    name='National polls (normalized)',
)

# --- Social-media signals ---
# Share of posts per candidate, in the common row order
support_props = support_props.reindex(candidate_order)

# Total likes per candidate (only Kamala/Trump-aligned posts) and their share
likes_by_candidate = (
    support_df['statistics.like_count']
    .groupby(support_df['candidate_support'])
    .sum()
    .reindex(candidate_order)
)
likes_share = likes_by_candidate.div(likes_by_candidate.sum())

# --- One table, one row per candidate, all columns on a 0-1 scale ---
comparison = (
    pd.concat([support_props.rename('Social media (posts)'), actual_norm], axis=1)
      .assign(**{'Likes share (normalized)': likes_share})
      .reindex(candidate_order)
      .assign(**{'National polls (normalized)': poll_norm.reindex(candidate_order)})
)


In [None]:
# Show the per-candidate comparison table built in the previous cell
comparison

In [None]:
comparison = comparison.reindex(['Kamala', 'Trump'])

# Shares as percentages, one value per candidate
social = comparison['Social media (posts)'].mul(100).to_numpy()
polls = comparison['National polls (normalized)'].mul(100).to_numpy()
real = comparison['Election 2024 (popular vote, normalized)'].mul(100).to_numpy()
labels = list(comparison.index)

x = np.arange(len(labels))
width = 0.22  # narrow enough to fit 3 bars per candidate

colors = {'Kamala': 'blue', 'Trump': 'red'}
bar_colors = [colors[name] for name in labels]

plt.figure(figsize=(7, 4))
ax = plt.gca()

# The three metrics differ only in horizontal offset, data and bar style:
# filled = social media (left), hatched = polls (center), dotted = vote (right)
outline = dict(facecolor='none', edgecolor=bar_colors)
bars_social, bars_polls, bars_real = [
    ax.bar(position, heights, width, **style)
    for position, heights, style in [
        (x - width, social, dict(color=bar_colors, alpha=0.8)),
        (x,         polls,  dict(outline, hatch='//', linewidth=1.5)),
        (x + width, real,   dict(outline, linestyle=':', linewidth=2)),
    ]
]

ax.set_title('Kamala vs Trump: posts, polls, and 2024 national vote')
ax.set_ylabel('Share (%)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylim(0, 100)

# Legend keyed by metric (bar style), not by candidate (bar color)
legend_elements = [
    Patch(facecolor='gray', alpha=0.8, label='Social media (posts)'),
    Patch(facecolor='none', edgecolor='gray', hatch='//', label='National polls'),
    Patch(facecolor='none', edgecolor='gray', linestyle=':', linewidth=2,
          label='2024 national vote'),
]
ax.legend(handles=legend_elements, title='Metric')

plt.tight_layout()
plt.show()


In [None]:
comparison = comparison.reindex(['Kamala', 'Trump'])

# Shares as percentages, one value per candidate
likes = comparison['Likes share (normalized)'].mul(100).to_numpy()
polls = comparison['National polls (normalized)'].mul(100).to_numpy()
real = comparison['Election 2024 (popular vote, normalized)'].mul(100).to_numpy()
labels = list(comparison.index)

x = np.arange(len(labels))
width = 0.22  # 3 bars per candidate

colors = {'Kamala': 'blue', 'Trump': 'red'}
bar_colors = [colors[name] for name in labels]

plt.figure(figsize=(7, 4))
ax = plt.gca()

# The three metrics differ only in horizontal offset, data and bar style:
# filled = likes (left), hatched = polls (center), dotted = vote (right)
outline = dict(facecolor='none', edgecolor=bar_colors)
bars_likes, bars_polls, bars_real = [
    ax.bar(position, heights, width, **style)
    for position, heights, style in [
        (x - width, likes, dict(color=bar_colors, alpha=0.8)),
        (x,         polls, dict(outline, hatch='//', linewidth=1.5)),
        (x + width, real,  dict(outline, linestyle=':', linewidth=2)),
    ]
]

ax.set_title('Kamala vs Trump: likes, polls, and 2024 national vote')
ax.set_ylabel('Share (%)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylim(0, 100)

# Legend keyed by metric (bar style), not by candidate (bar color)
legend_elements = [
    Patch(facecolor='gray', alpha=0.8, label='Likes share'),
    Patch(facecolor='none', edgecolor='gray', hatch='//', label='National polls'),
    Patch(facecolor='none', edgecolor='gray', linestyle=':', linewidth=2,
          label='2024 national vote'),
]
ax.legend(handles=legend_elements, title='Metric')

plt.tight_layout()
plt.show()



In [None]:
comparison = comparison.reindex(['Kamala', 'Trump'])

# Signed error of each signal against the actual vote, in percentage points
vote = comparison['Election 2024 (popular vote, normalized)']
diff_posts = (comparison['Social media (posts)'] - vote) * 100
diff_polls = (comparison['National polls (normalized)'] - vote) * 100
diff_likes = (comparison['Likes share (normalized)'] - vote) * 100

diff_df = pd.DataFrame(
    {'Posts': diff_posts, 'Polls': diff_polls, 'Likes': diff_likes}
).reindex(['Kamala', 'Trump'])

labels = list(diff_df.index)
x = np.arange(len(labels))
width = 0.25  # 3 bars per candidate

colors = {'Kamala': 'blue', 'Trump': 'red'}
bar_colors = [colors[name] for name in labels]

plt.figure(figsize=(7, 4))
ax = plt.gca()

# The three signals differ only in horizontal offset, data and bar style:
# filled = posts (left), hatched = polls (center), dotted = likes (right)
outline = dict(facecolor='none', edgecolor=bar_colors)
bars_posts, bars_polls, bars_likes = [
    ax.bar(position, diff_df[signal].values, width, **style)
    for position, signal, style in [
        (x - width, 'Posts', dict(color=bar_colors, alpha=0.8)),
        (x,         'Polls', dict(outline, hatch='//', linewidth=1.5)),
        (x + width, 'Likes', dict(outline, linestyle=':', linewidth=2)),
    ]
]

# Zero line = signal equals the actual vote share
ax.axhline(0, color='black', linewidth=1)

ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_title('Posts, polls, and likes vs 2024 vote (by candidate)')
ax.set_ylabel('Difference vs vote (percentage points)')

# Legend keyed by signal (bar style), not by candidate (bar color)
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='gray', alpha=0.8, label='Posts'),
    Patch(facecolor='none', edgecolor='gray', hatch='//', label='Polls'),
    Patch(facecolor='none', edgecolor='gray', linestyle=':', linewidth=2,
          label='Likes'),
]
ax.legend(handles=legend_elements, title='Signal')

plt.tight_layout()
plt.show()

#### 6. Like Share Over Time

In [None]:

# Ensure datetime (values that cannot be parsed become NaT)
df['creation_time'] = pd.to_datetime(df['creation_time'], errors='coerce')

# Keep only posts that support one of the two candidates
likes_df = df[df['candidate_support'].notna()].copy()

# Daily granularity (you could also use weekly with .dt.to_period('W'))
likes_df['date'] = likes_df['creation_time'].dt.floor('D')

# Sum likes per day and candidate
daily_likes = (
    likes_df
    .groupby(['date', 'candidate_support'])['statistics.like_count']
    .sum()
    .reset_index()
)

# Pivot to get one row per day with columns Kamala / Trump (missing -> 0)
likes_by_day = daily_likes.pivot(
    index='date',
    columns='candidate_support',
    values='statistics.like_count'
).fillna(0)

# Make sure both columns exist even if one candidate has no posts at all
for c in ['Kamala', 'Trump']:
    if c not in likes_by_day.columns:
        likes_by_day[c] = 0

# Total likes per day across both candidates
likes_by_day['total'] = likes_by_day['Kamala'] + likes_by_day['Trump']

# Drop days without any likes to avoid division by zero.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
likes_by_day = likes_by_day[likes_by_day['total'] > 0].copy()

# Daily like share per candidate (the two shares sum to 1 on each day)
likes_by_day['kamala_share'] = likes_by_day['Kamala'] / likes_by_day['total']
likes_by_day['trump_share']  = likes_by_day['Trump']  / likes_by_day['total']


In [None]:
plt.figure(figsize=(9, 4))

# Daily like share, one line per candidate
for column, color, label in [
    ('kamala_share', 'blue', 'Kamala like share'),
    ('trump_share', 'red', 'Trump like share'),
]:
    plt.plot(likes_by_day.index, likes_by_day[column], label=label, color=color, linewidth=2)

# 50% reference line
plt.axhline(0.5, color='black', linestyle='--', linewidth=1)

plt.title('Daily like share over time (Kamala vs Trump)')
plt.xlabel('Date')
plt.ylabel('Share of likes')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Ensure sorted by date
likes_by_day = likes_by_day.sort_index()

# Cumulative likes per candidate
likes_by_day['cum_kamala'] = likes_by_day['Kamala'].cumsum()
likes_by_day['cum_trump']  = likes_by_day['Trump'].cumsum()

# Total cumulative likes
likes_by_day['cum_total'] = likes_by_day['cum_kamala'] + likes_by_day['cum_trump']

# Avoid division by zero just in case
likes_by_day = likes_by_day[likes_by_day['cum_total'] > 0]

# Cumulative like share
likes_by_day['cum_kamala_share'] = likes_by_day['cum_kamala'] / likes_by_day['cum_total']
likes_by_day['cum_trump_share']  = likes_by_day['cum_trump']  / likes_by_day['cum_total']

In [None]:
plt.figure(figsize=(9, 4))

# Cumulative like share, one line per candidate
for column, color, label in [
    ('cum_kamala_share', 'blue', 'Kamala cumulative like share'),
    ('cum_trump_share', 'red', 'Trump cumulative like share'),
]:
    plt.plot(likes_by_day.index, likes_by_day[column], label=label, linewidth=2, color=color)

# 50% baseline
plt.axhline(0.5, color='black', linestyle='--', linewidth=1)

plt.title('Cumulative like share over time (Kamala vs Trump)')
plt.xlabel('Date')
plt.ylabel('Cumulative share of likes')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def add_week_boundaries(dates, ax=None):
    """Draw a faint dotted vertical line at every Monday covered by `dates`.

    Parameters
    ----------
    dates : pandas.DatetimeIndex or datetime-like Series
        Dates spanned by the plot. Only the minimum and maximum are used,
        each normalized to midnight.
    ax : object with an ``axvline`` method, optional
        Axes to draw on. Defaults to the current pyplot axes, which is what
        the original single-argument call drew on.

    Returns
    -------
    None
        Nothing is drawn when `dates` is empty.
    """
    # An empty index has no min/max to build a date range from
    if len(dates) == 0:
        return

    if ax is None:
        ax = plt.gca()

    start = dates.min().normalize()
    end = dates.max().normalize()

    # freq='W-MON' yields the Mondays between start and end (inclusive)
    for week_start in pd.date_range(start, end, freq='W-MON'):
        ax.axvline(week_start, linestyle=':', alpha=0.3)

# Chronological order for plotting
likes_by_day = likes_by_day.sort_index()

plt.figure(figsize=(9, 4))

# Cumulative like share, one line per candidate
for column, color, label in [
    ('cum_kamala_share', 'blue', 'Kamala cumulative like share'),
    ('cum_trump_share', 'red', 'Trump cumulative like share'),
]:
    plt.plot(likes_by_day.index, likes_by_day[column], label=label, linewidth=2, color=color)

# 50% baseline
plt.axhline(0.5, color='black', linestyle='--', linewidth=1)

# --- Election result stars, placed at the last day of the series ---
last_date = likes_by_day.index.max()

kamala_vote = comparison.loc['Kamala', 'Election 2024 (popular vote, normalized)']
trump_vote  = comparison.loc['Trump',  'Election 2024 (popular vote, normalized)']

for share, color, label in [
    (kamala_vote, 'blue', 'Kamala 2024 vote share'),
    (trump_vote, 'red', 'Trump 2024 vote share'),
]:
    plt.scatter([last_date], [share], marker='*', s=200,
                color=color, edgecolor='black', label=label)

plt.title('Cumulative like share over time vs 2024 election results')
plt.xlabel('Date')
plt.ylabel('Cumulative share of likes')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
window = 7  # days

likes_by_day = likes_by_day.sort_index()

# Trailing mean over the last `window` rows of each daily share;
# min_periods=1 means the first rows average over whatever is available.
for candidate in ('kamala', 'trump'):
    likes_by_day[f'{candidate}_share_ma'] = (
        likes_by_day[f'{candidate}_share']
        .rolling(window=window, min_periods=1)
        .mean()
    )

# Raw vs smoothed shares, first rows
likes_by_day[['kamala_share', 'kamala_share_ma',
              'trump_share', 'trump_share_ma']].head()


In [None]:
plt.figure(figsize=(9, 4))

# Smoothed (moving-average) daily like share, one line per candidate
for column, color, label in [
    ('kamala_share_ma', 'blue', f'Kamala like share (MA {window}d)'),
    ('trump_share_ma', 'red', f'Trump like share (MA {window}d)'),
]:
    plt.plot(likes_by_day.index, likes_by_day[column],
             color=color, linewidth=2, label=label)

# 50% reference
plt.axhline(0.5, color='black', linestyle='--', linewidth=1)

add_week_boundaries(likes_by_day.index)

# --- Election result stars, placed at the last day of the series ---
last_date = likes_by_day.index.max()

kamala_vote = comparison.loc['Kamala', 'Election 2024 (popular vote, normalized)']
trump_vote  = comparison.loc['Trump',  'Election 2024 (popular vote, normalized)']

for share, color, label in [
    (kamala_vote, 'blue', 'Kamala 2024 vote share'),
    (trump_vote, 'red', 'Trump 2024 vote share'),
]:
    plt.scatter([last_date], [share], marker='*', s=200,
                color=color, edgecolor='black', label=label)

plt.title(f'Daily like share (MA {window}d) vs 2024 election results')
plt.xlabel('Date')
plt.ylabel('Share of likes')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()



In [None]:

# Actual two-candidate vote shares (in [0, 1]) to compare predictions against
kamala_real = comparison.loc['Kamala', 'Election 2024 (popular vote, normalized)']
trump_real  = comparison.loc['Trump',  'Election 2024 (popular vote, normalized)']

# Moving-average window lengths to try (in rows of the daily table)
window_sizes = [1, 3, 5, 7, 10, 14, 21, 30, 37, 45, 60]

results = []

for w in window_sizes:
    # Smooth both daily shares at once; the prediction for this window is
    # the smoothed value on the last day of the series
    smoothed = (
        likes_by_day[['kamala_share', 'trump_share']]
        .rolling(window=w, min_periods=1)
        .mean()
    )
    kamala_pred, trump_pred = smoothed.iloc[-1]

    # Signed error against the actual vote, in percentage points
    results.append({
        'window': w,
        'kamala_diff_pp': (kamala_pred - kamala_real) * 100,
        'trump_diff_pp':  (trump_pred - trump_real) * 100,
    })

diff_df = pd.DataFrame(results).set_index('window')

In [None]:
plt.figure(figsize=(7, 4))
x = diff_df.index.values

# Prediction error per window length, one line per candidate
for column, color, label in [
    ('kamala_diff_pp', 'blue', 'Kamala'),
    ('trump_diff_pp', 'red', 'Trump'),
]:
    plt.plot(x, diff_df[column], marker='o', color=color, label=label)

# Zero = perfect match with real vote
plt.axhline(0, color='black', linestyle='--', linewidth=1)

plt.title('Effect of smoothing window on like-based prediction')
plt.xlabel('Moving average window (days)')
plt.ylabel('Prediction – vote (percentage points)')
plt.xticks(x)
plt.legend()
plt.tight_layout()
plt.show()


#### 7. Exponentially Weighted Windows on Likes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

likes_by_day = likes_by_day.sort_index()

# Daily like totals per candidate as plain arrays, in chronological order
kam_likes = likes_by_day['Kamala'].to_numpy()
trump_likes = likes_by_day['Trump'].to_numpy()
n_days = len(likes_by_day)
index_date = likes_by_day.index

kamala_real = comparison.loc['Kamala', 'Election 2024 (popular vote, normalized)']
trump_real  = comparison.loc['Trump',  'Election 2024 (popular vote, normalized)']

# Expanding windows: the first day only, then the first 2 days, ..., up to all days
window_sizes = range(1, n_days + 1)

# Decay time in days: a day that lies tau days before the end of the window
# is weighted exp(-1) ~ 37% of the window's last day.
tau = 14.0
results = []

for w in window_sizes:
    # The window is the FIRST w days of the series; its most recent day is index w-1
    arr_k, arr_t = kam_likes[:w], trump_likes[:w]

    # Age of each day relative to the window's last day: w-1, ..., 1, 0
    k = (w - 1) - np.arange(w, dtype=float)

    # Exponential decay with time constant tau (the last day gets weight 1)
    weights = np.exp(-k / tau)

    # Decay-weighted like totals per candidate
    w_k = (arr_k * weights).sum()
    w_t = (arr_t * weights).sum()
    total = w_k + w_t

    # Share of weighted likes; undefined (NaN) when the window has no likes
    kam_pred, trump_pred = (w_k / total, w_t / total) if total > 0 else (np.nan, np.nan)

    results.append({
        'window': index_date[w-1],   # date on which this window ends
        'kamala_pred': kam_pred,
        'trump_pred':  trump_pred
    })

exp_pred = pd.DataFrame(results).set_index('window')

In [None]:
plt.figure(figsize=(9, 4))

# Like-based predictions with expanding, exponentially weighted window
for column, color, label in [
    ('kamala_pred', 'blue', 'Kamala like-based prediction (exp. expanding)'),
    ('trump_pred', 'red', 'Trump like-based prediction (exp. expanding)'),
]:
    plt.plot(exp_pred.index, exp_pred[column], label=label, linewidth=2, color=color)

# 50% baseline
plt.axhline(0.5, color='black', linestyle='--', linewidth=1)

# Actual 2024 vote shares, drawn below as stars
kamala_real = comparison.loc['Kamala', 'Election 2024 (popular vote, normalized)']
trump_real  = comparison.loc['Trump',  'Election 2024 (popular vote, normalized)']

# Week markers
add_week_boundaries(exp_pred.index)

# Stars at the last day: actual vote to compare the final prediction with
last_date = exp_pred.index.max()

for share, color, label in [
    (kamala_real, 'blue', 'Kamala actual vote'),
    (trump_real, 'red', 'Trump actual vote'),
]:
    plt.scatter([last_date], [share], marker='*', s=200,
                color=color, edgecolor='black', label=label)

plt.title('Exponentially-weighted expanding like-based prediction vs 2024 vote')
plt.xlabel('Date (expanding window ends here)')
plt.ylabel('Share (0–1)')
plt.xticks(rotation=45)
plt.ylim(0, 1)
# No legend is drawn for this figure (the plt.legend() call is disabled)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

likes_by_day = likes_by_day.sort_index()

# Daily like totals per candidate as plain arrays, in chronological order
kam_likes = likes_by_day['Kamala'].to_numpy()
trump_likes = likes_by_day['Trump'].to_numpy()
n_days = len(likes_by_day)

kamala_real = comparison.loc['Kamala', 'Election 2024 (popular vote, normalized)']
trump_real  = comparison.loc['Trump',  'Election 2024 (popular vote, normalized)']

# Age of each day (in rows) relative to the most recent one:
# for n_days=5 -> k = [4, 3, 2, 1, 0]
k = np.arange(n_days, dtype=float)[::-1]

# Decay times (in days) to explore
taus = [1, 2, 3, 5, 7, 10, 14, 20, 30, 45, 60, 90, 120]

tau_results = []

for tau in taus:
    # Exponential weights over the whole series, normalized to sum to 1
    weights = np.exp(-k / tau)
    weights = weights / weights.sum()

    # Decay-weighted like totals per candidate
    w_k = (kam_likes * weights).sum()
    w_t = (trump_likes * weights).sum()
    total = w_k + w_t

    # Share of weighted likes; undefined (NaN) when there are no weighted likes
    kam_pred, trump_pred = (w_k / total, w_t / total) if total > 0 else (np.nan, np.nan)

    # Prediction plus its signed error against the vote, in percentage points
    tau_results.append({
        'tau': tau,
        'kamala_pred': kam_pred,
        'trump_pred':  trump_pred,
        'kamala_diff_pp': (kam_pred - kamala_real) * 100,
        'trump_diff_pp':  (trump_pred - trump_real) * 100
    })

tau_df = pd.DataFrame(tau_results).set_index('tau')

In [None]:
plt.figure(figsize=(7, 4))

# Final-prediction error per tau, one line per candidate
for column, color, label in [
    ('kamala_diff_pp', 'blue', 'Kamala'),
    ('trump_diff_pp', 'red', 'Trump'),
]:
    plt.plot(tau_df.index, tau_df[column], marker='o', color=color, label=label)

# Zero = prediction equals the actual vote share
plt.axhline(0, color='black', linestyle='--', linewidth=1)

plt.title('Effect of tau on like-based final prediction')
plt.xlabel('Tau (days) in exponential decay')
plt.ylabel('Prediction – vote (percentage points)')
plt.xticks(tau_df.index)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Show the per-tau predictions and errors, rounded to two decimals
display(tau_df.round(2))

#### 8. Analysis by state

In [None]:
import pandas as pd
# Load the state-level likes table from a hard-coded local CSV path
likes_state_level = pd.read_csv('/home/ecuser/Python-Networks/2024USElections_likes_by_state.csv')

In [None]:
# Columns to inspect in the state-level table.
# The original cell had this list pasted inside a single pair of quotes
# (`likes_state_level['...']`), which is a syntax error; selecting several
# columns requires passing a list.
# NOTE(review): assumes every name below exists in the CSV — confirm against
# likes_state_level.columns.
state_columns = [
    'activities', 'content_type', 'creation_time', 'id',
    'is_branded_content', 'lang', 'link_attachment.caption',
    'link_attachment.description', 'link_attachment.link',
    'link_attachment.name', 'match_type', 'mcl_url', 'modified_time',
    'multimedia', 'post_owner.type', 'post_owner.id', 'post_owner.name',
    'post_owner.username', 'shared_post_id', 'statistics.angry_count',
    'statistics.care_count', 'statistics.comment_count',
    'statistics.haha_count', 'statistics.like_count',
    'statistics.love_count', 'statistics.reaction_count',
    'statistics.sad_count', 'statistics.share_count', 'statistics.views',
    'statistics.views_date_last_refreshed', 'statistics.wow_count',
    'surface.type', 'surface.id', 'surface.name', 'surface.username',
    'text', 'page_id',
]

# Preview the first rows only, rather than rendering the whole table
likes_state_level[state_columns].head()