In [58]:
import pandas as pd
import numpy as np
from scipy.stats import linregress
from pathlib import Path

# Split the Twitter data into pilot data and study data

## Load data sets

In [59]:
src = "../data"
dst = "../data/processed"

In [60]:
fname = "KnowWho_profiles_merged_twitter.csv"
knowwho_profiles = pd.read_csv(
    Path(src, "tmp", fname), 
    dtype={"author_id":str},
    parse_dates=["created_at", "ELECTIONDATE"]
)

The table `primaries_for_margin_research_RW.csv` was coded by Richard Westway (email on 2023-06-14) who added the columns VOTE_SHARE, NOTES, and COMPLETE. The corresponding email chain is archived in `Projects/CSS_honesty/analysis/midterms/reg_report/`.
* VOTE_SHARE: percentage of the votes gathered by the candidate
* NOTES on various corrections and special election types. Legend:
    * A – Candidate advanced
    * C – Primary cancelled and candidate advanced
    * D# – Corrected district id for this candidate
    * DUP – Duplicate candidate entry
    * NL – Candidate not listed in results
    * NP – Vote share from Nonpartisan Primary
    * R# – Candidate advanced to Primary runoff followed by runoff vote share
    * RCV1 – Vote share from 1st round of ranked-choice voting
    * S-## - Corrected state for this candidate
    * UW – Candidate unofficially withdrew
    * WD – Candidate withdrawn or disqualified
* COMPLETE: disregard, all rows are "Y"

JL updated the data to split the "NOTES" column into several columns, saved in the table `primaries_for_margin_research_RW_JL.csv`. The new table contains the following columns (next to the original VOTE_SHARE and updated NOTES):
* DISTRICT_CORR – contains the corrected district number, corresponding D# entries have been removed from the NOTES column
* STATE_CORR – contains the corrected state acronym, corresponding S# entries have been removed from the NOTES column
* NP – contains a 1 if it was a nonpartisan primary, corresponding NP entries have been removed from the NOTES column
* RCV1 – contains a 1 if it was a ranked-choice voting, corresponding RCV1 entries have been removed from the NOTES column
* RUNOFF_PERC – contains the percentage of the runoff vote share, corresponding R# entries have been removed from the NOTES column

In [61]:
fname = "primaries_for_margin_research_RW_JL.csv"
margins = pd.read_csv(Path(src, "raw", fname), parse_dates=["ELECTIONDATE"], dayfirst=True)
margins = margins.drop(columns="COMPLETE")
margins = margins.dropna(subset=["handle"])
margins = margins.set_index(["handle", "ELECTIONDATE"])
margins.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,PARTY,STATEABBR,RACE,DISTRICT,VOTE_SHARE,NOTES,NONPARTISAN_PRIMARY,DISTRICT_CORR,STATE_CORR,RUNOFF_PERC,RANKED_CHOICE_VOTING
handle,ELECTIONDATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
carlforalabama,2022-05-24,Jerry L. Carl,R,AL,US Representative,1.0,,C,,,,,
terellanderson,2022-05-24,Reginald Terell Anderson,D,AL,US Representative,2.0,,WD,,,,,
phyllisdhhall,2022-05-24,Phyllis Harvey-Hall,D,AL,US Representative,2.0,68.8,,,,,,


In [62]:
fname = "combined_midterm_candidate_timelines_2022-01-01_to_2023-05-01_clean.csv.gzip"
cols = ["id", "author_id", "created_at", "retweeted", "quoted",
        "reply", "text", "retweet_count", "reply_count", "like_count",
        "quote_count"]
tweets = pd.read_csv(
    Path(src, "raw", fname),
    dtype={"id":str, "author_id":str},
    parse_dates=["created_at"],
    compression="gzip",
    usecols=cols
)

## Data wrangling

In [63]:
# retain only candidates that ran
knowwho_profiles = knowwho_profiles[knowwho_profiles["STATUS"] == "Running"]\
    .reset_index(drop=True)
# retain only candidates that have a Twitter profile
knowwho_profiles = knowwho_profiles.dropna(subset=["handle"])
# drop unnecessary columns
knowwho_profiles = knowwho_profiles.drop(columns=["STATUS", "ELECTIONCODE"])
len(knowwho_profiles)

1204

In [64]:
# incumbents are coded as "Y" if candidate is an incumbent and as NA if not
# transform this into a binary coding
knowwho_profiles["INCUMBENT"] = knowwho_profiles["INCUMBENT"].fillna(0)
knowwho_profiles["INCUMBENT"] = knowwho_profiles["INCUMBENT"].replace({"Y":1})

  knowwho_profiles["INCUMBENT"] = knowwho_profiles["INCUMBENT"].replace({"Y":1})


In [65]:
# double index is necessary since one candidate ran in two elections that are
# differentiated by their dates
knowwho_profiles = knowwho_profiles.set_index(["handle", "ELECTIONDATE"])

# correct district entries based on manually corrected file
knowwho_profiles = knowwho_profiles.sort_index()
margins = margins.sort_index()
for handle, electiondate in knowwho_profiles.index:
    if not np.isnan(margins.loc[(handle, electiondate), "DISTRICT_CORR"].values[0]):
        district = margins.loc[(handle, electiondate), "DISTRICT_CORR"].values[0]
        knowwho_profiles.loc[(handle, electiondate), "DISTRICT"] = district

In [66]:
# correct state entries based on manually corrected file
for handle, electiondate in knowwho_profiles.index:
    if not np.isnan(margins.loc[(handle, electiondate), "STATE_CORR"].values[0]):
        state = margins.loc[(handle, electiondate), "STATE_CORR"].values[0]
        knowwho_profiles.loc[(handle, electiondate), "STATEABBR"] = state

In [67]:
# add ranked-choice voting entries, notes, non-partisan election entries,
# percentages from runoff votes, and vote shares
knowwho_profiles["RANKED_CHOICE_VOTING"] = np.nan
knowwho_profiles["NOTES"] = np.nan
knowwho_profiles["NOTES"] = knowwho_profiles["NOTES"].astype(object)
knowwho_profiles["NONPARTISAN_PRIMARY"] = np.nan
knowwho_profiles["RUNOFF_PERC"] = np.nan
knowwho_profiles["VOTE_SHARE"] = np.nan

for handle, electiondate in knowwho_profiles.index:
    knowwho_profiles.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"] = \
        margins.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"].values[0]
    knowwho_profiles.loc[(handle, electiondate), "NOTES"] = \
        margins.loc[(handle, electiondate), "NOTES"].values[0]
    knowwho_profiles.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"] = \
        margins.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"].values[0]
    knowwho_profiles.loc[(handle, electiondate), "VOTE_SHARE"] = \
        margins.loc[(handle, electiondate), "VOTE_SHARE"].values[0]

In [68]:
# add Twitter profile handle back to data frame
fname = "candidate_twitter_profiles.csv"
cols = ["author_id", "handle"]
users = pd.read_csv(
    Path(src, "tmp", fname),
    dtype={"author_id":str},
    usecols=cols
)
tweets = pd.merge(
    tweets,
    users,
    how="left",
    left_on="author_id",
    right_on="author_id"
)

In [69]:
# add candidate information to the tweet data
knowwho_profiles = knowwho_profiles.reset_index() 
knowwho_profiles = knowwho_profiles.rename(columns={"created_at":"account_created_at"})
cols = ["author_id", "account_created_at", "followers_count", "following_count",
        "tweet_count", "INCUMBENT", "PARTY", "LEVEL", "CHAMBER", "VOTE_SHARE", 
        "GENDER", "RACE", "ETHNICITY", "NONPARTISAN_PRIMARY",
        "RANKED_CHOICE_VOTING", "ELECTIONDATE", "STATEABBR", "DISTRICT"]
tweets = pd.merge(
    tweets,
    knowwho_profiles[cols],
    how="left",
    left_on="author_id",
    right_on="author_id"
)

In [70]:
# remove all tweets from candidates that were excluded from the candidate list
tweets = tweets.dropna(subset=["ELECTIONDATE"])

In [71]:
# add information about whether a tweet happened up to 16 months before an 
# election (pilot data set 1) or at least 16 weeks after an election (pilot 
# data set 2). Tweets that are neither are # part of the study data set
knowwho_profiles = knowwho_profiles.set_index(["handle", "ELECTIONDATE"])
tweets["AFTER_ELECTION"] = np.nan
tweets = tweets.set_index("id")
i = 0
for handle, electiondate in knowwho_profiles.index:
    if i%100 == 0:
        print(f"{i}/{len(knowwho_profiles)}")
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before_index = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date())
    ].index
    candidate_tweets_after_index = candidate_tweets[
        (candidate_tweets["created_at"].dt.date > electiondate.date())
    ].index

    tweets.loc[candidate_tweets_before_index, "AFTER_ELECTION"] = 0
    tweets.loc[candidate_tweets_after_index, "AFTER_ELECTION"] = 1
    i += 1

0/1204
100/1204
200/1204
300/1204
400/1204
500/1204
600/1204
700/1204
800/1204
900/1204
1000/1204
1100/1204
1200/1204


In [72]:
# calculate the tweet length - needed later to correct the belief-speaking and
# fact-speaking scores for length bias
tweets["tweet_length"] = tweets["text"].apply(lambda x: len(x))

# calculate the number of words in a tweet
def count_words(x):
    x = str(x).split()
    x_filter = filter(lambda x: x != '@user', x)
    x_filter = list(x_filter)
    return len(x_filter)
tweets["N_words"] = tweets["text"].apply(count_words)

# remove tweets with 10 or less words
tweets = tweets[tweets["N_words"] > 10]

In [73]:
tweets["AFTER_ELECTION"].value_counts()

AFTER_ELECTION
1.0    341988
0.0    252492
Name: count, dtype: int64

## Create pilot data sets

In [74]:
# extract tweets from at least 16 weeks before or after election dates
pilot_data1 = pd.DataFrame() # data from before elections
pilot_data2 = pd.DataFrame() # data from after elections

i = 0
for handle, electiondate in knowwho_profiles.index:
    if i%100 == 0:
        print(f"{i}/{len(knowwho_profiles)}")
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date() - pd.Timedelta(days=16 * 7))
    ].copy()
    candidate_tweets_after = candidate_tweets[
        (candidate_tweets["created_at"].dt.date > electiondate.date() + pd.Timedelta(days=16 * 7))
    ].copy()
    pilot_data1 = pd.concat([pilot_data1, candidate_tweets_before])
    pilot_data2 = pd.concat([pilot_data2, candidate_tweets_after])
    i += 1

pilot_data1 = pilot_data1.reset_index()
pilot_data2 = pilot_data2.reset_index()

0/1204
100/1204
200/1204
300/1204
400/1204
500/1204
600/1204
700/1204
800/1204
900/1204
1000/1204
1100/1204
1200/1204


In [75]:
pilot_data1.head(2)

Unnamed: 0,id,author_id,created_at,retweeted,quoted,reply,text,retweet_count,reply_count,like_count,...,RACE,ETHNICITY,NONPARTISAN_PRIMARY,RANKED_CHOICE_VOTING,ELECTIONDATE,STATEABBR,DISTRICT,AFTER_ELECTION,tweet_length,N_words
0,1485665816934686724,434343925,2022-01-24 17:28:27+00:00,False,False,False,A full and fun day with fellow believers of me...,0,1,1,...,US Representative,White/Caucasian,,,2022-05-17,NC,6.0,0.0,166,25
1,1481040751987290115,434343925,2022-01-11 23:10:06+00:00,False,False,False,Congratulations go out to our second prize win...,1,0,1,...,US Representative,White/Caucasian,,,2022-05-17,NC,6.0,0.0,214,32


## Create study data set

In [76]:
study_data = pd.DataFrame()
i = 0
for handle, electiondate in knowwho_profiles.index:
    if i%100 == 0:
        print(f"{i}/{len(knowwho_profiles)}")
    # select only tweets of the candidate from the four weeks before and after 
    # the election date
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date()) & \
        (candidate_tweets["created_at"].dt.date >= electiondate.date() - pd.Timedelta(days=28))
    ].copy()
    candidate_tweets_after = candidate_tweets[
        (candidate_tweets["created_at"].dt.date > electiondate.date()) & \
        (candidate_tweets["created_at"].dt.date <= electiondate.date() + pd.Timedelta(days=28))
    ].copy()

    candidate_tweets_before["AFTER_ELECTION"] = 0
    candidate_tweets_after["AFTER_ELECTION"] = 1
    study_data = pd.concat([study_data, candidate_tweets_before, candidate_tweets_after])
    i = i + 1

study_data = study_data.reset_index()

0/1204
100/1204
200/1204
300/1204
400/1204
500/1204
600/1204
700/1204
800/1204
900/1204
1000/1204
1100/1204
1200/1204


In [77]:
study_data.head(2)

Unnamed: 0,id,author_id,created_at,retweeted,quoted,reply,text,retweet_count,reply_count,like_count,...,RACE,ETHNICITY,NONPARTISAN_PRIMARY,RANKED_CHOICE_VOTING,ELECTIONDATE,STATEABBR,DISTRICT,AFTER_ELECTION,tweet_length,N_words
0,1516921218540179457,434343925,2022-04-20 23:26:16+00:00,False,False,False,If I took a shot every time “Ted Budd” has bee...,0,3,7,...,US Representative,White/Caucasian,,,2022-05-17,NC,6.0,0,125,25
1,1524220346110337024,434343925,2022-05-11 02:50:23+00:00,False,False,False,Enjoyed Attending Michelle Bardsley’s campaign...,0,0,0,...,US Representative,White/Caucasian,,,2022-05-17,NC,6.0,0,172,22


## Save raw data sets for honesty score calculation

In [78]:
fname = "pilot_data1_text.csv.gzip"
pilot_data1[["text", "id"]].to_csv(Path(src, "tmp", fname), compression="gzip", index=False)

fname = "pilot_data2_text.csv.gzip"
pilot_data2[["text", "id"]].to_csv(Path(src, "tmp", fname), compression="gzip", index=False)

fname = "study_data_text.csv.gzip"
study_data[["text", "id"]].to_csv(Path(src, "tmp", fname), compression="gzip", index=False)

## Save study data set for power analysis

In [79]:
fname = "study_data.csv.gzip"
study_data.drop(columns=["text"]).to_csv(Path(dst, fname), compression="gzip", index=False)

## Add belief-speaking and fact-speaking scores

In [80]:
# note: this needs the sentence embedding transformer model under 
# ../data/utilities/sentence-transformers
! sh label_glove840B_DDR.sh

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["text"].replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['id'].str.replace('"', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

Note: for this to work, the following files need to have been created by the script **label_glove840B_DDR.sh** called above:
* data/tmp/pilot_data1_honesty_component_scores.csv.gzip
* data/tmp/pilot_data2_honesty_component_scores.csv.gzip

In [81]:
# load the embedding scores for belief-speaking and truth-seeking for the pilot
# data sets
fname = "pilot_data1_honesty_component_scores.csv.gzip"
honesty_scores_pilot1 = pd.read_csv(
    Path(src, "tmp", fname),
    dtype={"id":str}, 
    compression="gzip"
).rename(columns={"avg_fact_score":"avg_fact_score_raw", "avg_belief_score":"avg_belief_score_raw"})

fname = "pilot_data2_honesty_component_scores.csv.gzip"
honesty_scores_pilot2 = pd.read_csv(
    Path(src, "tmp", fname),
    dtype={"id":str}, 
    compression="gzip"
).rename(columns={"avg_fact_score":"avg_fact_score_raw", "avg_belief_score":"avg_belief_score_raw"})

# TODO
# honesty scores for study data

In [82]:
honesty_scores_pilot1.head(3)

Unnamed: 0,id,avg_fact_score_raw,avg_belief_score_raw
0,1485665816934686724,0.649483,0.688581
1,1481040751987290115,0.562164,0.645549
2,1478952190320734209,0.472821,0.551429


In [83]:
# add honesty scores to the pilot data sets
pilot_data1 = pd.merge(
    pilot_data1,
    honesty_scores_pilot1,
    how="left",
    left_on="id",
    right_on="id"
)
pilot_data2 = pd.merge(
    pilot_data2,
    honesty_scores_pilot2,
    how="left",
    left_on="id",
    right_on="id"
)

# TODO
# merge honesty scores with study data

# tweets with <20 characters of length don't get an honesty score and are
# dropped
pilot_data1 = pilot_data1.dropna(subset=["avg_belief_score_raw"])
pilot_data2 = pilot_data2.dropna(subset=["avg_belief_score_raw"])
# TODO: study data

In [84]:
# correct the similarity scores for tweet-length effects
def predict_belief_similarity(tweet_length, slope, intercept):
    return intercept + slope * tweet_length

def predict_fact_similarity(tweet_length, slope, intercept):
    return intercept + slope * tweet_length
    
slope_belief_pilot1, intercept_belief_pilot1, rval_belief_pilot1, \
    pval_belief_pilot1, stderr_belief_pilot1 = \
    linregress(pilot_data1["tweet_length"], pilot_data1["avg_belief_score_raw"])
print(f"pilot data 1: belief-speaking slope: {slope_belief_pilot1}, intercept: {intercept_belief_pilot1}")

slope_belief_pilot2, intercept_belief_pilot2, rval_belief_pilot2, \
    pval_belief_pilot2, stderr_belief_pilot2 = \
    linregress(pilot_data2["tweet_length"], pilot_data2["avg_belief_score_raw"])
print(f"pilot data 2: belief-speaking slope: {slope_belief_pilot2}, intercept: {intercept_belief_pilot2}")

# TODO
# fit belief-speaking for study data

slope_fact_pilot1, intercept_fact_pilot1, rval_fact_pilot1, \
    pval_fact_pilot1, stderr_fact_pilot1 = \
    linregress(pilot_data1["tweet_length"], pilot_data1["avg_fact_score_raw"])
print(f"pilot data 1: fact-seeking slope: {slope_fact_pilot1}, intercept: {intercept_fact_pilot1}")

slope_fact_pilot2, intercept_fact_pilot2, rval_fact_pilot2, \
    pval_fact_pilot2, stderr_fact_pilot2 = \
    linregress(pilot_data2["tweet_length"], pilot_data2["avg_fact_score_raw"])
print(f"pilot data 2: fact-seeking slope: {slope_fact_pilot2}, intercept: {intercept_fact_pilot2}")

# TODO
# fit fact-speaking for study data

pilot_data1["avg_belief_score"] = pilot_data1\
    .apply(lambda x: x["avg_belief_score_raw"] - \
           predict_belief_similarity(x["tweet_length"], slope_belief_pilot1, intercept_belief_pilot1), axis=1)
pilot_data2["avg_belief_score"] = pilot_data2\
    .apply(lambda x: x["avg_belief_score_raw"] - \
           predict_belief_similarity(x["tweet_length"], slope_belief_pilot2, intercept_belief_pilot2), axis=1)
# TODO
# add lenght-corrected belief-speaking scores to study data

pilot_data1["avg_fact_score"] = pilot_data1\
    .apply(lambda x: x["avg_fact_score_raw"] - \
           predict_fact_similarity(x["tweet_length"], slope_fact_pilot1, intercept_fact_pilot1), axis=1)
pilot_data2["avg_fact_score"] = pilot_data2\
    .apply(lambda x: x["avg_fact_score_raw"] - \
           predict_fact_similarity(x["tweet_length"], slope_fact_pilot2, intercept_fact_pilot2), axis=1)

# TODO
# add lenght-corrected fact-speaking scores to study data

pilot data 1: belief-speaking slope: 0.00019510370746800895, intercept: 0.691071541906381
pilot data 2: belief-speaking slope: 0.0002059938213591441, intercept: 0.6883044038187659
pilot data 1: fact-seeking slope: 0.00029819551293054197, intercept: 0.5858483248376631
pilot data 2: fact-seeking slope: 0.00031231523729518043, intercept: 0.5770618052326558


In [85]:
pilot_data1.head(2)

Unnamed: 0,id,author_id,created_at,retweeted,quoted,reply,text,retweet_count,reply_count,like_count,...,ELECTIONDATE,STATEABBR,DISTRICT,AFTER_ELECTION,tweet_length,N_words,avg_fact_score_raw,avg_belief_score_raw,avg_belief_score,avg_fact_score
0,1485665816934686724,434343925,2022-01-24 17:28:27+00:00,False,False,False,A full and fun day with fellow believers of me...,0,1,1,...,2022-05-17,NC,6.0,0.0,166,25,0.649483,0.688581,-0.034877,0.014134
1,1481040751987290115,434343925,2022-01-11 23:10:06+00:00,False,False,False,Congratulations go out to our second prize win...,1,0,1,...,2022-05-17,NC,6.0,0.0,214,32,0.562164,0.645549,-0.087275,-0.087499


## Save processed data sets

In [86]:
fname = "pilot_data1.csv.gzip"
# remove tweet texts before saving for data sharing
pilot_data1\
    .drop(columns=["text"])\
    .to_csv(Path(dst, fname), compression="gzip", index=False)

fname = "pilot_data2.csv.gzip"
pilot_data2\
    .drop(columns=["text"])\
    .to_csv(Path(dst, fname), compression="gzip", index=False)

# TODO
# save processed study data

## Descriptive statistics

### Pilot data 1

In [87]:
fname = "pilot_data1.csv.gzip"
pilot_data1 = pd.read_csv(
    Path(dst, fname), 
    compression="gzip",
    dtype={"id":str, "author_id":str},
    parse_dates=["created_at"]
)

In [88]:
len(pilot_data1)

83185

In [89]:
len(pilot_data1[pilot_data1["tweet_length"] > 20])

83185

In [90]:
len(pilot_data1)

83185

In [91]:
len(pilot_data1["author_id"].unique())

850

In [92]:
pilot_data1.drop_duplicates(subset=["author_id"])["PARTY"].value_counts()

PARTY
R    436
D    414
Name: count, dtype: int64

In [93]:
pilot_data1.drop_duplicates(subset=["author_id"])["INCUMBENT"].value_counts()

INCUMBENT
0.0    594
1.0    256
Name: count, dtype: int64

### Pilot data 2

In [94]:
len(pilot_data2)

193544

In [95]:
len(pilot_data2["author_id"].unique())

892

In [96]:
pilot_data2.drop_duplicates(subset=["author_id"])["PARTY"].value_counts()

PARTY
R    459
D    433
Name: count, dtype: int64

In [97]:
pilot_data2.drop_duplicates(subset=["author_id"])["INCUMBENT"].value_counts()

INCUMBENT
0.0    578
1.0    314
Name: count, dtype: int64

### Study data

In [98]:
len(study_data)

82598

In [99]:
study_data["AFTER_ELECTION"].value_counts()

AFTER_ELECTION
0    49530
1    33068
Name: count, dtype: int64

In [100]:
candidates = study_data[["handle", "PARTY", "INCUMBENT", "ELECTIONDATE", "id"]]\
    .groupby(["handle", "PARTY", "INCUMBENT", "ELECTIONDATE"])\
    .count()\
    .rename(columns={"id":"N_tweets"})\
    .reset_index()
candidates["ELECTIONDATE"] = pd.to_datetime(candidates["ELECTIONDATE"])
candidates = candidates.set_index(["handle", "ELECTIONDATE"])
candidates.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,PARTY,INCUMBENT,N_tweets
handle,ELECTIONDATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1haywood,2022-05-17,R,0.0,3
aarikarhodes,2022-06-07,D,0.0,95
aazamishervin,2022-06-07,D,0.0,154


In [101]:
len(candidates)

984

In [102]:
candidates["PARTY"].value_counts()

PARTY
R    520
D    464
Name: count, dtype: int64

In [103]:
candidates["INCUMBENT"].value_counts()

INCUMBENT
0.0    672
1.0    312
Name: count, dtype: int64

In [104]:
candidates["N_tweets_before"] = 0
candidates["N_tweets_after"] = 0
for handle, electiondate in candidates.index:
    # select only tweets of the candidate from the four weeks before and after 
    # the election date
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date()) & \
        (candidate_tweets["created_at"].dt.date >= electiondate.date() - pd.Timedelta(days=28))
    ]
    candidate_tweets_after = candidate_tweets[
        (candidate_tweets["created_at"].dt.date > electiondate.date()) & \
        (candidate_tweets["created_at"].dt.date <= electiondate.date() + pd.Timedelta(days=28))
    ]
    candidates.loc[(handle, electiondate), "N_tweets_before"] = len(candidate_tweets_before)
    candidates.loc[(handle, electiondate), "N_tweets_after"] = len(candidate_tweets_after)
candidates.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,PARTY,INCUMBENT,N_tweets,N_tweets_before,N_tweets_after
handle,ELECTIONDATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1haywood,2022-05-17,R,0.0,3,2,1
aarikarhodes,2022-06-07,D,0.0,95,73,22
aazamishervin,2022-06-07,D,0.0,154,118,36


In [105]:
candidates["N_tweets_before"].mean()

50.33536585365854

In [106]:
candidates["N_tweets_before"].std()

79.9270479255927

In [107]:
candidates["N_tweets_after"].mean()

33.60569105691057

In [108]:
candidates["N_tweets_after"].std()

68.16284886611133

In [109]:
candidates["N_tweets_before"].sum()

49530

In [110]:
candidates["N_tweets_after"].sum()

33068

In [111]:
candidates.reset_index()["ELECTIONDATE"].value_counts()

ELECTIONDATE
2022-06-07    162
2022-06-28    150
2022-05-17    112
2022-03-01    110
2022-08-02    101
2022-08-23     70
2022-05-24     66
2022-05-03     47
2022-08-09     47
2022-06-14     32
2022-06-21     28
2022-09-13     18
2022-08-04     12
2022-11-08      8
2022-08-16      8
2022-05-10      7
2022-09-06      6
Name: count, dtype: int64