In [2]:
import pandas as pd
import numpy as np
from scipy.stats import linregress
from pathlib import Path

# Split the Twitter data into pilot data and study data

## Load data sets

In [3]:
src = "../data"
fname = "KnowWho_profiles_merged_twitter.csv"
knowwho_profiles = pd.read_csv(
    Path(src, fname), 
    dtype={"author_id":str},
    parse_dates=["created_at", "ELECTIONDATE"]
)

In [4]:
fname = "primaries_for_margin_research_RW_JL.csv"
margins = pd.read_csv(Path(src, fname), parse_dates=["ELECTIONDATE"], dayfirst=True)
margins = margins.drop(columns="COMPLETE")
margins = margins.dropna(subset=["handle"])
margins = margins.set_index(["handle", "ELECTIONDATE"])
margins.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,PARTY,STATEABBR,RACE,DISTRICT,VOTE_SHARE,NOTES,NONPARTISAN_PRIMARY,DISTRICT_CORR,STATE_CORR,RUNOFF_PERC,RANKED_CHOICE_VOTING
handle,ELECTIONDATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
carlforalabama,2022-05-24,Jerry L. Carl,R,AL,US Representative,1.0,,C,,,,,
terellanderson,2022-05-24,Reginald Terell Anderson,D,AL,US Representative,2.0,,WD,,,,,
phyllisdhhall,2022-05-24,Phyllis Harvey-Hall,D,AL,US Representative,2.0,68.8,,,,,,


In [5]:
fname = "combined_midterm_candidate_timelines_2022-01-01_to_2023-05-01_clean.csv.gzip"
cols = ["id", "author_id", "created_at", "expanded_urls", "retweeted", "quoted",
        "reply", "text", "retweet_count", "reply_count", "like_count",
        "quote_count"]
tweets = pd.read_csv(
    Path(src, fname),
    dtype={"id":str, "author_id":str},
    parse_dates=["created_at"],
    compression="gzip",
    usecols=cols
)

## Data wrangling

In [6]:
# retain only candidates that ran
knowwho_profiles = knowwho_profiles[knowwho_profiles["STATUS"] == "Running"]\
    .reset_index(drop=True)
# retain only candidates that have a Twitter profile
knowwho_profiles = knowwho_profiles.dropna(subset=["handle"])
# drop unnecessary columns
knowwho_profiles = knowwho_profiles.drop(columns=["STATUS", "ELECTIONCODE"])
len(knowwho_profiles)

1204

In [7]:
# incumbents are coded as "Y" if candidate is an incumbent and as NA if not
# transform this into a binary coding
knowwho_profiles["INCUMBENT"] = knowwho_profiles["INCUMBENT"].fillna(0)
knowwho_profiles["INCUMBENT"] = knowwho_profiles["INCUMBENT"].replace({"Y":1})

  knowwho_profiles["INCUMBENT"] = knowwho_profiles["INCUMBENT"].replace({"Y":1})


In [8]:
# double index is necessary since one candidate ran in two elections that are
# differentiated by their dates
knowwho_profiles = knowwho_profiles.set_index(["handle", "ELECTIONDATE"])
knowwho_profiles = knowwho_profiles.sort_index()

In [9]:
# add ranked-choice voting entries, notes, non-partisan election entries,
# percentages from runoff votes, and vote shares
knowwho_profiles["RANKED_CHOICE_VOTING"] = np.nan
knowwho_profiles["NOTES"] = np.nan
knowwho_profiles["NOTES"] = knowwho_profiles["NOTES"].astype(object)
knowwho_profiles["NONPARTISAN_PRIMARY"] = np.nan
knowwho_profiles["RUNOFF_PERC"] = np.nan
knowwho_profiles["VOTE_SHARE"] = np.nan

for handle, electiondate in knowwho_profiles.index:
    knowwho_profiles.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"] = \
        margins.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"].values[0]
    knowwho_profiles.loc[(handle, electiondate), "NOTES"] = \
        margins.loc[(handle, electiondate), "NOTES"].values[0]
    knowwho_profiles.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"] = \
        margins.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"].values[0]
    knowwho_profiles.loc[(handle, electiondate), "VOTE_SHARE"] = \
        margins.loc[(handle, electiondate), "VOTE_SHARE"].values[0]

  margins.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"].values[0]
  margins.loc[(handle, electiondate), "NOTES"].values[0]
  margins.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"].values[0]
  margins.loc[(handle, electiondate), "VOTE_SHARE"].values[0]
  margins.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"].values[0]
  margins.loc[(handle, electiondate), "NOTES"].values[0]
  margins.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"].values[0]
  margins.loc[(handle, electiondate), "VOTE_SHARE"].values[0]
  margins.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"].values[0]
  margins.loc[(handle, electiondate), "NOTES"].values[0]
  margins.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"].values[0]
  margins.loc[(handle, electiondate), "VOTE_SHARE"].values[0]
  margins.loc[(handle, electiondate), "RANKED_CHOICE_VOTING"].values[0]
  margins.loc[(handle, electiondate), "NOTES"].values[0]
  margins.loc[(handle, electiondate), "NONPARTISAN_PRIMARY"].values[0]
  margins.loc[

In [10]:
# add Twitter profile handle back to data frame
fname = "candidate_twitter_profiles.csv"
cols = ["author_id", "handle"]
users = pd.read_csv(
    Path(src, fname),
    dtype={"author_id":str},
    usecols=cols
)
tweets = pd.merge(
    tweets,
    users,
    how="left",
    left_on="author_id",
    right_on="author_id"
)

In [11]:
# add incumbent and party information to the tweet data
cols = ["author_id", "INCUMBENT", "PARTY", "VOTE_SHARE", "RANKED_CHOICE_VOTING"]
tweets = pd.merge(
    tweets,
    knowwho_profiles[cols],
    how="left",
    left_on="author_id",
    right_on="author_id"
)

In [12]:
# add information about whether a tweet happened up to 16 months before an 
# election (pilot data set 1) or at least 16 weeks after an election (pilot 
# data set 2). Tweets that are neither are # part of the study data set
tweets["AFTER_ELECTION"] = np.nan
tweets = tweets.set_index("id")
i = 0
for handle, electiondate in knowwho_profiles.index:
    if i%100 == 0:
        print(f"{i}/{len(knowwho_profiles)}")
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before_index = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date() - pd.Timedelta(days=16 * 7))
    ].index
    candidate_tweets_after_index = candidate_tweets[
        (candidate_tweets["created_at"].dt.date >= electiondate.date() + pd.Timedelta(days=16 * 7))
    ].index

    tweets.loc[candidate_tweets_before_index, "AFTER_ELECTION"] = 0
    tweets.loc[candidate_tweets_after_index, "AFTER_ELECTION"] = 1
    i += 1

0/1204
100/1204
200/1204
300/1204
400/1204
500/1204
600/1204
700/1204
800/1204
900/1204
1000/1204
1100/1204
1200/1204


In [13]:
tweets["AFTER_ELECTION"].value_counts()

AFTER_ELECTION
1.0    308122
0.0    118987
Name: count, dtype: int64

In [14]:
# calculate the tweet lenght - needed later to correct the belief-speaking and
# fact-speaking scores for length bias
tweets["tweet_length"] = tweets["text"].apply(lambda x: len(x))

# remove tweets with a length <= 20 characters as they don't get an honesty score
tweets = tweets[tweets["tweet_length"] > 20]

## Create pilot data sets

In [15]:
# extract tweets from at least 16 weeks before or after election dates
pilot_data1 = pd.DataFrame() # data from before elections
pilot_data2 = pd.DataFrame() # data from after elections

i = 0
for handle, electiondate in knowwho_profiles.index:
    if i%100 == 0:
        print(f"{i}/{len(knowwho_profiles)}")
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date() - pd.Timedelta(days=16 * 7))
    ].copy()
    candidate_tweets_after = candidate_tweets[
        (candidate_tweets["created_at"].dt.date >= electiondate.date() + pd.Timedelta(days=16 * 7))
    ].copy()
    pilot_data1 = pd.concat([pilot_data1, candidate_tweets_before])
    pilot_data2 = pd.concat([pilot_data2, candidate_tweets_after])
    i += 1

pilot_data1 = pilot_data1.reset_index()
pilot_data2 = pilot_data2.reset_index()

0/1204
100/1204
200/1204
300/1204
400/1204
500/1204
600/1204
700/1204
800/1204
900/1204
1000/1204
1100/1204
1200/1204


In [16]:
pilot_data1.head(2)

Unnamed: 0,id,author_id,created_at,expanded_urls,retweeted,quoted,reply,text,retweet_count,reply_count,like_count,quote_count,handle,INCUMBENT,PARTY,VOTE_SHARE,RANKED_CHOICE_VOTING,AFTER_ELECTION,tweet_length
0,1485665816934686724,434343925,2022-01-24 17:28:27+00:00,['https://www.instagram.com/p/CZHwc_GOTEw/?utm...,False,False,False,A full and fun day with fellow believers of me...,0,1,1,0,1haywood,0.0,R,33.7,,0.0,166
1,1478952190320734209,434343925,2022-01-06 04:50:54+00:00,['https://www.instagram.com/p/CYYDcy6sVjB/?utm...,False,False,False,Great morning spent at Richard Childress Racin...,0,0,0,0,1haywood,0.0,R,33.7,,0.0,128


## Create study data set

In [17]:
study_data = pd.DataFrame()
i = 0
for handle, electiondate in knowwho_profiles.index:
    if i%100 == 0:
        print(f"{i}/{len(knowwho_profiles)}")
    # select only tweets of the candidate from the four weeks before and after 
    # the election date
    candidate_tweets = tweets[tweets["handle"] == handle]
    candidate_tweets_before = candidate_tweets[
        (candidate_tweets["created_at"].dt.date <= electiondate.date()) & \
        (candidate_tweets["created_at"].dt.date >= electiondate.date() - pd.Timedelta(days=28))
    ].copy()
    candidate_tweets_after = candidate_tweets[
        (candidate_tweets["created_at"].dt.date > electiondate.date()) & \
        (candidate_tweets["created_at"].dt.date <= electiondate.date() + pd.Timedelta(days=28))
    ].copy()

    candidate_tweets_before["AFTER_ELECTION"] = 0
    candidate_tweets_after["AFTER_ELECTION"] = 1
    study_data = pd.concat([study_data, candidate_tweets_before, candidate_tweets_after])
    i = i + 1

study_data = study_data.reset_index()

0/1204
100/1204
200/1204
300/1204
400/1204
500/1204
600/1204
700/1204
800/1204
900/1204
1000/1204
1100/1204
1200/1204


In [18]:
study_data.head(2)

Unnamed: 0,id,author_id,created_at,expanded_urls,retweeted,quoted,reply,text,retweet_count,reply_count,like_count,quote_count,handle,INCUMBENT,PARTY,VOTE_SHARE,RANKED_CHOICE_VOTING,AFTER_ELECTION,tweet_length
0,1524220346110337024,434343925,2022-05-11 02:50:23+00:00,['https://www.instagram.com/p/CdZtBgJuvk3/?igs...,False,False,False,Enjoyed Attending Michelle Bardsley’s campaign...,0,0,0,0,1haywood,0.0,R,33.7,,0,172
1,1516921218540179457,434343925,2022-04-20 23:26:16+00:00,[],False,False,False,If I took a shot every time “Ted Budd” has bee...,0,3,7,0,1haywood,0.0,R,33.7,,0,125


## Save raw data sets for honesty score calculation

In [19]:
dst = "../data"

fname = "pilot_data1_text.csv.gzip"
pilot_data1[["text", "id"]].to_csv(Path(dst, fname), compression="gzip", index=False)

fname = "pilot_data2_text.csv.gzip"
pilot_data2[["text", "id"]].to_csv(Path(dst, fname), compression="gzip", index=False)

fname = "study_data_text.csv.gzip"
study_data[["text", "id"]].to_csv(Path(dst, fname), compression="gzip", index=False)

## Save study data set for power analysis

In [20]:
fname = "study_data.csv.gzip"
study_data.drop(columns=["text"]).to_csv(Path(dst, fname), compression="gzip", index=False)

## Add belief-speaking and fact-speaking scores

In [92]:
# note: this needs the sentence embedding transformer model under 
# ../data/utilities/sentence-transformers
! sh label_glove840B_DDR.sh

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["text"].replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['id'].str.replace('"', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

Note: for this to work, the following files need to have been created by the script **label_glove840B_DDR.sh**:
* pilot_data1_honesty_component_scores.csv.gzip
* pilot_data2_honesty_component_scores.csv.gzip

In [93]:
# load the embedding scores for belief-speaking and truth-seeking for the pilot
# data sets
fname = "pilot_data1_honesty_component_scores.csv.gzip"
honesty_scores_pilot1 = pd.read_csv(
    Path(src, fname),
    dtype={"id":str}, 
    compression="gzip"
).rename(columns={"avg_fact_score":"avg_fact_score_raw", "avg_belief_score":"avg_belief_score_raw"})

fname = "pilot_data2_honesty_component_scores.csv.gzip"
honesty_scores_pilot2 = pd.read_csv(
    Path(src, fname),
    dtype={"id":str}, 
    compression="gzip"
).rename(columns={"avg_fact_score":"avg_fact_score_raw", "avg_belief_score":"avg_belief_score_raw"})

# TODO
# honesty scores for study data

In [94]:
honesty_scores_pilot1.head(3)

Unnamed: 0,id,avg_fact_score_raw,avg_belief_score_raw
0,1485665816934686724,0.649483,0.688581
1,1478952190320734209,0.472821,0.551429
2,1481040751987290115,0.562164,0.645549


In [95]:
# add honesty scores to the pilot data sets
pilot_data1 = pd.merge(
    pilot_data1,
    honesty_scores_pilot1,
    how="left",
    left_on="id",
    right_on="id"
)
pilot_data2 = pd.merge(
    pilot_data2,
    honesty_scores_pilot2,
    how="left",
    left_on="id",
    right_on="id"
)

# TODO
# merge honesty scores with study data

# tweets with <20 characters of length don't get an honesty score and are
# dropped
pilot_data1 = pilot_data1.dropna(subset=["avg_belief_score_raw"])
pilot_data2 = pilot_data2.dropna(subset=["avg_belief_score_raw"])
# TODO: study data

In [96]:
# correct the similarity scores for tweet-length effects
def predict_belief_similarity(tweet_length, slope, intercept):
    return intercept + slope * tweet_length

def predict_fact_similarity(tweet_length, slope, intercept):
    return intercept + slope * tweet_length
    
slope_belief_pilot1, intercept_belief_pilot1, rval_belief_pilot1, \
    pval_belief_pilot1, stderr_belief_pilot1 = \
    linregress(pilot_data1["tweet_length"], pilot_data1["avg_belief_score_raw"])
print(f"pilot data 1: belief-speaking slope: {slope_belief_pilot1}, intercept: {intercept_belief_pilot1}")

slope_belief_pilot2, intercept_belief_pilot2, rval_belief_pilot2, \
    pval_belief_pilot2, stderr_belief_pilot2 = \
    linregress(pilot_data2["tweet_length"], pilot_data2["avg_belief_score_raw"])
print(f"pilot data 2: belief-speaking slope: {slope_belief_pilot2}, intercept: {intercept_belief_pilot2}")

# TODO
# fit belief-speaking for study data

slope_fact_pilot1, intercept_fact_pilot1, rval_fact_pilot1, \
    pval_fact_pilot1, stderr_fact_pilot1 = \
    linregress(pilot_data1["tweet_length"], pilot_data1["avg_fact_score_raw"])
print(f"pilot data 1: fact-seeking slope: {slope_fact_pilot1}, intercept: {intercept_fact_pilot1}")

slope_fact_pilot2, intercept_fact_pilot2, rval_fact_pilot2, \
    pval_fact_pilot2, stderr_fact_pilot2 = \
    linregress(pilot_data2["tweet_length"], pilot_data2["avg_fact_score_raw"])
print(f"pilot data 2: fact-seeking slope: {slope_fact_pilot2}, intercept: {intercept_fact_pilot2}")

# TODO
# fit fact-speaking for study data

pilot_data1["avg_belief_score"] = pilot_data1\
    .apply(lambda x: x["avg_belief_score_raw"] - \
           predict_belief_similarity(x["tweet_length"], slope_belief_pilot1, intercept_belief_pilot1), axis=1)
pilot_data2["avg_belief_score"] = pilot_data2\
    .apply(lambda x: x["avg_belief_score_raw"] - \
           predict_belief_similarity(x["tweet_length"], slope_belief_pilot2, intercept_belief_pilot2), axis=1)
# TODO
# add lenght-corrected belief-speaking scores to study data

pilot_data1["avg_fact_score"] = pilot_data1\
    .apply(lambda x: x["avg_fact_score_raw"] - \
           predict_fact_similarity(x["tweet_length"], slope_fact_pilot1, intercept_fact_pilot1), axis=1)
pilot_data2["avg_fact_score"] = pilot_data2\
    .apply(lambda x: x["avg_fact_score_raw"] - \
           predict_fact_similarity(x["tweet_length"], slope_fact_pilot2, intercept_fact_pilot2), axis=1)

# TODO
# add lenght-corrected fact-speaking scores to study data

pilot data 1: belief-speaking slope: 0.00019505237788996623, intercept: 0.6910776636787734
pilot data 2: belief-speaking slope: 0.00020559748726839292, intercept: 0.6883867624218233
pilot data 1: fact-seeking slope: 0.00029827065921299826, intercept: 0.5858255719929714
pilot data 2: fact-seeking slope: 0.00031189888386098556, intercept: 0.5771759127941147


In [97]:
pilot_data1.head(2)

Unnamed: 0,id,author_id,created_at,expanded_urls,retweeted,quoted,reply,text,retweet_count,reply_count,...,INCUMBENT,PARTY,VOTE_SHARE,RANKED_CHOICE_VOTING,AFTER_ELECTION,tweet_length,avg_fact_score_raw,avg_belief_score_raw,avg_belief_score,avg_fact_score
0,1485665816934686724,434343925,2022-01-24 17:28:27+00:00,['https://www.instagram.com/p/CZHwc_GOTEw/?utm...,False,False,False,A full and fun day with fellow believers of me...,0,1,...,0.0,R,33.7,,0.0,166,0.649483,0.688581,-0.034875,0.014145
1,1478952190320734209,434343925,2022-01-06 04:50:54+00:00,['https://www.instagram.com/p/CYYDcy6sVjB/?utm...,False,False,False,Great morning spent at Richard Childress Racin...,0,0,...,0.0,R,33.7,,0.0,128,0.472821,0.551429,-0.164615,-0.151183


## Save processed data sets

In [98]:
dst = "../data"

fname = "pilot_data1.csv.gzip"
# remove tweet texts before saving for data sharing
pilot_data1\
    .drop(columns=["text"])\
    .to_csv(Path(dst, fname), compression="gzip", index=False)

fname = "pilot_data2.csv.gzip"
pilot_data2\
    .drop(columns=["text"])\
    .to_csv(Path(dst, fname), compression="gzip", index=False)

# TODO
# save processed study data

## Descriptive statistics

### Pilot data 1

In [99]:
fname = "pilot_data1.csv.gzip"
pilot_data1 = pd.read_csv(
    Path(dst, fname), 
    compression="gzip",
    dtype={"id":str, "author_id":str},
    parse_dates=["created_at"]
)

In [100]:
len(pilot_data1)

83233

In [101]:
len(pilot_data1[pilot_data1["tweet_length"] > 20])

83233

In [102]:
len(pilot_data1)

83233

In [103]:
len(pilot_data1["author_id"].unique())

850

In [104]:
pilot_data1.drop_duplicates(subset=["author_id"])["PARTY"].value_counts()

PARTY
R    436
D    414
Name: count, dtype: int64

In [105]:
pilot_data1.drop_duplicates(subset=["author_id"])["INCUMBENT"].value_counts()

INCUMBENT
0.0    594
1.0    256
Name: count, dtype: int64

### Pilot data 2

In [106]:
len(pilot_data2)

194774

In [107]:
len(pilot_data2["author_id"].unique())

893

In [108]:
pilot_data2.drop_duplicates(subset=["author_id"])["PARTY"].value_counts()

PARTY
R    459
D    434
Name: count, dtype: int64

In [109]:
pilot_data2.drop_duplicates(subset=["author_id"])["INCUMBENT"].value_counts()

INCUMBENT
0.0    579
1.0    314
Name: count, dtype: int64

### Study data

In [110]:
len(study_data)

111145

In [111]:
len(study_data["author_id"].unique())

999

In [112]:
study_data.drop_duplicates(subset=["author_id"])["PARTY"].value_counts()

PARTY
R    531
D    468
Name: count, dtype: int64

In [113]:
study_data.drop_duplicates(subset=["author_id"])["INCUMBENT"].value_counts()

INCUMBENT
0.0    685
1.0    314
Name: count, dtype: int64

In [114]:
study_data["AFTER_ELECTION"].value_counts()

AFTER_ELECTION
0    65795
1    45350
Name: count, dtype: int64

In [122]:
author_tweet_count_before = study_data[study_data["AFTER_ELECTION"] == 0][["id", "author_id"]]\
    .groupby(["author_id"])\
    .count()
author_tweet_count_after = study_data[study_data["AFTER_ELECTION"] == 1][["id", "author_id"]]\
    .groupby(["author_id"])\
    .count()

In [123]:
author_tweet_count_before["id"].mean()

68.18134715025907

In [124]:
author_tweet_count_before["id"].std()

121.36496032059249

In [125]:
author_tweet_count_after["id"].mean()

50.89786756453423

In [126]:
author_tweet_count_after["id"].std()

112.7505136187428