In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Load data
trump = pd.read_pickle("Datasets/trump_sentiment.pkl")
biden = pd.read_pickle("Datasets/biden_sentiment.pkl")
electoral = pd.read_csv("Datasets/Electoral college results.csv")
popular = pd.read_csv("Datasets/Popular vote.csv")
users = pd.read_csv("Datasets/all_users.csv")

# Edit users table
users_state = users.loc[:,["user_id", "state_code"]]

# Edit popular table
popular = popular[["state","dem_votes","rep_votes","stateid"]]
popular["dem_votes"] = popular["dem_votes"].str.replace(",","").astype(float)
popular["rep_votes"] = popular["rep_votes"].str.replace(",","").astype(float)

In [9]:
# Methods
def find_vader_sentiment_trump(row):
    if row.vader_neutral > .95:
        return "neutral"
    elif row.vader_pos > row.vader_negative:
        return "pro_trump"
    else:
        return "anti_trump"
        
def find_vader_sentiment_biden(row):
    if row.vader_neutral > .95:
        return "neutral"
    elif row.vader_pos > row.vader_negative:
        return "pro_biden"
    else:
        return "anti_biden"
    
def find_absa_sentiment_trump(row):
    if row.absa_neutral > .75:
        return "neutral"
    elif row.absa_positive > row.absa_negative:
        return "pro_trump"
    else:
        return "anti_trump"
    
def find_absa_sentiment_biden(row):
    if row.absa_neutral > .75:
        return "neutral"
    elif row.absa_positive > row.absa_negative:
        return "pro_biden"
    else:
        return "anti_biden"
    
def find_aggregated_sentiment(row):
    if (row.sentiment_vader == "pro_biden") or (row.sentiment_vader == "anti_trump"):
        return "biden"
    elif (row.sentiment_vader == "pro_trump") or (row.sentiment_vader == "anti_biden"):
        return "trump"
    else:
        return "neutral"

In [4]:
# Deduce sentiment
trump["sentiment_vader"] = trump.apply(find_vader_sentiment_trump, axis=1)
biden["sentiment_vader"] = biden.apply(find_vader_sentiment_biden, axis=1)
trump["sentiment_absa"] = trump.apply(find_absa_sentiment_trump, axis=1)
biden["sentiment_absa"] = biden.apply(find_absa_sentiment_biden, axis=1)

trump["is_biden_tweet"] = False
biden["is_biden_tweet"] = True


total_trump_biden = pd.concat([trump,biden], axis=0)
relevant_columns = ["tweet_id", "likes", "retweet_count", "user_id",
                    "vader_neutral", "vader_negative", "vader_pos", "vader_compound",
                    "absa_neutral", "absa_negative", "absa_positive",
                    "sentiment_vader", "sentiment_absa",
                    "is_biden_tweet"]
total_trump_biden = total_trump_biden[relevant_columns]

In [7]:
def parse_total_group(grp):
    if len(grp) == 1:
        return grp.iloc[0,:]
    elif len(grp) == 2: # Two tweets, recorded twice because they have #biden and #trump
        output = grp.iloc[0,:].copy(deep=True)
        biden_tweet = grp[grp.is_biden_tweet == True]
        trump_tweet = grp[grp.is_biden_tweet == False]
        if biden_tweet.absa_positive.item() > trump_tweet.absa_positive.item():
            output.sentiment_ansa = "pro_biden"
        else:
            output.sentiment_absa = "pro_trump"
        return pd.Series(output)
    else:
        display("ERROR")
        return grp

# Reduce tweets that show up both as #biden and #trump into a single record
combined_same_tweets = total_trump_biden.groupby("tweet_id").apply(parse_total_group).reset_index(drop=True)
assert(len(combined_same_tweets) == len(total_trump_biden.drop_duplicates(subset="tweet_id")))
total_trump_biden = combined_same_tweets

single_occurence_trump_biden = total_trump_biden[total_trump_biden.tweet_id.duplicated()] # Single occurences
single_occurence_trump_biden.tweet_id.duplicated().sum() # Should be 0

0

In [10]:
#Find final sentiment
total_trump_biden["political_alignment"] = total_trump_biden.apply(find_aggregated_sentiment, axis=1)

display(trump.sentiment_vader.value_counts())
display(biden.sentiment_vader.value_counts())
display(trump.sentiment_absa.value_counts())
display(biden.sentiment_absa.value_counts())

display(total_trump_biden.sentiment_vader.value_counts())
display(total_trump_biden.sentiment_absa.value_counts())

display(total_trump_biden.political_alignment.value_counts())

pro_trump     33261
anti_trump    32831
neutral       20356
Name: sentiment_vader, dtype: int64

pro_biden     34024
anti_biden    19132
neutral       18481
Name: sentiment_vader, dtype: int64

pro_trump     39546
anti_trump    31681
neutral       15221
Name: sentiment_absa, dtype: int64

anti_biden    52193
pro_biden     11548
neutral        7896
Name: sentiment_absa, dtype: int64

neutral       34414
pro_trump     33261
anti_trump    32831
pro_biden     26989
anti_biden    14230
Name: sentiment_vader, dtype: int64

pro_trump     47729
anti_biden    39365
anti_trump    27082
neutral       17396
pro_biden     10153
Name: sentiment_absa, dtype: int64

biden      59820
trump      47491
neutral    34414
Name: political_alignment, dtype: int64

In [11]:
total_trump_biden

Unnamed: 0,tweet_id,likes,retweet_count,user_id,vader_neutral,vader_negative,vader_pos,vader_compound,absa_neutral,absa_negative,absa_positive,sentiment_vader,sentiment_absa,is_biden_tweet,political_alignment
0,1.316529e+18,2,1,8.436472e+06,0.934,0.000,0.066,0.5905,0.425745,0.565474,0.008781,pro_trump,anti_trump,False,trump
1,1.316529e+18,4,3,4.741380e+07,1.000,0.000,0.000,0.0000,0.335969,0.031120,0.632911,neutral,pro_trump,False,neutral
2,1.316529e+18,0,0,1.994033e+07,0.789,0.127,0.084,-0.4696,0.716640,0.272616,0.010743,anti_trump,anti_trump,False,biden
3,1.316529e+18,3,5,1.243315e+18,0.815,0.000,0.185,0.5267,0.556518,0.061273,0.382210,pro_trump,pro_trump,False,trump
4,1.316530e+18,1,1,8.436472e+06,0.858,0.000,0.142,0.7278,0.806175,0.147411,0.046414,pro_trump,neutral,False,trump
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141720,1.325588e+18,4,0,1.237559e+18,0.805,0.000,0.195,0.8805,0.772822,0.170843,0.056335,pro_biden,neutral,True,biden
141721,1.325588e+18,1,0,8.112988e+17,0.799,0.000,0.201,0.7003,0.689923,0.243870,0.066207,pro_biden,anti_biden,True,biden
141722,1.325588e+18,6,1,9.774065e+07,0.812,0.000,0.188,0.6249,0.242271,0.737315,0.020414,pro_trump,anti_trump,False,trump
141723,1.325589e+18,0,0,5.545625e+07,0.854,0.000,0.146,0.3400,0.425018,0.328660,0.246322,pro_biden,anti_biden,True,biden


In [12]:
users_state

Unnamed: 0,user_id,state_code
0,8.436472e+06,OR
1,4.741380e+07,DC
2,1.994033e+07,IL
3,1.243315e+18,CA
4,1.016593e+08,FL
...,...,...
68767,7.228644e+17,DC
68768,1.845746e+07,NY
68769,4.095715e+08,PA
68770,1.914600e+08,PA


In [13]:
# Remove duplicate entries from user_state table
# Num of users with multiple state_codes
display(users_state.drop_duplicates().user_id.duplicated().sum())

dropped = users_state.drop_duplicates() # Drop duplicated id/state
users_state = dropped[~dropped.user_id.duplicated()] # Keep first occurence of ID

17

In [14]:
sentiment_total = total_trump_biden.merge(users_state, how="inner", left_on="user_id", right_on="user_id")

# Create Output Datasets

In [19]:
# Pie chart of popular vote overall
popular_total = popular.loc[0,["state", "dem_votes", "rep_votes"]]
popular_total["dem_percent"] = popular_total["dem_votes"] / (popular_total["dem_votes"] + popular_total["rep_votes"])
popular_total["rep_percent"] = popular_total["rep_votes"] / (popular_total["dem_votes"] + popular_total["rep_votes"])
popular_total["margin"] = popular_total["dem_percent"] - popular_total["rep_percent"]
display(popular_total)

popular_total.to_json("OutputDatasets/popular_overall.json")

state          U.S. Total
dem_votes      81282916.0
rep_votes      74223369.0
dem_percent      0.522699
rep_percent      0.477301
margin           0.045397
Name: 0, dtype: object

In [94]:
# Data for popular vote overall
popular_state = popular.iloc[1:,:][~popular.state.str.endswith("District")].reset_index(drop=True)
popular_state["dem_percent"] = popular_state["dem_votes"] / (popular_state["dem_votes"] + popular_state["rep_votes"])
popular_state["rep_percent"] = popular_state["rep_votes"] / (popular_state["dem_votes"] + popular_state["rep_votes"])
popular_state["margin"] = popular_state["dem_percent"] - popular_state["rep_percent"]

popular_state = popular_state[["stateid", "dem_percent", "rep_percent", "margin"]]
popular_state = popular_state.sort_values(by="stateid")
display(popular_state)

popular_state.to_json("OutputDatasets/popular_state.json")

  popular_state = popular.iloc[1:,:][~popular.state.str.endswith("District")].reset_index(drop=True)


Unnamed: 0,stateid,dem_percent,rep_percent,margin
14,AK,0.447382,0.552618,-0.105237
13,AL,0.370886,0.629114,-0.258227
15,AR,0.357876,0.642124,-0.284249
0,AZ,0.501568,0.498432,0.003137
16,CA,0.649089,0.350911,0.298178
17,CO,0.569383,0.430617,0.138766
18,CT,0.60195,0.39805,0.203901
20,DC,0.94467,0.05533,0.889339
19,DE,0.596267,0.403733,0.192535
1,FL,0.483052,0.516948,-0.033895


In [95]:
def find_state_sentiment(grp):
    values = grp.political_alignment.value_counts()
    total = float(values["biden"] + values["trump"])
    biden_percent = float(values["biden"]) / total
    trump_percent = float(values["trump"]) / total
    return pd.Series({
        "biden_percent": biden_percent,
        "trump_percent": trump_percent,
        "margin": biden_percent - trump_percent
    })

In [96]:
sentiment_total =  sentiment_total[sentiment_total.state_code != "PR"]
sentiment_total.sort_values(by="state_code")

# Sentiment overall
sentiment_overall = find_state_sentiment(sentiment_total)
sentiment_overall.to_json("OutputDatasets/sentiment_overall.json")

# Sentiment by state
sentiment_by_state = sentiment_total.groupby("state_code").apply(find_state_sentiment).reset_index()
sentiment_by_state.to_json("OutputDatasets/sentiment_state.json")

In [124]:
# Check that they are ordered the same
display((popular_state.stateid.values == sentiment_by_state.state_code.values).all())

# Create errors
errors = (sentiment_by_state.margin - popular_state.margin).abs()
sentiment_popular_errors = sentiment_by_state[["state_code"]].copy(deep=True)
sentiment_popular_errors["error"] = errors
sentiment_popular_errors.to_json("OutputDatasets/error_sentiment_popular.json")

True