# Analysis by political views

In [1]:
import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import dask
import dask_mongo
import dask.dataframe as dd

from langcodes import Language

In [2]:
sns.set_theme(style='darkgrid')

## Reading dataset in

In [3]:
from IPython.display import display
from dask.diagnostics import ProgressBar

### Users

In [4]:
users = dd.read_parquet('../parquet/users-full.parquet', engine='pyarrow')

In [5]:
users.columns

Index(['id', 'name', 'username', 'location', 'description', 'verified',
       'followers', 'followings', 'listed', 'tweets_liked', 'tweets',
       'created_at'],
      dtype='object')

In [6]:
users.dtypes

id                            int64
name                         object
username                     object
location                     object
description                  object
verified                       bool
followers                     int64
followings                    int64
listed                        int64
tweets_liked                  int64
tweets                        int64
created_at      datetime64[ns, UTC]
dtype: object

In [7]:
users.head(5)

Unnamed: 0,id,name,username,location,description,verified,followers,followings,listed,tweets_liked,tweets,created_at
0,1237027988287471618,Palestina Internacional Broadcast,pbi_es,Palestina,Palestina Internacional Broadcast es la primer...,False,8524,130,40,694,9383,2020-03-09 14:50:40+00:00
1,1407899290790473732,Section Chine,SectionChine,,"Infos sans préjugés sur la Chine, avec une dom...",False,2107,2137,3,8243,457,2021-06-24 06:06:32+00:00
2,6509832,News18,CNNnews18,India,Official Twitter account of CNN-News18. Lightn...,True,4741218,397,6979,202,1028336,2007-06-01 20:31:01+00:00
3,1570222426570608641,Amazing Metaverse,AmazingMeta,Los Angeles,We are Amazing Metaverse! We are venturing int...,False,90,489,0,669,1180,2022-09-15 01:26:48+00:00
4,216893584,Periódico La Visión,lavisionatl,Atlanta Georgia,Empieza tu día informado con Periódico La Visi...,False,3167,949,70,2134,55632,2010-11-18 00:38:14+00:00


In [5]:
def load_checkpoint(directory: str = '../parquet') -> tuple[set[str], set[str]]:
    """Load the checkpointed user-id lists for both parties.

    Reads ``republican.txt`` and ``democrat.txt`` from ``directory``; each file
    holds one user id per line.

    Args:
        directory: Folder that contains the two checkpoint files.

    Returns:
        ``(republican, democrat)`` - two sets of user ids. The ids are kept as
        strings, exactly as read from the files with surrounding whitespace
        removed, so they must be compared against string ids.
    """
    def read_ids(filename: str) -> set[str]:
        with open(f'{directory}/{filename}', 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            # Skip blank lines so an empty file gives an empty set, not {''}.
            return {id_ for id_ in stripped if id_}

    return read_ids('republican.txt'), read_ids('democrat.txt')

In [6]:
republican, democrat = load_checkpoint()

### Tweets

In [7]:
tweets = dd.read_parquet('../parquet/tweets.parquet', engine='pyarrow')

In [12]:
tweets.columns

Index(['userid', 'tweetid', 'tweetcreatedts', 'retweetcount', 'text',
       'hashtags', 'language', 'coordinates', 'favorite_count', 'is_retweet',
       'original_tweet_id', 'original_tweet_userid', 'original_tweet_username',
       'in_reply_to_status_id', 'in_reply_to_user_id',
       'in_reply_to_screen_name', 'is_quote_status', 'quoted_status_id',
       'quoted_status_userid', 'quoted_status_username', 'extractedts'],
      dtype='object')

In [13]:
tweets.dtypes

userid                             object
tweetid                            object
tweetcreatedts             datetime64[ns]
retweetcount                        int32
text                               object
hashtags                           object
language                           object
coordinates                        object
favorite_count                      int32
is_retweet                           bool
original_tweet_id                  object
original_tweet_userid              object
original_tweet_username            object
in_reply_to_status_id              object
in_reply_to_user_id                object
in_reply_to_screen_name            object
is_quote_status                      bool
quoted_status_id                   object
quoted_status_userid               object
quoted_status_username             object
extractedts                datetime64[ns]
dtype: object

In [16]:
# Row count of the lazy tweets frame; the recorded run below took ~217 s.
with ProgressBar(dt=2.0):
    print(tweets.shape[0].compute())

[########################################] | 100% Completed | 217.35 s
132262426


In [14]:
tweets.head(5)

Unnamed: 0,userid,tweetid,tweetcreatedts,retweetcount,text,hashtags,language,coordinates,favorite_count,is_retweet,...,original_tweet_userid,original_tweet_username,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,is_quote_status,quoted_status_id,quoted_status_userid,quoted_status_username,extractedts
0,1237027988287471618,1579622809680035841,2022-10-11,11,Después de profanar la Mezquita de Ibrahim en ...,"[{'text': 'palestina', 'indices': [142, 152]},...",es,,13,True,...,0,,0,0,,True,0,0,,2022-10-11 09:57:02
1,1407899290790473732,1579622809847808000,2022-10-11,1,La voix de son maître 🇺🇸\nAlors que l'#Ukraine...,"[{'text': 'Ukraine', 'indices': [37, 45]}, {'t...",fr,,0,True,...,0,,0,0,,True,0,0,,2022-10-11 02:31:18
2,6509832,1579622810560786432,2022-10-11,1,#WarInUkraine | Putin Confirms Russia Attacked...,"[{'text': 'WarInUkraine', 'indices': [0, 13]},...",en,,5,True,...,0,,0,0,,True,0,0,,2022-10-11 04:37:34
3,1570222426570608641,1579622810619723776,2022-10-11,1,We made a set of heart-shaped pinatas with the...,"[{'text': 'WeStandWithUkraine', 'indices': [16...",en,,4,True,...,0,,0,0,,True,0,0,,2022-10-11 05:41:38
4,20918680,1579622810774749184,2022-10-11,0,Have you listened to @AmandaMcBroom1 &amp; @Mi...,"[{'text': 'GodofWar', 'indices': [80, 89]}, {'...",en,,0,True,...,0,,0,0,,True,0,0,,2022-10-11 02:31:18


## Rewriting data into separate Parquet files for better performance

### Users

We can hold this in memory, as it is quite small ;)

In [8]:
%%time
republican_users = users[users['id'].astype(str).isin(republican)].compute()
democrat_users = users[users['id'].astype(str).isin(democrat)].compute()

CPU times: user 23.2 s, sys: 1.61 s, total: 24.8 s
Wall time: 20.5 s


In [9]:
# Tag every user with the party whose id list it was selected by.
republican_users = republican_users.assign(party='republican')
democrat_users = democrat_users.assign(party='democrat')

In [10]:
rd_users = pd.concat([republican_users, democrat_users])

### Tweets

In [20]:
# Lazy per-party subsets of the tweets, selected by the author's user id.
tweet_authors = tweets['userid']
rtweets = tweets[tweet_authors.isin(republican)]
dtweets = tweets[tweet_authors.isin(democrat)]

In [21]:
with ProgressBar(dt=2.0):
    # Evaluate both row counts in one dask.compute call so they are computed
    # together rather than in two separate runs over the tweets.
    republican_count, democrat_count = dask.compute(rtweets.shape[0], dtweets.shape[0])

print('republican tweets:', republican_count)
print('democrat tweets:', democrat_count)

[########################################] | 100% Completed | 234.30 s
republican tweets: 199178
[########################################] | 100% Completed | 235.30 s
democrat tweets: 729076


In [24]:
# Persist the republican subset as its own Parquet dataset so later cells can
# read it directly (written without compression, overwriting existing output).
with ProgressBar(dt=2.0):
    rtweets.to_parquet(
        '../parquet/republican-tweets.parquet',
        engine='pyarrow',
        compression=None,
        overwrite=True,
    )

[########################################] | 100% Completed | 242.99 s


In [25]:
# Persist the democrat subset as its own Parquet dataset so later cells can
# read it directly (written without compression, overwriting existing output).
with ProgressBar(dt=2.0):
    dtweets.to_parquet(
        '../parquet/democrat-tweets.parquet',
        engine='pyarrow',
        compression=None,
        overwrite=True,
    )

[########################################] | 100% Completed | 246.92 s


In [26]:
rtweets = dd.read_parquet('../parquet/republican-tweets.parquet', engine='pyarrow')

In [27]:
dtweets = dd.read_parquet('../parquet/democrat-tweets.parquet', engine='pyarrow')

## Overall Analysis

In [43]:
import plotly.express as px

In [113]:
# Hex colour per party, passed as color_discrete_map to the histograms.
color_scheme = dict(republican='#ee2400', democrat='#1c94ed')

### Tweets Count by Date distribution

In [83]:
%%time
republican_date_distribution = rtweets['tweetcreatedts'].dt.date.value_counts().compute()

CPU times: user 2.96 s, sys: 239 ms, total: 3.2 s
Wall time: 2.59 s


In [84]:
%%time
democrat_date_distribution = dtweets['tweetcreatedts'].dt.date.value_counts().compute()

CPU times: user 3.19 s, sys: 243 ms, total: 3.43 s
Wall time: 2.79 s


In [85]:
# Long-format table with one row per (party, date): republican rows first,
# then democrat rows; 'data' holds the tweet count for that date.
date_distribution = pd.concat(
    [
        pd.DataFrame({
            'party': [party] * counts.shape[0],
            'date': counts.index.values,
            'data': counts.values,
        })
        for party, counts in (
            ('republican', republican_date_distribution),
            ('democrat', democrat_date_distribution),
        )
    ],
    ignore_index=True,
)

In [86]:
# Tweets per date, coloured by party. nbins is set to the number of distinct
# dates found in the democrat tweets.
px.histogram(
    date_distribution, x='date', y='data', color='party',
    nbins=democrat_date_distribution.shape[0],
    color_discrete_map=color_scheme
)

As we can see, democrats are more active overall. Besides that, there are several common spikes in activity. Let's check some of them.

* **July 11th, 2022**
    * is connected with the scandal around Hunter Biden; it is somewhat odd that these tweets ended up in our dataset
* **November 15-16th, 2022**
    * missile strike on the territory of Poland
* **December 21-22nd, 2022**
    * Zelenskyy's visit to the USA
* **February 25-27th, 2023**
    * seems to be connected with the one year anniversary of full-scale invasion
* **March 18th, 2023**
    * arrest warrant against Putin

### Tweet Count per User Distribution

In [115]:
px.histogram(rd_users, x='tweets', color='party', color_discrete_map=color_scheme)

In [125]:
# Same histogram as above with the x axis limited to 0-100k tweets per user.
px.histogram(
    rd_users, x='tweets', color='party',
    color_discrete_map=color_scheme, range_x=[0, 100_000],
    nbins=4_000
)

Apart from several outliers with over 200k tweets, democrats disproportionately outweigh republicans by number of tweets.

### Followers Count per User Distribution

In [126]:
px.histogram(rd_users, x='followers', color='party', color_discrete_map=color_scheme)

In [128]:
# Zoomed view of the follower-count distribution (x axis limited to 0-50k).
# The x column is 'followers', matching this section's topic.
px.histogram(
    rd_users, x='followers', color='party',
    color_discrete_map=color_scheme, range_x=[0, 50_000],
    nbins=8_000
)

We can see that this graph is even steeper, and we still see a disproportionate number of followers in relation to political party.

### User Activity

Here we use the number of liked tweets to evaluate the user's activity in the social network

In [130]:
px.histogram(rd_users, x='tweets_liked', color='party', color_discrete_map=color_scheme)

In [134]:
# Same histogram as above with the x axis limited to 0-100k liked tweets.
px.histogram(
    rd_users, x='tweets_liked', color='party',
    color_discrete_map=color_scheme, range_x=[0, 100_000],
    nbins=10_000
)

The graph is similar to the previous ones, although the gap seems to be smaller than before. This may indicate that there are more bloggers among democrats who do not take an active part in discussions and therefore show less activity. Among republicans, on the other hand, there are likely many ordinary people who consume content rather than produce it.

Still, the gap is too large to state that for sure, but the difference between this graph and the previous ones is quite exciting :)

### Users Location

In [22]:
from typing import Optional
import urllib.parse
import requests
import tqdm
import time
import io

In [162]:
# does not seem to work well for a lot of requests :(
def locate(location: str, timeout: float = 10.0) -> Optional[dict]:
    """Geocode a free-text location via the geocode.maps.co search endpoint.

    Args:
        location: Free-text location string; surrounding whitespace is ignored.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up and raises.

    Returns:
        The first search result, or ``None`` when the location is empty,
        is just a name for the United States, or yields no results.

    Raises:
        Exception: Whatever ``response.json()`` raises when the body cannot be
            decoded; the raw body is printed first to help debugging.
    """
    # Bare country names are skipped without issuing a request.
    country_only = {'united states', 'usa', 'us', 'the united states'}
    query = location.strip() if location else ''
    if not query or query.lower() in country_only:
        return None

    response = requests.get(
        'https://geocode.maps.co/search', params={'q': query}, timeout=timeout
    )
    try:
        results = response.json()
    except Exception:
        print(response.content)
        raise

    if not results:
        return None

    return results[0]

In [11]:
rd_users['location'].value_counts()

location
                               6893
United States                  1040
USA                             576
Florida, USA                    398
California, USA                 361
                               ... 
THE GREAT STATE OF FLORIDA        1
Rural America                     1
jupiter Florida                   1
Progressive Dystopia of Ca.       1
Hel                               1
Name: count, Length: 7690, dtype: int64

In [154]:
rd_users['location'].unique()

array(['Global', 'New York, USA', '', ..., 'Madison WI',
       '4th greatest world economy CA', 'Hel'], dtype=object)

In [23]:
states = {"AL": "Alabama", "AK": "Alaska", "AS": "American Samoa", "AZ": "Arizona", "AR": "Arkansas", "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "DC": "District Of Columbia", "FM": "Federated States Of Micronesia", "FL": "Florida", "GA": "Georgia", "GU": "Guam", "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MH": "Marshall Islands", "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "MP": "Northern Mariana Islands", "OH": "Ohio", "OK": "Oklahoma", "OR": "Oregon", "PW": "Palau", "PA": "Pennsylvania", "PR": "Puerto Rico", "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VI": "Virgin Islands", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming"}

In [20]:
# One case-sensitive pattern per state, matching either the two-letter code or
# the full name as a whole word. Compiled once instead of per user.
state_patterns = {
    state_code: re.compile(f'\\b{re.escape(state_code)}\\b|\\b{re.escape(state_name)}\\b')
    for state_code, state_name in states.items()
}

location2state = dict()
# Each distinct location string only needs to be classified once;
# missing locations are skipped.
for location in rd_users['location'].dropna().unique():
    best_key, best_state = None, None
    for state_code, pattern in state_patterns.items():
        for match in pattern.finditer(location):
            # Heuristic: assume the state is the match that ends last in the
            # string ("Washington, DC" -> DC, "Kansas City, MO" -> MO); on a
            # tie prefer the longer match ("West Virginia" over "Virginia").
            key = (match.end(), match.end() - match.start())
            if best_key is None or key > best_key:
                best_key, best_state = key, state_code
    if best_state is not None:
        location2state[location] = best_state

In [25]:
rd_users['state'] = rd_users['location'].apply(lambda x: location2state.get(x, None))

In [29]:
rd_users[rd_users['party'] == 'republican']['state'].notna().sum()

4772

In [30]:
rd_users[rd_users['party'] == 'democrat']['state'].notna().sum()

6923

In [55]:
# Users per party that could be mapped to a state. Their ratio rescales the
# democrat counts so both parties carry the same overall weight; it is derived
# from the data rather than hard-coded, so it stays correct when the data changes.
has_state = rd_users['state'].notna()
is_democrat = rd_users['party'] == 'democrat'
is_republican = rd_users['party'] == 'republican'

located_democrats = (has_state & is_democrat).sum()
located_republicans = (has_state & is_republican).sum()
democrat_scale = located_republicans / located_democrats if located_democrats else 1.0

state_records = []
for state_code, state_name in states.items():
    state_mask = rd_users['state'] == state_code

    # scaling so that the overall weight count is the same
    democrats_count = int(rd_users[state_mask & is_democrat]['id'].count() * democrat_scale)
    republicans_count = rd_users[state_mask & is_republican]['id'].count()
    all_count = max(democrats_count + republicans_count, 1)  # if zero, then we will get zero either way

    record = {
        'state_code': state_code,
        'state': state_name,
        'democrats': democrats_count,
        'republicans': republicans_count,
        # Despite the names, these are fractions in [0, 1], not percentages.
        'percent_democrats': democrats_count / all_count,
        'percent_republicans': republicans_count / all_count
    }
    state_records.append(record)

state_distribution = pd.DataFrame.from_records(state_records)
state_distribution.head(5)

Unnamed: 0,state_code,state,democrats,republicans,percent_democrats,percent_republicans
0,AL,Alabama,28,83,0.252252,0.747748
1,AK,Alaska,8,13,0.380952,0.619048
2,AS,American Samoa,0,0,0.0,0.0
3,AZ,Arizona,149,193,0.435673,0.564327
4,AR,Arkansas,29,46,0.386667,0.613333


In [56]:
# US map coloured by each state's democrat share (a 0-1 fraction of the
# scaled democrat + republican counts); colour range is set to 0.1-0.9.
px.choropleth(
    state_distribution,
    locations='state_code',
    locationmode='USA-states',
    scope='usa',
    color='percent_democrats',
    hover_name='state',
    hover_data=['democrats', 'republicans', 'percent_democrats', 'percent_republicans'],
    range_color=[0.1, 0.9],
    color_continuous_scale='RdBu',
    title='US Twitter Users'
)

### TODO other overall analysis?

## Tweets Analysis

waiting for the functions :)