# Political Views Segmentation

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import dask
import dask_mongo
import dask.dataframe as dd

from langcodes import Language

In [3]:
# Use seaborn's 'darkgrid' theme for the figures drawn in this notebook.
sns.set_theme(style='darkgrid')

## Connecting to MongoDB

In [42]:
%%time

# Read the `users` collection of the `ed23db` database from the MongoDB
# server on localhost:27017 through dask-mongo.
# NOTE(review): chunksize is presumably the number of documents per
# partition -- confirm against the dask-mongo documentation.
mongo_connection = {'host': 'localhost', 'port': 27017}

b = dask_mongo.read_mongo(
    database='ed23db',
    collection='users',
    chunksize=200_000,
    connection_kwargs=mongo_connection,
)

CPU times: user 343 ms, sys: 58.6 ms, total: 402 ms
Wall time: 13min 53s


In [43]:
# Convert the collection read from MongoDB into a dataframe.
# NOTE(review): no `meta`/dtypes are supplied here, and df.dtypes reports
# every column as `object` -- numeric columns need explicit conversion.
df = b.to_dataframe()

In [6]:
# List the column names available in the users dataframe.
df.columns

Index(['_id', 'userid', 'username', 'location', 'following', 'followers',
       'totaltweets', 'usercreatedts'],
      dtype='object')

In [7]:
# Inspect column dtypes; every column (including the count columns
# `following`, `followers`, `totaltweets`) is reported as `object`.
df.dtypes

_id              object
userid           object
username         object
location         object
following        object
followers        object
totaltweets      object
usercreatedts    object
dtype: object

In [10]:
# Preview up to 20 rows of the users dataframe.
# NOTE(review): the rendered output shows real account names and locations;
# consider clearing it before sharing the notebook.
df.head(20)

ERROR! Session/line number was not unique in database. History logging moved to new session 7


Unnamed: 0,_id,userid,username,location,following,followers,totaltweets,usercreatedts
0,642abd59e4c8d692d061b241,1237027988287471618,pbi_es,Palestina,86,6855,6401,2020-03-09 14:50:40
1,642abd59e4c8d692d061b243,1407899290790473732,SectionChine,,1718,1668,335,2021-06-24 06:06:32
2,642abd59e4c8d692d061b245,6509832,CNNnews18,India,399,4730654,997125,2007-06-01 20:31:01
3,642abd59e4c8d692d061b247,1570222426570608641,AmazingMeta,Los Angeles,86,26,276,2022-09-15 01:26:48
4,642abd59e4c8d692d061b249,20918680,CallMeAdamNYC,"New York, NY",2228,2479,13155,2009-02-15 16:24:01
5,642abd59e4c8d692d061b24b,216893584,lavisionatl,Atlanta Georgia,942,3124,52076,2010-11-18 00:38:14
6,642abd59e4c8d692d061b24d,62089976,yunec,"Kraków, Polska",2582,6560,132814,2009-08-01 18:35:04
7,642abd59e4c8d692d061b24f,209520717,olivanoticias,México,381,25068,249266,2010-10-29 10:39:49
8,642abd59e4c8d692d061b251,1573729011364241409,funny0animals,,1,17,307,2022-09-24 17:40:19
9,642abd59e4c8d692d061b253,916681683134447617,MilaPlayporn,,19,8917,51528,2017-10-07 15:08:39


## Number of unique users

In [11]:
%%time

# Total number of rows (all entries, duplicates included) and of columns.
# The row count has to be computed; the column count is known up front.
row_count = df.shape[0].compute()
column_count = df.shape[1]
(row_count, column_count)

CPU times: user 14min 15s, sys: 23.3 s, total: 14min 38s
Wall time: 14min 1s


(132262426, 8)

In [12]:
%%time

# Count the distinct user ids present in the collection.
distinct_user_ids = df['userid'].unique()
distinct_user_ids.shape[0].compute()

CPU times: user 17min 37s, sys: 27 s, total: 18min 4s
Wall time: 16min 28s


6538298

## Distribution by followers

In [38]:
# Small toy frame used to try out the groupby -> max -> value_counts chain
# before running it on the full dataset.
animals = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
max_speeds = [380., 370., 24., 26.]
exdf = pd.DataFrame({'Animal': animals, 'Max Speed': max_speeds})

In [41]:
# Take the maximum speed per animal, then count how often each maximum occurs.
top_speed_per_animal = exdf.groupby("Animal", group_keys=True)['Max Speed'].max()
top_speed_per_animal.value_counts()

Max Speed
380.0    1
26.0     1
Name: count, dtype: int64

In [None]:
%%time

# Distribution of users by follower count, counting every user once.
#
# The collection holds many rows per `userid` (about 132M rows for about 6.5M
# distinct ids), so a plain value_counts() on `followers` would count users
# repeatedly. Each user is reduced to their maximum follower count first.
#
# `followers` is an `object` column (see df.dtypes). Taking max() on it
# directly risks a lexicographic comparison if the values are strings
# ('99' > '100'), so the column is coerced to numbers beforehand; values
# that cannot be parsed become NaN and are dropped.
followers_numeric = dd.to_numeric(df['followers'], errors='coerce')

followers_distribution = (
    df.assign(followers=followers_numeric)
    .groupby('userid', group_keys=True, sort=False)['followers']
    .max()
    .dropna()
    .astype('int64')
    .value_counts()
    .compute()
)

In [16]:
# Show the result: index = follower count, value = number of users with it.
followers_distribution

followers
0          2297034
1          1473672
2          1201882
3          1041046
4           932882
            ...   
1909945          2
406490           2
406493           2
4064974          2
3071384          2
Name: count, Length: 516588, dtype: int64

In [17]:
# Order the distribution by follower count so it can be drawn as a curve.
# The index is coerced to numbers first: if it is an object index holding
# strings, argsort would order the labels lexicographically ('10' < '9').
# Labels that cannot be parsed become NaN and are sorted to the end.
follower_counts = pd.to_numeric(followers_distribution.index, errors='coerce').to_numpy()
followers_distribution_idxs = np.argsort(follower_counts)
x = follower_counts[followers_distribution_idxs]            # follower counts, ascending
y = followers_distribution.values[followers_distribution_idxs]  # users per follower count

In [18]:
# Plot how many users (y) have a given follower count (x).
# x is coerced to numbers so matplotlib never treats the values as category
# labels, which would be extremely slow for hundreds of thousands of points.
follower_counts = pd.to_numeric(x, errors='coerce')

fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(follower_counts, y)

ax.set_title('Distribution of Twitter users by followers count')

# The distribution is heavy-tailed (millions of users with 0 followers, a
# handful with millions), so linear axes hide almost everything.
# 'symlog' keeps the x = 0 bucket visible; y counts are strictly positive.
ax.set_xscale('symlog')
ax.set_yscale('log')

ax.set_xlabel('followers')
ax.set_ylabel('twitter users having this many followers')

plt.show()


KeyboardInterrupt



## Distribution by tweets

In [14]:
%%time

# Distribution of users by total tweet count, counting every user once.
#
# The collection holds many rows per `userid`, so each user is reduced to
# their maximum `totaltweets` value before the counts are tallied.
#
# `totaltweets` is an `object` column (see df.dtypes). It is coerced to
# numbers first so max() compares numerically rather than lexicographically;
# values that cannot be parsed become NaN and are dropped.
totaltweets_numeric = dd.to_numeric(df['totaltweets'], errors='coerce')

tweet_count_distribution = (
    df.assign(totaltweets=totaltweets_numeric)
    .groupby('userid', group_keys=True, sort=False)['totaltweets']
    .max()
    .dropna()
    .astype('int64')
    .value_counts()
    .compute()
)

CPU times: user 17min 36s, sys: 26 s, total: 18min 2s
Wall time: 17min 1s


## Distribution by location

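**TODO:** decide how to normalize location names before grouping by them. The `location` field is free text (e.g. "New York, NY", "Kraków, Polska", "Atlanta Georgia") and is often empty.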