In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

# import functions from utils.py
from utils import (
    calc_moving_avg,
    calc_rolling_sum,
    calc_log,
    calc_lag,
    find_column_names,
    modify_column_names,
)

#min_date = pd.to_datetime("2020-08-09")
#max_date = pd.to_datetime("2021-04-01")
# Show every row when a DataFrame/Series is displayed (no truncation).
pd.set_option("display.max_rows", None)

In [165]:
# Load the input tables: Telegram messages, protest events, repression
# incidents and chat metadata.
dfs3 = pd.read_csv("../processed_data/analysis_data/telegram_data_2020-08-09_2021-04-01.csv")
protest = pd.read_csv("../processed_data/acled_data/combined_protest_data.csv")
repression = pd.read_csv("../processed_data/viasna_data/incidents_data.csv")
chat_info = pd.read_csv("../data/chat_names.csv", encoding="ISO-8859-1")

# Parse each table's date column so the later date-keyed merges compare
# datetimes rather than strings.
for frame, date_col in (
    (dfs3, "date_minsk"),
    (protest, "event_date"),
    (repression, "date_of_incident"),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [166]:
# Separate the protest data into different dataframes for district, region, country.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
protest = protest.drop(columns=["protest_size_num"], errors="ignore")

# district columns + df: every column not prefixed with 'region' or 'country'
district_protest_cols = [col for col in protest.columns if not col.startswith(('region', 'country'))]
# drop_duplicates() returns a new frame; avoids mutating a slice of `protest` in place
district_protest = protest.loc[:, district_protest_cols].drop_duplicates()

# regional columns + df: region key, date, and all 'region_num' columns
region_protest_cols = ["region_belarus", "event_date"] + [col for col in protest.columns if "region_num" in col]
region_protest = protest.loc[:, region_protest_cols].drop_duplicates()

# country columns + df: date and all 'country_num' columns
country_protest_cols = ["event_date"] + [col for col in protest.columns if "country_num" in col]
country_protest = protest.loc[:, country_protest_cols].drop_duplicates()


In [167]:
# Daily number of messages per chat group: one row per (date, chat, labels).
# There are 59 chats in this data
chat_daily_num_msgs = (
    dfs3
    .groupby(["date_minsk", "chat_name", "chat_labels"])
    .size()
    .reset_index(name="num_msgs")
    # chat_labels is comma-separated; its second field (stripped) is used as the district name
    .assign(district_name=lambda counts: counts["chat_labels"].str.split(",").str[1].str.strip())
)

In [None]:
#chat_daily_num_msgs.to_csv("../processed_data/analysis_data/chat_daily_num_msgs.csv", index=False)

In [168]:
# Number of distinct chats observed in each district
num_district_chats = (
    chat_daily_num_msgs
    .groupby("district_name")["chat_name"]
    .nunique()
    .reset_index(name="num_district_chats")
)

In [169]:
# Daily number of messages in a district = messages summed over all of the
# district's chat groups (counted directly from the message-level table).
# There are 28 districts in this data
district_keys = ["date_minsk", "district_name", "district_labels"]
district_daily_num_msgs = (
    dfs3
    .groupby(district_keys)
    .size()
    .reset_index(name="num_msgs")
)
district_daily_num_msgs.head()

Unnamed: 0,date_minsk,district_name,district_labels,num_msgs
0,2020-08-09,Ashmyany,"Grodno, Ashmyany",2
1,2020-08-09,Asipovichy,"Mogilev, Asipovichy",129
2,2020-08-09,Baranavichy,"Brest, Baranavichy",294
3,2020-08-09,Barysaw,"Minsk, Barysaw",665
4,2020-08-09,Brest,"Brest, Brest",3074


In [170]:
# Sanity check: (rows, columns) of the observed district-day counts.
district_daily_num_msgs.shape

(6369, 4)

In [9]:
#district_daily_num_msgs.to_csv("../processed_data/analysis_data/district_daily_num_msgs.csv", index=False)

In [171]:
# Fill in missing dates as zeros per district (reindex onto a daily calendar).
# Each district gets one row for every calendar day between its own first and
# last observed date, so days without messages become explicit zero rows.

# Group by district; name and labels together form the group key
grouped = district_daily_num_msgs.groupby(['district_name', 'district_labels'])

# Find the minimum and maximum date in each district
min_district_dates = grouped['date_minsk'].min()
max_district_dates = grouped['date_minsk'].max()

# Create one DataFrame per district with all dates between its minimum and maximum date
new_dfs = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for group_name, group_min_date in min_district_dates.items():
    group_max_date = max_district_dates[group_name]
    all_dates = pd.date_range(start=group_min_date, end=group_max_date)
    # group_name is the (district_name, district_labels) tuple of the group key
    group_df = pd.DataFrame({'date_minsk': all_dates, 'district_name': group_name[0], 'district_labels': group_name[1]})
    # list.append returns None, so its result is not assigned to anything
    new_dfs.append(group_df)

new_df = pd.concat(new_dfs)

# Left-merge the observed counts onto the full calendar; days with no match get 0 messages
merged_df = pd.merge(new_df, district_daily_num_msgs, on=['date_minsk', 'district_name', 'district_labels'], how='left')
merged_df['num_msgs'] = merged_df['num_msgs'].fillna(0)
# The region is the first comma-separated field of district_labels
merged_df['region_name'] = merged_df['district_labels'].str.split(',').str[0]

In [172]:
# Count district-days with at least one message (True) vs. zero-filled gap days (False).
(merged_df.num_msgs > 0).value_counts()

True     6369
False     149
Name: num_msgs, dtype: int64

In [173]:
# Sanity check: (rows, columns) of the gap-filled district-day table.
merged_df.shape

(6518, 5)

In [174]:
# Attach protest counts at three geographic levels. Every join is a left join
# onto the district-day table, so district-days without a match are kept.

# District level: match on date + district
combined = pd.merge(
    merged_df,
    district_protest,
    left_on=["date_minsk", "district_name"],
    right_on=["event_date", "district_belarus"],
    how="left",
)

# Region level: match on region + date
combined = pd.merge(
    combined,
    region_protest,
    left_on=["region_name", "date_minsk"],
    right_on=["region_belarus", "event_date"],
    how="left",
)

# Country level: match on date only
combined = pd.merge(
    combined,
    country_protest,
    left_on=["date_minsk"],
    right_on=["event_date"],
    how="left",
)




In [175]:
# Sanity check: (rows, columns) after the three left merges of protest data.
combined.shape

(6518, 29)

In [176]:
# Attach the repression data with a left join on date + district
combined = pd.merge(
    combined,
    repression,
    left_on=["date_minsk", "district_name"],
    right_on=["date_of_incident", "district_translated"],
    how="left",
)

In [177]:
# Remove the right-hand join keys left behind by the merges; they duplicate
# date_minsk / district_name / region_name.
drop_dup_cols = [
    "district_belarus",
    "event_date_x",
    "event_date_y",
    "event_date",
    "region_belarus",
    "district_translated",
    "date_of_incident",
]
combined = combined.drop(labels=drop_dup_cols, axis="columns")

In [178]:
# Sanity check: (rows, columns) after dropping the redundant join-key columns.
combined.shape

(6518, 25)

In [179]:
# List every column from position 5 onward (i.e. all but the first five columns).
combined.columns[5:]

Index(['small_protest_1', 'large_protest_1', 'small_protest_2',
       'large_protest_2', 'small_protest_3', 'large_protest_3',
       'district_total_protest', 'region_num_small_protest_1',
       'region_num_large_protest_1', 'region_num_small_protest_2',
       'region_num_large_protest_2', 'region_num_small_protest_3',
       'region_num_large_protest_3', 'country_num_small_protest_1',
       'country_num_large_protest_1', 'country_num_small_protest_2',
       'country_num_large_protest_2', 'country_num_small_protest_3',
       'country_num_large_protest_3', 'repression_count'],
      dtype='object')

In [180]:
# Columns from position 5 onward came from the left merges; a missing value
# there means "no matching row", so it is set to 0 and the column cast to int.
count_cols = combined.columns[5:]
combined[count_cols] = combined[count_cols].fillna(0).astype(int)

In [181]:
# Binary repression indicator: 1 when at least one incident was recorded, else 0
combined["repression_binary"] = (combined["repression_count"] > 0).astype(int)

# Calendar features; dt.weekday numbers Monday as 0 and Sunday as 6
weekday = combined["date_minsk"].dt.weekday
# Binary indicator for Sunday
combined["is_Sunday"] = (weekday == 6).astype(int)
# Days till Sunday (0 on a Sunday, 6 on a Monday)
combined["days_till_Sunday"] = 6 - weekday

# Indicators for "focal days" = Sunday and the [2, 1, 0] days before it
for n_days in (3, 2, 1):
    combined[f"focal_day_{n_days}"] = (combined["days_till_Sunday"] < n_days).astype(int)

# Indicator for whether the district is Minsk (boolean, unlike the 0/1 indicators above)
combined["is_Minsk"] = combined["district_name"].isin(["Minsk"])

# Attach the number of chats per district
combined = combined.merge(num_district_chats, on="district_name", how="left")


In [182]:
# Sanity check: (rows, columns) after adding the indicator columns and chat counts.
combined.shape

(6518, 33)

In [183]:
# Rolling aggregates, grouped by district_name (helpers come from utils.py)

# Moving average of the number of messages - window sizes 3, 5 and 7
for window in (3, 5, 7):
    combined[f"num_msg_MA{window}"] = calc_moving_avg(combined, ["num_msgs"], ["district_name"], window)

# 5 day rolling sum of protests in the district, region and country:
# select the protest columns, derive "_rollSum5"-suffixed names, compute the sums
find_cols_with = ['protest']
protest_cols = find_column_names(combined.columns, find_cols_with)
modified_colnames = modify_column_names(
    protest_cols,
    modifier="_rollSum5",
    suffix=True,
)
combined[modified_colnames] = calc_rolling_sum(
    combined,
    protest_cols,
    ["district_name"],
    5,
)



In [184]:
# Sanity check: (rows, columns) after adding the moving averages and rolling sums.
combined.shape

(6518, 55)

In [185]:
# Log-transform the repression count and the message-volume variables.
# New columns are named by prefixing "log_" to the source column name.
# NOTE(review): these columns contain zeros; how calc_log treats zeros is
# defined in utils.py and not visible here - confirm.
find_cols_with = [
    "repression_count",
    "num_msg_MA3",
    "num_msg_MA5",
    "num_msg_MA7",
    "num_msgs",
]
relevant_cols = find_column_names(combined.columns, find_cols_with)
modified_colnames = modify_column_names(
    relevant_cols,
    modifier="log_",
    suffix=False,
)
combined[modified_colnames] = calc_log(combined, relevant_cols)

In [186]:
# Lag (by 1, grouped by district_name) the selected predictors.
# New columns are named by prefixing "lag_" to the source column name.
# NOTE(review): if find_column_names matches by substring, "protest_rollSum5"
# only selects names containing that exact text, not names such as
# "small_protest_1_rollSum5" - confirm which rolling sums are meant to be lagged.
find_cols_with = [
    "log_repression_count",
    "repression_binary",
    "log_num_msg",
    "protest_rollSum5",
]
relevant_cols = find_column_names(combined.columns, find_cols_with)
modified_colnames = modify_column_names(
    relevant_cols,
    modifier="lag_",
    suffix=False,
)
combined[modified_colnames] = calc_lag(
    combined,
    relevant_cols,
    ["district_name"],
    1,
)


In [187]:
# Sanity check: (rows, columns) of the final table after adding log and lag columns.
combined.shape

(6518, 67)

In [188]:
# Write the final district-day analysis table to disk (row index omitted)
combined.to_csv(
    path_or_buf="../processed_data/analysis_data/combined_new.csv",
    index=False,
)

In [161]:
# Per-district totals of messages, protests and repression incidents over the
# whole period, with the number of chats attached, sorted by repression count.
totals = (
    combined
    .groupby("district_name")[["num_msgs", "small_protest_1", "large_protest_1", "repression_count"]]
    .sum()
    .reset_index()
    .merge(num_district_chats, on="district_name", how="left")
    .sort_values(by="repression_count", ascending=False)
    # put the columns in presentation order
    .loc[:, ["district_name", "num_msgs", "num_district_chats", "small_protest_1", "large_protest_1", "repression_count"]]
    # human-readable headers for the report table
    .rename(columns={
        "district_name": "District Name",
        "num_msgs": "Telegram posts",
        "num_district_chats": "Group chats",
        "small_protest_1": "Small protests",
        "large_protest_1": "Large protests",
        "repression_count": "Repressed",
    })
)

In [162]:
# Display the per-district totals table.
totals

Unnamed: 0,District Name,Telegram posts,Group chats,Small protests,Large protests,Repressed
14,Minsk,1025549.0,20,178,51,16097
23,Smalyavichy,64082.0,2,26,2,1607
4,Brest,154366.0,2,93,9,755
8,Grodno,239357.0,2,126,12,564
7,Gomel,137499.0,2,84,11,387
25,Vitebsk,90121.0,2,90,3,291
11,Maladzyechna,109758.0,2,44,0,281
19,Polotsk,105604.0,2,82,0,136
3,Barysaw,36253.0,1,91,0,123
21,Salihorsk,97720.0,2,40,3,114


In [22]:
# latex for above table 
# print(totals.to_latex(index=False, escape=False, caption="Total number of Telegram posts, Group chats, Protest events and Repression Incidents across districts (August 2020- April 2021)"))



In [23]:
# Chat information 
#chat_info_filt = chat_info[~chat_info.chat_name.isin(l)].reset_index(drop=True)
#chat_info_filt[chat_info_filt.columns[:-1]]

#fig.savefig("../plots/daily_chat_msg_count.pdf", dpi=300, bbox_inches="tight")
#fig.savefig("../plots/daily_district_msg_count.pdf", dpi=300, bbox_inches="tight")

# make threads 
#reply_threads = (
#    replies
#    .groupby('unique_reply_to_msg_id')['message']
#    .agg('\n'.join)
#    .reset_index(name="all_replies",drop=False)
#)
