In [3]:
import pandas as pd
import glob

# Date window used to subset the messages further down (both bounds are midnight timestamps).
min_date, max_date = pd.to_datetime("2020-08-09"), pd.to_datetime("2021-04-01")
# Lift pandas' row-display limit so displayed tables are not truncated.
pd.set_option("display.max_rows", None)

In [5]:
# READ IN ALL THE TELEGRAM DATA
# Every file in the folder is loaded and cut down to the [min_date, max_date] window
# defined above (both ends inclusive), then all pieces are stacked into one frame.
#
# glob returns paths in filesystem-dependent order; sorting keeps the row order of the
# combined frame reproducible between runs and machines.
files = sorted(glob.glob("../../chat_data_collection/processed_data/chat_dfs_csv/*"))
if not files:
    # Without this guard pd.concat fails later with "No objects to concatenate",
    # which hides the real cause (wrong working directory or missing data).
    raise FileNotFoundError(
        "No chat files found under ../../chat_data_collection/processed_data/chat_dfs_csv/"
    )

df_list = []
for f in files:
    df = pd.read_csv(f)
    # Bracket assignment always targets the column; attribute assignment would silently
    # create a plain attribute if the column were missing.
    df["date_minsk"] = pd.to_datetime(df["date_minsk"])
    # NOTE(review): max_date is a midnight timestamp, so if date_minsk carried a
    # time-of-day, messages later on that last day would be dropped. The min/max check
    # further down prints midnight values, which suggests date-only data - confirm.
    in_window = (df["date_minsk"] >= min_date) & (df["date_minsk"] <= max_date)
    df_list.append(df.loc[in_window])

dfs = pd.concat(
    df_list,
    ignore_index=True,
)

In [9]:
# (rows, columns) of the combined message frame
dfs.shape

(2804171, 24)

In [7]:
# Trim leading/trailing whitespace from the chat names (chat_name is the merge key used later).
dfs["chat_name"] = dfs["chat_name"].str.strip()

In [8]:
# Sanity check: earliest and latest date_minsk left after subsetting
message_dates = dfs["date_minsk"]
(message_dates.min(), message_dates.max())

(Timestamp('2020-08-09 00:00:00'), Timestamp('2021-04-01 00:00:00'))

In [10]:
# READ IN CHAT INFO (HAND COLLECTED)
chat_info = pd.read_csv("../data/chat_names.csv", encoding="ISO-8859-1")

# MERGE IN DISTRICT INFORMATION
# Left join so every message row is kept; chats without a match get NaN location columns.
# validate="many_to_one" makes pandas raise a MergeError if a chat_name occurs more than
# once in the hand-collected lookup, instead of silently duplicating message rows.
location_cols = ["chat_name", "city_name", "district_name", "region_name"]
dfs2 = dfs.merge(
    chat_info[location_cols],
    how="left",
    on="chat_name",
    validate="many_to_one",
)

In [11]:
# Every chat should have picked up a district in the merge: count rows with / without one.
# Only a "False" entry means no message is missing its district.
dfs2["district_name"].isna().value_counts()

False    2804171
Name: district_name, dtype: int64

In [12]:
# CALCULATE MISSINGNESS
# For each chat, compare the number of messages collected with the number of message
# ids spanned by the collected data.
by_chat = dfs2.groupby("chat_name")

# Smallest / largest message id and date seen in each chat
id_and_date_span = by_chat.agg(
    min_id=("id", "min"),
    max_id=("id", "max"),
    min_date=("date_minsk", "min"),
    max_date=("date_minsk", "max"),
).reset_index()

# Number of messages actually present in each chat
chat_num_msgs = by_chat.size().reset_index(name="num_msgs")

chat_missing = (
    id_and_date_span
    # Potential messages = width of the id range.
    # Assumes ids within a chat are consecutive integers - TODO confirm.
    .assign(num_potential_msgs=lambda span: span["max_id"] - span["min_id"] + 1)
    .merge(chat_num_msgs, how="left", on="chat_name")
    .assign(
        # Ids inside the range with no collected message
        num_missing_msgs=lambda t: t["num_potential_msgs"] - t["num_msgs"],
        # Share of missing messages; despite the "perc" name this is a fraction in [0, 1]
        perc_missing_msgs=lambda t: 1 - (t["num_msgs"] / t["num_potential_msgs"]),
    )
)



In [15]:
# Show the per-chat missingness summary, smallest fraction of missing messages first
chat_missing.sort_values("perc_missing_msgs")

Unnamed: 0,chat_name,min_id,max_id,min_date,max_date,num_potential_msgs,num_msgs,num_missing_msgs,perc_missing_msgs
29,gorodok57,1,1,2020-08-10,2020-08-10,1,1,0,0.0
33,kopische_chat,1,1,2020-09-05,2020-09-05,1,1,0,0.0
10,MolodechnoHOME,248,18855,2020-08-09,2021-04-01,18608,17285,1323,0.071098
45,mozyr_for_life,438,2329,2020-08-09,2021-04-01,1892,1740,152,0.080338
58,sdj_minsk_pcht,24596,68237,2020-08-09,2021-04-01,43642,39421,4221,0.096719
14,SDZOktMinsk,7340,30921,2020-08-09,2021-04-01,23582,21291,2291,0.09715
13,Postavschchina_SDG,11103,32657,2020-08-09,2021-03-30,21555,19316,2239,0.103874
43,mozyr4district,1,12137,2020-09-06,2021-04-01,12137,10009,2128,0.175332
51,oshmyany,466,23554,2020-08-09,2021-04-01,23089,18871,4218,0.182684
15,Svetlogorskchat,23743,88622,2020-08-09,2021-04-01,64880,52499,12381,0.190829


In [16]:
# Save this table on missingness information 
# chat_missing.to_csv("../data/missing_msgs_summary.csv", index=False)

In [17]:
# FILTER OUT CHATS
# Hand-picked list of 11 chats to drop; the stated rule is "90% of the messages are missing".
# NOTE(review): gorodok57 and kopische_chat show 0% missing in the summary output but
# hold a single message each, so they do not fit that rule - presumably dropped for
# being near-empty; confirm.
excluded_chats = [
    "asanalieva76",
    "mogilev97pro",
    "bobruisk97pro",
    "Kalinkovichi_ne_Dlya_Zhizni",
    "senitsa",
    "HonestGrodno",
    "suharevo_marshi",
    "poligonby",
    "Braslavforlife",
    "gorodok57",
    "kopische_chat",
    # "Postavschchina_SDG",  (left commented out by the author, so this chat is kept)
]
l = excluded_chats  # original name of this list, kept in case a later cell refers to it

keep_row = ~dfs2["chat_name"].isin(excluded_chats)
dfs3 = dfs2.loc[keep_row].reset_index(drop=True)

In [18]:
# Rows still lacking a district after the filter (an empty result means none)
dfs3.loc[dfs3["district_name"].isna()].head()

Unnamed: 0,id,created_at,message,from_id,fwd_from,reply_to,media,entities,views,forwards,...,from_user_id,media_type,fwd_type,fwd_id,reply_to_msg_id,msg_textUrls,chat_name,city_name,district_name,region_name


In [20]:
# Build display labels: "<region>, <district>" per district and
# "<region>, <district>, <chat>" per chat.
district_label = dfs3["region_name"] + ", " + dfs3["district_name"]
dfs3["chat_labels"] = district_label + ", " + dfs3["chat_name"]
dfs3["district_labels"] = district_label
dfs3.head()

Unnamed: 0,id,created_at,message,from_id,fwd_from,reply_to,media,entities,views,forwards,...,fwd_type,fwd_id,reply_to_msg_id,msg_textUrls,chat_name,city_name,district_name,region_name,chat_labels,district_labels
0,95183,2020-10-06 09:46:54+00:00,Это про Кыргызстан,"{'_': 'PeerUser', 'user_id': 846004028}",,,"{'_': 'MessageMediaPhoto', 'photo': {'_': 'Pho...",[],,,...,,,,[],zavodskoy_97,Minsk,Minsk,Minsk,"Minsk, Minsk, zavodskoy_97","Minsk, Minsk"
1,95182,2020-10-06 09:46:18+00:00,👍,"{'_': 'PeerUser', 'user_id': 744756843}",,"{'_': 'MessageReplyHeader', 'reply_to_msg_id':...",,[],,,...,,,95180.0,[],zavodskoy_97,Minsk,Minsk,Minsk,"Minsk, Minsk, zavodskoy_97","Minsk, Minsk"
2,95181,2020-10-06 09:45:19+00:00,Ну у них там еще и Чуйская долина имеется...,"{'_': 'PeerUser', 'user_id': 846004028}",,"{'_': 'MessageReplyHeader', 'reply_to_msg_id':...",,[],,,...,,,95011.0,[],zavodskoy_97,Minsk,Minsk,Minsk,"Minsk, Minsk, zavodskoy_97","Minsk, Minsk"
3,95179,2020-10-06 09:42:14+00:00,"да потому что там правительство нормальное, сл...","{'_': 'PeerUser', 'user_id': 904674245}",,"{'_': 'MessageReplyHeader', 'reply_to_msg_id':...",,[],,,...,,,95161.0,[],zavodskoy_97,Minsk,Minsk,Minsk,"Minsk, Minsk, zavodskoy_97","Minsk, Minsk"
4,95178,2020-10-06 09:41:52+00:00,"Send Dudes, нам нужны хохлы и киргизы","{'_': 'PeerUser', 'user_id': 60306577}",,,,[],,,...,,,,[],zavodskoy_97,Minsk,Minsk,Minsk,"Minsk, Minsk, zavodskoy_97","Minsk, Minsk"


In [21]:
# (rows, columns) after dropping the excluded chats and adding the label columns
dfs3.shape

(2758470, 29)

'2020-08-09_2021-04-01'

In [5]:
# Load the previously saved analysis data set (file name carries the date window).
# As read here, date_minsk is not parsed as a date; it stays whatever read_csv infers.
analysis_csv = "../processed_data/analysis_data/telegram_data_2020-08-09_2021-04-01.csv"
dfs3 = pd.read_csv(analysis_csv)

In [6]:
# CALCULATE THE DAILY NUMBER OF MESSAGES IN A CHAT GROUP
# One row per (day, chat) with the count of messages on that day.
# (Author's note: there are 59 chats in this data.)
chat_daily_num_msgs = (
    dfs3.groupby(["date_minsk", "chat_name", "chat_labels"])
    .size()
    .reset_index(name="num_msgs")
    # Turn the day column into plain datetime.date objects
    .assign(date_minsk=lambda counts: pd.to_datetime(counts["date_minsk"]).dt.date)
)
chat_daily_num_msgs.head()

Unnamed: 0,date_minsk,chat_name,chat_labels,num_msgs
0,2020-08-09,Brest_strana,"Brest, Brest, Brest_strana",2074
1,2020-08-09,Gomel_strana,"Gomel, Gomel, Gomel_strana",554
2,2020-08-09,Gorkidliajizni,"Mogilev, Horki, Gorkidliajizni",80
3,2020-08-09,HPolotsk_strana,"Vitebsk, Polotsk, HPolotsk_strana",838
4,2020-08-09,Lida_strana_chat,"Grodno, Lida, Lida_strana_chat",719


In [7]:
# Save the above data 
#chat_daily_num_msgs.to_csv("../processed_data/analysis_data/chat_daily_num_msgs.csv",index=False)

In [None]:
# Save this data
# The date window goes into the file name: telegram_data_<min_date>_<max_date>.csv
# NOTE(review): this cell sits after the cell that reads the same file back into dfs3
# and it needs min_date / max_date from the first cell - confirm the intended run order.
date = f"{min_date.date()}_{max_date.date()}"
dfs3.to_csv(f"../processed_data/analysis_data/telegram_data_{date}.csv", index=False)


In [1]:
## take a random sample of 100 rows of this data
#dfs3.sample(100).to_csv("../processed_data/analysis_data/telegram_data_sample_" + date + ".csv", index=False)