# Meta Topic Analysis and Topic Data Wrangling

In [None]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive. Later cells read and write files under
# relative 'drive/MyDrive/...' paths -- presumably resolved from /content;
# confirm the working directory if running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Meta Topic Analysis

In [None]:
# Load Data For Analysis

def _parse_topic_ids(cell):
    """Turn a comma-separated string of topic ids into a sorted list of unique ints."""
    # De-duplicate AFTER the int conversion: "5" and " 5" are different strings
    # but the same topic, and a duplicate id would be counted twice when the
    # per-meta-topic incidences are summed.
    return sorted({int(token) for token in cell.split(',') if token.strip()})


# Chat names: the first line is a header, every following line holds one name.
with open('drive/MyDrive/AAA/ActiveChats2023-04-03-cleaned.csv' , "r") as file:
    file.readline()
    # Blank lines are skipped; sorting the de-duplicated names gives a stable
    # iteration order (a plain set's order changes from run to run).
    ch_names = sorted({name.rstrip() for name in file if name.strip()})

meta_topic_mapping = pd.read_table('drive/MyDrive/LinkedInTheDark/Meta Topics Mapping.tsv')
lab_topics = pd.read_csv('drive/MyDrive/LinkedInTheDark/labelled_topics.csv', index_col=0)
chat_activity_epoch = pd.read_csv('drive/MyDrive/LinkedInTheDark/chat_activity_per_epoch.csv', index_col=0)
# errors="raise" makes a missing epoch column fail here rather than further down.
chat_activity_epoch = chat_activity_epoch.rename(columns={"Epoch 1": "Epoch 1 Freq", "Epoch 2": "Epoch 2 Freq", "Epoch 3": "Epoch 3 Freq"}, errors="raise")


# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that were produced by a trusted run of this project.
with open('drive/MyDrive/AAA/topics_per_class.pkl', 'rb') as f:
  topics_per_class = pickle.load(f)


# 'Topics' arrives as a comma-separated string; store it as a list of int ids.
meta_topic_mapping['Topics'] = meta_topic_mapping['Topics'].apply(_parse_topic_ids)

In [None]:
# Index lab_topics by topic id. The check makes the cell safe to re-run: once
# 'Topic' is already the index there is nothing left to do. Anything else is a
# real problem with the input and is reported instead of being swallowed.
if 'Topic' in lab_topics.columns:
  lab_topics = lab_topics.set_index('Topic')
elif lab_topics.index.name != 'Topic':
  raise KeyError("lab_topics has no 'Topic' column and is not indexed by 'Topic'")

In [None]:
# compute topics and total weight per topic per epoch
epochs = ["Epoch 1", "Epoch 2", "Epoch 3"]

# Placeholder so 'Total Incidence' keeps its position ahead of the per-epoch
# columns; the real value is filled in once the epochs are known.
lab_topics['Total Incidence'] = np.zeros(len(lab_topics.index))

# Map every "Class" key belonging to a tracked chat onto its epoch. Keys for
# chats outside ch_names are absent, so their rows map to NaN and are ignored.
class_to_epoch = {
  f'./Data/Messages/RawData/chat_{chat}_epoch_{epoch}.csv\n': epoch
  for chat in ch_names
  for epoch in epochs
}
epoch_of_row = topics_per_class["Class"].map(class_to_epoch)

for epoch in epochs:
  # Summed frequency per topic over all tracked chats in this epoch
  # (one pass over topics_per_class instead of one pass per chat).
  per_topic = (
    topics_per_class.loc[epoch_of_row == epoch]
    .groupby("Topic")["Frequency"]
    .sum()
  )

  # A topic that is not labelled in lab_topics is an input error.
  unknown = per_topic.index.difference(lab_topics.index)
  if len(unknown):
    raise KeyError(f"Topics missing from lab_topics: {list(unknown)}")

  # Topics with no messages in this epoch get 0.0.
  lab_topics[epoch + ' Incidence'] = (
    per_topic.reindex(lab_topics.index, fill_value=0).to_numpy(dtype=float)
  )

lab_topics['Total Incidence'] = lab_topics['Epoch 1 Incidence'] + lab_topics['Epoch 2 Incidence'] + lab_topics['Epoch 3 Incidence']

lab_topics

Unnamed: 0_level_0,Words,IsBrit,IsExtremist,Total Incidence,Epoch 1 Incidence,Epoch 2 Incidence,Epoch 3 Incidence
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,"resistance, britain, uk, group https, joinchat",True,True,169761.0,14416.0,55094.0,100251.0
0,"https __, pdf, org web, archive org, pdf https",False,False,57362.0,3707.0,20308.0,33347.0
1,"licence, scumbag, young girl, follow ashleasim...",True,True,4585.0,734.0,1969.0,1882.0
2,"white, follow ashleasimonnews, ashleasimonnews...",True,True,2912.0,260.0,1583.0,1069.0
3,"african, immigrants, israel, send, europe",False,True,2890.0,239.0,1027.0,1624.0
...,...,...,...,...,...,...,...
136,"audit, important, contract, smart, trust",False,False,408.0,14.0,52.0,342.0
43,"tommy, rr, rod, goodbye, np",False,False,1041.0,17.0,675.0,349.0
69,"rape, raped, women, rapist, wrote",False,False,147.0,30.0,42.0,75.0
138,"liberal, liberals, leftists, obvious, leftist",False,True,241.0,84.0,114.0,43.0


In [None]:
# Aggregate the per-topic incidences into one row of totals per meta topic.
incidence_cols = ['Total Incidence', 'Epoch 1 Incidence', 'Epoch 2 Incidence', 'Epoch 3 Incidence']

# Start every accumulator column at zero.
for column in incidence_cols:
  meta_topic_mapping[column] = np.zeros(len(meta_topic_mapping.index))

for row_label, topic_ids in meta_topic_mapping['Topics'].items():
  # Column-wise sum over the member topics of this meta topic.
  member_totals = lab_topics.loc[topic_ids, incidence_cols].sum()
  for column in incidence_cols:
    meta_topic_mapping.loc[row_label, column] += member_totals[column]

meta_topic_mapping

Unnamed: 0,Meta Topic Name,Topics,Total Incidence,Epoch 1 Incidence,Epoch 2 Incidence,Epoch 3 Incidence
0,Business,"[93, 65, 73, 60, 33, 124, 100, 98, 125]",1990.0,305.0,853.0,832.0
1,Cryptocurrencies & Investment,"[118, 22, 90, 97, 53, 4, 52, 43, 123, 16, 56, 78]",7695.0,747.0,3459.0,3489.0
2,"Technology, Industry & Business","[117, 89, 7, 77, 105, 83, 80, 109, 17, 84, 122...",11946.0,1143.0,6444.0,4359.0
3,Covid & Mobile Phones,"[86, 2, 11, 111, 69]",3426.0,313.0,1757.0,1356.0
4,"UK, Race, LGBT","[57, 76, 113, 66, 107, 20, 12, 112, 5, 106, 85...",8796.0,1016.0,3800.0,3980.0
5,Law & Current Events,"[87, 42, 92, 103, 70, 104, 34]",1913.0,258.0,754.0,901.0
6,"Politics, News & Current Events","[101, 68, 61, 72, 65, 38, 32, 24, 1, 121, 59, ...",24478.0,2174.0,10183.0,12121.0


In [None]:
# Normalize Values
# Each incidence column is divided by its own column total, so the values in a
# column become shares that sum to 1 across the meta topics.
n_meta_topic_mapping = meta_topic_mapping.copy()

for col in ['Total Incidence', 'Epoch 1 Incidence', 'Epoch 2 Incidence', 'Epoch 3 Incidence']:
  # The total is computed once per column rather than once per element.
  n_meta_topic_mapping[col] = meta_topic_mapping[col] / meta_topic_mapping[col].sum()

n_meta_topic_mapping

Unnamed: 0,Meta Topic Name,Topics,Total Incidence,Epoch 1 Incidence,Epoch 2 Incidence,Epoch 3 Incidence
0,Business,"[93, 65, 73, 60, 33, 124, 100, 98, 125]",0.033032,0.051209,0.031303,0.030772
1,Cryptocurrencies & Investment,"[118, 22, 90, 97, 53, 4, 52, 43, 123, 16, 56, 78]",0.127731,0.12542,0.126936,0.129041
2,"Technology, Industry & Business","[117, 89, 7, 77, 105, 83, 80, 109, 17, 84, 122...",0.198294,0.191907,0.236477,0.161218
3,Covid & Mobile Phones,"[86, 2, 11, 111, 69]",0.056869,0.052552,0.064477,0.050152
4,"UK, Race, LGBT","[57, 76, 113, 66, 107, 20, 12, 112, 5, 106, 85...",0.146006,0.170584,0.13945,0.1472
5,Law & Current Events,"[87, 42, 92, 103, 70, 104, 34]",0.031754,0.043318,0.02767,0.033323
6,"Politics, News & Current Events","[101, 68, 61, 72, 65, 38, 32, 24, 1, 121, 59, ...",0.406314,0.36501,0.373688,0.448295


In [None]:
# Meta Topic Incidence Per Chat
epochs = ['Epoch 1', 'Epoch 2', 'Epoch 3']

# One frame per epoch, indexed by chat name and holding that epoch's frequency column.
E1_chats_metadata, E2_chats_metadata, E3_chats_metadata = (
  chat_activity_epoch.filter(['Chat Name', epoch + ' Freq'], axis=1).set_index('Chat Name')
  for epoch in epochs
)

ch_mtd = [E1_chats_metadata, E2_chats_metadata, E3_chats_metadata]

# Total frequency over all chats per epoch: constant for the whole loop, so it
# is computed once here instead of once per chat and meta topic.
epoch_totals = {epoch: sum(chat_activity_epoch[epoch + ' Freq']) for epoch in epochs}

for chat in ch_names:
  for epoch, chat_meta in zip(epochs, ch_mtd):
    # The rows for this chat/epoch do not depend on the meta topic, so they are
    # selected once and reused for every meta topic below.
    class_key = f'./Data/Messages/RawData/chat_{chat}_epoch_{epoch}.csv\n'
    chat_topics = topics_per_class[topics_per_class["Class"] == class_key]

    for _, mt_row in meta_topic_mapping.iterrows():
      column = 'Normalised ' + mt_row['Meta Topic Name']
      # Summed frequency of this chat's topics that belong to the meta topic
      # (0.0 when the chat has none) ...
      in_meta_topic = chat_topics['Topic'].isin(mt_row['Topics'])
      frequency = float(chat_topics.loc[in_meta_topic, 'Frequency'].sum())
      # ... normalised by total frequency in that epoch. Assigning through .loc
      # also creates the column, and the chat's row, if not present yet.
      chat_meta.loc[chat, column] = frequency / epoch_totals[epoch]

In [None]:
# Rescale each chat's meta topic values so that they sum to 1 within an epoch.
# The '<epoch> Freq' column is excluded from both the sum and the rescaling.
for epoch, chat_meta in zip(['Epoch 1', 'Epoch 2', 'Epoch 3'], ch_mtd):
  topic_cols = chat_meta.columns != (epoch + ' Freq')
  for chat in ch_names:
    shares = chat_meta.loc[chat, topic_cols]
    chat_meta.loc[chat, topic_cols] = shares / shares.sum()

In [None]:
# Display copy of the meta topic table: every column except the name and the
# topic list is rounded and cast to int, then shaded with a blue gradient.
columns_to_round = meta_topic_mapping.columns.drop(['Meta Topic Name', 'Topics'])

meta_topic_mapping_c = meta_topic_mapping.copy()
meta_topic_mapping_c[columns_to_round] = (
    meta_topic_mapping_c[columns_to_round].round().astype(int)
)

cm = sns.light_palette("blue", as_cmap=True)

# The raw topic id lists are left out of the styled view.
style_meta_topic_mapping_c = (
    meta_topic_mapping_c
    .drop(columns='Topics')
    .style
    .background_gradient(cmap=cm)
)

display(style_meta_topic_mapping_c)

Unnamed: 0,Meta Topic Name,Total Incidence,Epoch 1 Incidence,Epoch 2 Incidence,Epoch 3 Incidence
0,Business,1990,305,853,832
1,Cryptocurrencies & Investment,7695,747,3459,3489
2,"Technology, Industry & Business",11946,1143,6444,4359
3,Covid & Mobile Phones,3426,313,1757,1356
4,"UK, Race, LGBT",8796,1016,3800,3980
5,Law & Current Events,1913,258,754,901
6,"Politics, News & Current Events",24478,2174,10183,12121


In [None]:
# Red-shaded view of the normalised meta topic table, without the topic id lists.
cm = sns.light_palette("red", as_cmap=True)
style_n_meta_topic_mapping = (
    n_meta_topic_mapping
    .drop(columns='Topics')
    .style
    .background_gradient(cmap=cm)
)

display(style_n_meta_topic_mapping)

Unnamed: 0,Meta Topic Name,Total Incidence,Epoch 1 Incidence,Epoch 2 Incidence,Epoch 3 Incidence
0,Business,0.033032,0.051209,0.031303,0.030772
1,Cryptocurrencies & Investment,0.127731,0.12542,0.126936,0.129041
2,"Technology, Industry & Business",0.198294,0.191907,0.236477,0.161218
3,Covid & Mobile Phones,0.056869,0.052552,0.064477,0.050152
4,"UK, Race, LGBT",0.146006,0.170584,0.13945,0.1472
5,Law & Current Events,0.031754,0.043318,0.02767,0.033323
6,"Politics, News & Current Events",0.406314,0.36501,0.373688,0.448295


In [None]:
# Write one CSV per epoch (missing values replaced by 0.0 first), then the
# normalised meta topic table.
for epoch_number in (1, 2, 3):
  filled = ch_mtd[epoch_number - 1].fillna(0.0)
  ch_mtd[epoch_number - 1] = filled
  filled.to_csv(f'drive/MyDrive/LinkedInTheDark/MetaTopicAnal/meta_topics_epoch_{epoch_number}.csv')

n_meta_topic_mapping.to_csv('drive/MyDrive/LinkedInTheDark/MetaTopicAnal/meta_topics_mapping.csv')