In [1]:
import pandas as pd
# Turn off pandas' chained-assignment warning for the whole notebook
# (the library default for this option is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'
from Sqlite3Helpers import Sqlite3Functions as s3
import plotly_express as px
import plotly.io as pio
import json
import KeywordSearch

# Make 'plotly_white' the default template for plotly figures.
pio.templates.default = 'plotly_white'

In [2]:
# Open the SQLite database that every query in this notebook runs against.
DB_PATH = '../../Scraping/db/ghs.db'
conn = s3.connect_to_sqlite3(DB_PATH)

In [3]:
def remove_all_spaces(txt):
    """Return `txt` with every whitespace character removed.

    Spaces, tabs, newlines and other Unicode whitespace are all dropped,
    so the remaining characters are concatenated in their original order.
    """
    kept = []
    for ch in txt:
        if not ch.isspace():
            kept.append(ch)
    return ''.join(kept)


def search(msg, terms_dict):
    """Find which terms of each group occur in `msg`, ignoring whitespace.

    Whitespace is removed from the message AND from every term before the
    substring test. Previously only the message was normalised, so any term
    containing a space (e.g. 'Aung San Suu Kyi') could never match.

    Args:
        msg: The message text to search in.
        terms_dict: Mapping of group key -> iterable of search terms.

    Returns:
        A list of `(key, matched_terms)` tuples, one per key of `terms_dict`
        in iteration order. `matched_terms` holds the terms exactly as they
        were given (not the whitespace-stripped form).
    """
    # Same normalisation as remove_all_spaces(): drop every whitespace char.
    haystack = ''.join(msg.split())

    results = []
    for key, terms in terms_dict.items():
        hits = []
        for term in terms:
            needle = ''.join(term.split())
            # An empty needle is a substring of everything; never report it.
            if needle and needle in haystack:
                hits.append(term)
        results.append((key, hits))
    return results


def search_simple(msg, terms):
    """Return the terms that occur in `msg`, ignoring whitespace.

    Whitespace is removed from the message AND from every term before the
    substring test. Previously only the message was normalised, so any term
    containing a space could never match.

    Args:
        msg: The message text to search in.
        terms: Iterable of search terms.

    Returns:
        A list of the matching terms, in input order and exactly as given
        (not the whitespace-stripped form).
    """
    # Same normalisation as remove_all_spaces(): drop every whitespace char.
    haystack = ''.join(msg.split())

    hits = []
    for term in terms:
        needle = ''.join(term.split())
        # An empty needle is a substring of everything; never report it.
        if needle and needle in haystack:
            hits.append(term)
    return hits


def count_terms_found(terms_found):
    """Lazily yield `(key, number_of_matches)` pairs.

    `terms_found` is an iterable of per-message results; each result is an
    iterable of entries whose first item is a key and whose second item is a
    sized collection of matches.
    """
    for per_message in terms_found:
        yield from ((entry[0], len(entry[1])) for entry in per_message)

# Get People

In [4]:
# Load the spreadsheet of people; every person owns a group of columns whose
# names start with a three-character ID beginning with 'A' (e.g. 'A01').
people = pd.read_csv('../data/keywords/Resources Combined - 1List of women_politically motivated abuse targets.csv')
people_ids = [c for c in people.columns if c.startswith('A') and len(c)==3]
print('IDs:', people_ids)

people_terms = {}
for pid in people_ids:
    # All columns that belong to this person (the ID column and its companions).
    p = people[[c for c in people.columns if c.startswith(pid)]]
    # NOTE(review): the abbreviation is read from a fixed cell (row 1, column 1
    # of the person's sub-frame) - confirm this matches the sheet layout.
    person_abbr = p.iloc[1,1]
    # Strip first, then filter: the old check compared against '-' before
    # stripping, so a padded dash (' -') slipped through, and whitespace-only
    # cells ended up as an empty term that matches every message.
    stripped_terms = (t.strip() for t in p[pid].dropna())
    people_terms[person_abbr] = [t for t in stripped_terms if t and t != '-']
    # NOTE(review): other placeholders such as '<to-add>' are still kept as
    # search terms - decide whether they should be filtered here as well.

people_terms

IDs: ['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19', 'A20']


{'assk': ['အောင်ဆန်းစုကြည်',
  'Aung San Suu Kyi',
  'assk',
  'suu kyi',
  'su kyi',
  'su',
  'စု',
  'စုကြည်',
  'ဒေါ်စု',
  'အမေစု'],
 'pcl': ['ပန်ဆယ်လို', 'Pencilo', 'ဆယ်လို', 'စပပျံ', 'pancilo', 'အမေပန်'],
 'tzsly': ['သင်ဇာရွန်းလဲ့ရည်', 'Thinzar Shoon Lei Yi', 'ကုလားမ'],
 'etzm': ['အိသဉ္ဇာမောင်',
  'Ei Thinzar Maung',
  'သာကူး',
  'ဝန်ကြီးမမ',
  'ဒုဝန်ကြီး',
  'မမ'],
 'eznb': ['အက်စတာဇေနေ', 'Esther Ze Naw Bamvo', 'ကချင်မ', 'စစ်ဗိုလ်သမီး'],
 'snp': ['စန်နုပန်', 'Seng Nu Pan'],
 'trydn': ['ဒေါ်သီရိရတနာ', 'Daw Thiri Yadanar'],
 'hm': ['ဒေါ်ထုမေ', 'Daw Htoot May'],
 'zma': ['ဒေါ်ဇင်မာအောင်', 'Daw Zin Mar Aung', 'အမေ့သမီး'],
 'wwn': ['ဝေဝေနု', 'Wai Wai Nu'],
 'yu': ['<to-add>', 'Yasmin Ullah'],
 'msp': ['မေစံပယ်ဖြူ', 'May Sabe Phyu'],
 'css': ['ချောစုစံ', 'Chaw Su San'],
 'amaa': ['အေးမြင့်အောင်အောင်', 'Aye Myint Aung Aung'],
 'nw': ['နော်ဝါးခူးရှီး', 'Naw Wah Ku Shee'],
 'ppt': ['ပိုင်ဖြိုးသု', 'Paing Phyoe Thu'],
 'ttk': ['သက်သက်ခိုင်', 'Thet Thet Khaing'],
 'nshhs': ['နော်ဇူဇန်နာ လ

# Get HS Terms

In [5]:
# Read the sheet of search terms and discard rows without a Burmese term.
hsdf = pd.read_csv('../data/keywords/Resources Combined - 2 Search terms specific to women.csv')
hsdf = hsdf.dropna(subset=['BURMESE TERM'])

# 'Term Targets Woman' is used as a boolean mask: keep the Burmese term of
# every row where it is true.
targets_woman = hsdf['Term Targets Woman']
hs_terms = hsdf.loc[targets_woman, 'BURMESE TERM'].tolist()
hs_terms

['ဖာစုကြည်',
 'ဖာဆယ်လို',
 'ကုလားမယား',
 'နီပိတ်မ',
 'ခံဆယ်လို',
 'မွတ်စု',
 'ဘောပြားမ',
 'မုဆိုးမ',
 'ကုလားမ ',
 'ကောင်မ',
 'နို့ပြားမ',
 'စပပျံမ',
 '$ကောင်မ',
 'နီပေါမ',
 'လုံခြည်',
 'လင်ဒရူးမ']

# Load and Process in Loop

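## Algorithm

- Get the total number of rows in `etl_clean`.
- Define `chunksize`, the number of `index` values covered by each batch.
- For each batch: fetch the rows, search them for the HS terms and the people terms, JSON-encode the resulting list columns, and append the batch to the `hs` table.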

In [6]:
# Count the rows of `etl_clean`; the batch loop uses this as its upper bound.
row_count_df = s3.fetch(conn, 'SELECT COUNT("index") FROM etl_clean')
NROWS = int(row_count_df.iloc[0, 0])
print(NROWS)

1338107


In [24]:
# Rebuild `hs` from scratch: dropping the table first means re-running this
# cell does not append a second copy of every batch.
s3.q(con=conn, query='''
DROP TABLE IF EXISTS hs;
''')

chunksize = 100000  # number of consecutive "index" values handled per batch

# `NROWS + 1` keeps the original inclusive upper bound (index_start <= NROWS),
# so the set of index windows visited is unchanged.
for index_start in range(0, NROWS + 1, chunksize):
    index_end = index_start + chunksize - 1  # -1 because sql between is inclusive
    print(f'Processing index={index_start}:{index_end}')

    df = s3.fetch(conn,
        f'SELECT post_url, datetime_posted, msg_clean msg, msg_seg FROM etl_clean WHERE "index" BETWEEN {index_start} AND {index_end};')
    if df.empty:
        # No rows in this window (e.g. the trailing window when NROWS is an
        # exact multiple of chunksize): skip it instead of pushing an empty
        # frame through the keyword search and the insert.
        continue

    # `msg_seg` is stored as JSON text; decode it before searching.
    df['msg_seg'] = df['msg_seg'].apply(json.loads)
    df['datetime_posted'] = pd.to_datetime(df['datetime_posted'])

    # Find HS
    df = KeywordSearch.find_hs(df, hs_terms)
    people_dict = KeywordSearch.search_persons(df, people_terms)
    df = KeywordSearch.format_df(df, people_dict)

    # Serialise the per-person and HS match columns to JSON text before
    # writing; ensure_ascii=False keeps non-ASCII (Burmese) text unescaped.
    for c in people_dict:
        df[c] = [json.dumps(l, ensure_ascii=False) for l in df[c]]
    df['hs_terms_found'] = [json.dumps(l, ensure_ascii=False) for l in df.hs_terms_found]

    s3.insert(table='hs', con=conn, df=df, if_exists='append', chunksize=chunksize)

Processing index=0:99999


100000it [00:41, 2387.80it/s]
100%|██████████| 20/20 [03:23<00:00, 10.16s/it]


None
Processing index=100000:199999


100000it [00:44, 2245.33it/s]
100%|██████████| 20/20 [03:27<00:00, 10.37s/it]


None
Processing index=200000:299999


100000it [00:25, 3894.84it/s]
100%|██████████| 20/20 [02:29<00:00,  7.48s/it]


None
Processing index=300000:399999


100000it [00:42, 2376.55it/s]
100%|██████████| 20/20 [03:21<00:00, 10.07s/it]


None
Processing index=400000:499999


100000it [00:42, 2368.12it/s]
100%|██████████| 20/20 [03:25<00:00, 10.28s/it]


None
Processing index=500000:599999


100000it [00:44, 2227.31it/s]
100%|██████████| 20/20 [03:33<00:00, 10.69s/it]


None
Processing index=600000:699999


100000it [00:48, 2078.90it/s]
100%|██████████| 20/20 [03:37<00:00, 10.86s/it]


None
Processing index=700000:799999


100000it [00:46, 2160.33it/s]
100%|██████████| 20/20 [03:35<00:00, 10.78s/it]


None
Processing index=800000:899999


100000it [00:35, 2781.90it/s]
100%|██████████| 20/20 [03:03<00:00,  9.15s/it]


None
Processing index=900000:999999


100000it [00:34, 2875.08it/s]
100%|██████████| 20/20 [02:56<00:00,  8.85s/it]


None
Processing index=1000000:1099999


100000it [00:33, 2998.12it/s]
100%|██████████| 20/20 [02:51<00:00,  8.57s/it]


None
Processing index=1100000:1199999


100000it [00:30, 3328.55it/s]
100%|██████████| 20/20 [02:41<00:00,  8.08s/it]


None
Processing index=1200000:1299999


100000it [00:35, 2840.16it/s]
100%|██████████| 20/20 [02:57<00:00,  8.86s/it]


None
Processing index=1300000:1399999


38107it [00:13, 2808.83it/s]
100%|██████████| 20/20 [01:09<00:00,  3.46s/it]


None


In [19]:
# Display the frame currently bound to `df`.
# NOTE(review): which frame that is depends on the cell that last assigned
# `df`; this cell's execution count is out of order with its neighbours.
df

Unnamed: 0,date,time,post_url,message,hs_terms_found,AungSanSuuKyi,Pencilo,ThinzarShoonLeiYi,EiThinzarMaung,EstherZeNawBamvo,...,nMaySabePhyu,nChawSuSan,nAyeMyintAungAung,nNawWahKuShee,nPaingPhyoeThu,nThetThetKhaing,nNawSusannaHlaHlaSoe,nNawMayOo,nHtarHtetHtet,nHsTermsFound
0,2022-08-03,01:27:41,https://t.me/justicseeker/7574,အကြိမ် ၁ သန်းမှာ တစ်ကြိမ်ပဲ ရနိုင်တဲ့ ဓာတ်ပုံရ...,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
1,2022-08-03,01:10:06,https://t.me/justicseeker/7573,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
2,2022-08-03,01:09:46,https://t.me/justicseeker/7572,[ အပြစ် ]ဟု မိန့်တော်မူသည် … ။<newline>ရပ်ပြစ်...,[],"[""စု""]",[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
3,2022-08-03,01:09:46,https://t.me/justicseeker/7571,🔴 ဩကာသ ရှင်းတမ်း 🙏🙏🙏<newline>ဗုဒ္ဓဘာသာ မှန်လျှ...,[],"[""စု""]",[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
4,2022-08-03,01:00:41,https://t.me/justicseeker/7570,reactive respond သည် သဘာဝဘေးအန္တရာယ်ကျရောက်ပြီ...,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2022-07-31,18:58:41,https://t.me/justicseeker/7479,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
96,2022-07-31,18:58:41,https://t.me/justicseeker/7478,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
97,2022-07-31,18:58:35,https://t.me/justicseeker/7477,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
98,2022-07-31,18:58:35,https://t.me/justicseeker/7476,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Debug run: push one small index window through the same steps as the batch
# loop, without the JSON encoding and without writing anything.
a, b = 100, 199
print(f'Processing index={a}:{b}')

debug_sql = f'SELECT post_url, datetime_posted, msg_clean msg, msg_seg FROM etl_clean WHERE "index" BETWEEN {a} AND {b};'
df = s3.fetch(conn, debug_sql)

# Decode the JSON-encoded segments and parse the timestamps.
df['msg_seg'] = df['msg_seg'].apply(json.loads)
df['datetime_posted'] = pd.to_datetime(df['datetime_posted'])

# Find HS
df = KeywordSearch.find_hs(df, hs_terms)
people_dict = KeywordSearch.search_persons(df, people_terms)
df = KeywordSearch.format_df(df, people_dict)

Processing index=100:199


100it [00:00, 3737.87it/s]
100%|██████████| 20/20 [00:00<00:00, 126.92it/s]


Unnamed: 0,date,time,post_url,message,hs_terms_found,AungSanSuuKyi,Pencilo,ThinzarShoonLeiYi,EiThinzarMaung,EstherZeNawBamvo,...,nMaySabePhyu,nChawSuSan,nAyeMyintAungAung,nNawWahKuShee,nPaingPhyoeThu,nThetThetKhaing,nNawSusannaHlaHlaSoe,nNawMayOo,nHtarHtetHtet,nHsTermsFound
0,2022-07-31,18:58:35,https://t.me/justicseeker/7474,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
1,2022-07-31,18:58:35,https://t.me/justicseeker/7473,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
2,2022-07-31,18:58:35,https://t.me/justicseeker/7472,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
3,2022-07-31,18:58:35,https://t.me/justicseeker/7471,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
4,2022-07-31,18:58:34,https://t.me/justicseeker/7470,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2022-07-30,17:00:20,https://t.me/justicseeker/7379,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
96,2022-07-30,17:00:20,https://t.me/justicseeker/7378,ရန်ကုန်တိုင်းဒေသကြီး အလုံမြိုနယ်<newline>လွှတ်...,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
97,2022-07-30,16:58:11,https://t.me/justicseeker/7377,,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0
98,2022-07-30,16:58:11,https://t.me/justicseeker/7376,ရွှေဘတောင် လက်မလည်တော့ ဒီနေ့ ထောင်ထမင်းစားချင်...,[],[],[],[],[],[],...,0,0,0,0,0,0,0,0,0,0


In [22]:
# NOTE(review): when the notebook runs top to bottom, `df` is the debug sample
# for index 100..199. The batch loop has already written those rows to `hs`,
# so appending them again would store them twice. The sample's list columns
# have also not been JSON-encoded the way the batch loop encodes them.
# The write is therefore opt-in; flip the flag only for a deliberate test.
APPEND_DEBUG_SAMPLE_TO_HS = False

if APPEND_DEBUG_SAMPLE_TO_HS:
    print(s3.insert(table='hs', con=conn, df=df, if_exists='append', chunksize=chunksize))

None
