In [9]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from Sqlite3Helpers import Sqlite3Functions as s3
import plotly_express as px
import plotly.io as pio
import json
import KeywordSearch

import configparser
config = configparser.ConfigParser()
config.read('../../Scraping/config.ini')

# Setting configuration values
sqlite3db_name = config['Database']['sqlite3db_name']
print(sqlite3db_name)
pio.templates.default = 'plotly_white'

database.db


In [2]:
conn = s3.connect_to_sqlite3(f'../../Scraping/db/{sqlite3db_name}')

In [3]:
def remove_all_spaces(txt):
    return ''.join(txt.split())


def search(msg, terms_dict):
    msg = remove_all_spaces(msg)
    return [
        (k, [t for t in v if t in msg])
        for k,v in terms_dict.items()
    ]


def search_simple(msg, terms):
    msg = remove_all_spaces(msg)
    return [
        t for t in terms
        if t in msg
    ]


def count_terms_found(terms_found):
    for t in terms_found:
        for i in t:
            yield i[0], len(i[1])

# Get People

In [7]:
people = pd.read_csv('../data/keywords/people.csv')
people_ids = [c for c in people.columns if c.startswith('A') and len(c)==3]
print('IDs:', people_ids)

people_terms = {}
for pid in people_ids:
    p = people[[c for c in people.columns if c.startswith(pid)]]
    person_abbr = p.iloc[1,1]
    people_terms[person_abbr] = [t.strip() for t in p[pid].dropna() if t!='-']

people_terms

IDs: ['A01', 'A02']


{nan: ['လူနာမည်', 'name', 'လူ', 'နာမည်'],
 nan: ['လူနာမည်', 'name', 'လူ', 'နာမည်']}

# Get HS Terms

In [8]:
hsdf = pd.read_csv('../data/keywords/hs_terms.csv')
hsdf.dropna(subset=['BURMESE TERM'], inplace=True)
hs_terms = hsdf['BURMESE TERM'].loc[hsdf['Term Targets Woman']].tolist()
hs_terms

['ကောင်မ']

# Load and Process in Loop

## Algo

- Get the total number of rows.
- Define chunksize which will be the size of each batch.

In [10]:
NROWS = int(s3.fetch(conn, 'SELECT COUNT("index") FROM etl_clean').iloc[0,0])
print(NROWS)

854


In [16]:
# Delete `hs`
s3.q(con=conn, query='''
DROP TABLE IF EXISTS hs;
''')

index_start = 0
chunksize = 100000

while index_start <= NROWS:
    index_end = index_start + chunksize - 1  # -1 because sql between is inclusive
    print(f'Processing index={index_start}:{index_end}')

    df = s3.fetch(conn,
        f'SELECT post_url, datetime_posted, msg_clean msg, msg_seg FROM etl_clean WHERE "index" BETWEEN {index_start} AND {index_end};')
    df['msg_seg'] = df.msg_seg.apply(json.loads)
    df['datetime_posted'] = pd.to_datetime(df.datetime_posted)

    # Find HS
    df = KeywordSearch.find_hs(df, hs_terms)
    people_dict = KeywordSearch.search_persons(df, people_terms)
    df = KeywordSearch.format_df(df, people_dict)
    
    for c in people_dict.keys():
        df[c] = [json.dumps(l, ensure_ascii=False) for l in df[c]]
    df['hs_terms_found'] = [json.dumps(l, ensure_ascii=False) for l in df.hs_terms_found]
    
    s3.insert(table='hs', con=conn, df=df, if_exists='append', chunksize=chunksize)

    index_start = index_end + 1

Processing index=0:99999


854it [00:00, 22739.42it/s]
100%|██████████| 2/2 [00:00<00:00, 14.90it/s]


In [17]:
df

Unnamed: 0,date,time,post_url,message,hs_terms_found,name,nname,nHsTermsFound
1,2022-09-06,02:24:11,https://t.me/telethonchat/502027,none,[],[],0,0
2,2022-09-06,02:01:17,https://t.me/telethonchat/502026,,[],[],0,0
3,2022-09-06,00:47:17,https://t.me/telethonchat/502025,result = await client(functions.contacts.getco...,[],[],0,0
4,2022-09-06,00:45:53,https://t.me/telethonchat/502024,help me pls,[],[],0,0
5,2022-09-06,00:45:42,https://t.me/telethonchat/502023,how i can use v1.24 not v1.25?,[],[],0,0
...,...,...,...,...,...,...,...,...
843,2022-09-01,08:37:31,https://t.me/telethonchat/500956,cant,[],[],0,0
844,2022-09-01,08:37:16,https://t.me/telethonchat/500955,save json encoding=utf-8,[],[],0,0
845,2022-09-01,08:17:12,https://t.me/telethonchat/500954,for example reading the first 30 messages of a...,[],[],0,0
846,2022-09-01,08:11:05,https://t.me/telethonchat/500952,does anyone know if i can read the messages of...,[],[],0,0


In [18]:
a, b = 100, 199
print(f'Processing index={a}:{b}')
df = s3.fetch(conn,
    f'SELECT post_url, datetime_posted, msg_clean msg, msg_seg FROM etl_clean WHERE "index" BETWEEN {a} AND {b};')
df['msg_seg'] = df.msg_seg.apply(json.loads)
df['datetime_posted'] = pd.to_datetime(df.datetime_posted)

# Find HS
df = KeywordSearch.find_hs(df, hs_terms)
people_dict = KeywordSearch.search_persons(df, people_terms)
df = KeywordSearch.format_df(df, people_dict)

Processing index=100:199


100it [00:00, 21452.05it/s]
100%|██████████| 2/2 [00:00<00:00, 111.32it/s]


In [20]:
print(s3.insert(table='hs', con=conn, df=df.applymap(str), if_exists='append', chunksize=chunksize))

None
