In [1]:
import glob
import pandas as pd
import gzip
import os
from src.creds import drive_root

# MEX

In [2]:

mex_root = drive_root + "MEX/"
fns = []
for fn in glob.glob(mex_root+'??????/*'):
    fns.append(fn)
df = pd.DataFrame(fns, columns=['fn'])
df['fn-no-pre'] = df.fn.apply(lambda x: x.replace(mex_root, ''))
df['month'] = df['fn-no-pre'].apply(lambda x: x.split('/')[0])
df['fn-no-date'] = df['fn-no-pre'].apply(lambda x: x.split('/')[1].rsplit('_', 1)[0])

df_gp = df.groupby(['month', 'fn-no-date']).size()
# df_gp.to_csv('data/MEX_structure.csv')

# GUAT

In [2]:
gual_root = drive_root + 'GUAT/'


## YYYYMM/cells.dat.gz and guatemala/cells.dat.gz are the same 

In [3]:
cells = []
for m in range(2, 9):
    folder = '20140%d' % m
    path = gual_root + '%s/cells.dat' % folder
    if os.path.exists(path):
        with open(path, 'rb') as f:
            cells.append(f.read())
    else:
        with gzip.open(path + '.gz', 'rb') as f:
            cells.append(f.read())

first = cells[0]
for i, c in enumerate(cells):
    print(i, c==first)

0 True
1 True
2 True
3 True
4 True
5 True
6 True


In [4]:
cell_path = gual_root + 'guatemala/cells.dat.gz'
with gzip.open(cell_path, 'r') as f:
    cell = f.read()
cell==first

True

## number of columns for calls, mms, sms, subs and tele

In [None]:
for prefix in ['calls', 'mms', 'sms']:
    print('----------------- ' + prefix)
    objs = []

    for date in ['20140201', '20140207', '20140302', '20140419']:
        path = gual_root + 'guatemala/%s%s.dat.gz'% (prefix, date)
        with gzip.open(path, 'r') as f:
            lines = []
            for i, line in enumerate(f):
                lines.append(line.split(b'|'))
                if i>=100:
                    break
            objs.append(lines)
    
    for lines in objs:
        print(pd.DataFrame(lines).shape)

----------------- calls
(101, 15)
(101, 15)
(101, 15)
(101, 15)
----------------- mms
(101, 10)
(101, 10)
(101, 10)
(101, 10)
----------------- sms
(101, 10)
(101, 10)
(101, 10)
(101, 10)


In [3]:
subs = []
for date in ['20140228', '20140331', '20140430', '201405', '201406','201407', '201408']:
    print(date)
    path = gual_root + 'guatemala/subscribers%s.dat.gz'% (date)
    with gzip.open(path, 'r') as f:
        lines = []
        for i, line in enumerate(f):
            lines.append(line.split(b'|'))
            if i > 100:
                break
    subs.append(pd.DataFrame(lines))
    

20140228
20140331
20140430
201405
201406
201407
201408


In [40]:
path = gual_root + 'guatemala/telefono_55010033.txt'

with open(path, 'rb') as f:
    tele = f.readlines()

tele = pd.DataFrame([t.split(b'|') for t in tele])

In [39]:
tele.shape

(85110, 15)

## termination code

In [11]:
from collections import Counter

In [14]:
for prefix in ['calls', 'mms', 'sms']:
    print('----------------- ' + prefix)

    for date in ['20140201', '20140207', '20140302', '20140419']:
        path = gual_root + 'guatemala/%s%s.dat.gz'% (prefix, date)
        with gzip.open(path, 'r') as f:
            codes = []
            roaming = []
            for i, line in enumerate(f):
                tokens = line.split(b'|')
                codes.append(tokens[-3])
                roaming.append(tokens[-1])

        print(date, Counter(codes), Counter(roaming))

----------------- calls
20140201 Counter({b'': 3235985}) Counter({b'NOROAMI\n': 3235985})
20140207 Counter({b'': 7827872}) Counter({b'NOROAMI\n': 7827872})
20140302 Counter({b'': 6264827}) Counter({b'NOROAMI\n': 6264827})
20140419 Counter({b'': 5037925}) Counter({b'NOROAMI\n': 5037925})
----------------- mms
20140201 Counter({b'': 2032}) Counter({b'\n': 2032})
20140207 Counter({b'': 2003}) Counter({b'\n': 2003})
20140302 Counter({b'': 1682}) Counter({b'\n': 1682})
20140419 Counter({b'': 1423}) Counter({b'\n': 1423})
----------------- sms
20140201 Counter({b'': 7654134}) Counter({b'NOROAMI\n': 7654134})
20140207 Counter({b'': 13708773}) Counter({b'NOROAMI\n': 13708773})


IndexError: list index out of range

In [16]:
for prefix in ['calls', 'mms', 'sms'][2:]:
    print('----------------- ' + prefix)

    for date in ['20140201', '20140207', '20140302', '20140419'][2:]:
        path = gual_root + 'guatemala/%s%s.dat.gz'% (prefix, date)
        with gzip.open(path, 'r') as f:
            codes = []
            roaming = []
            for i, line in enumerate(f):
                tokens = line.split(b'|')
                try:
                    codes.append(tokens[-3])
                    roaming.append(tokens[-1])
                except IndexError:
                    print(len(tokens), tokens)

        print(date, Counter(codes), Counter(roaming))

----------------- sms
2 [b'53940384', b'533891']
20140302 Counter({b'': 9366360}) Counter({b'NOROAMI\n': 9366360})
20140419 Counter({b'': 9221430}) Counter({b'NOROAMI\n': 9221430})


# COL

In [48]:
def read_top_lines(path, sep=b'|', top=100):
    with gzip.open(path, 'r') as f:
        lines = []
        for i, line in enumerate(f):
            lines.append(line.split(sep))
            if i > top:
                break
    return lines

In [41]:
col_root =  drive_root + 'COL/'

In [45]:
cdr2014_dir = col_root + 'COlombiaCDR2014-01-08/'

## cells are the same each month

In [53]:
prefix = 'CELLS'
cells = []
for date in ['201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408']:
    path = cdr2014_dir + '%s_%s.dat.gz' %(prefix, date)
    with gzip.open(path, 'rb') as f:
        cells.append(f.read())

first = cells[0]
for i, c in enumerate(cells):
    print(i, c==first)
    


pd.DataFrame([f.split(b'|') for f in first.split(b'\r\n')]).shape

0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True


In [61]:
for prefix in ['CLIENTES', 'CLIENTES_OK', 'MMS', 'SMS']:
    print('-------------'+prefix)
    for date in ['201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408']:
        path = cdr2014_dir + '%s_%s.dat.gz' %(prefix, date)
        lines = read_top_lines(path)
        
        print(date, pd.DataFrame(lines).shape)

-------------CLIENTES
201401 (102, 5)
201402 (102, 5)
201403 (102, 5)
201404 (102, 5)
201405 (102, 5)
201406 (102, 5)
201407 (102, 5)
201408 (102, 5)
-------------CLIENTES_OK
201401 (102, 5)
201402 (102, 5)
201403 (102, 5)
201404 (102, 5)
201405 (102, 5)
201406 (102, 5)
201407 (102, 5)
201408 (102, 5)
-------------MMS
201401 (0, 0)
201402 (0, 0)
201403 (0, 0)
201404 (0, 0)
201405 (0, 0)
201406 (0, 0)
201407 (0, 0)
201408 (0, 0)
-------------SMS
201401 (102, 10)
201402 (102, 10)
201403 (102, 10)
201404 (102, 10)
201405 (102, 10)
201406 (102, 10)
201407 (102, 10)
201408 (102, 10)
