In [12]:
import tarfile
import os
import sys
import logging
import re
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
from shapely.geometry import LineString, Polygon
from PIL import Image, ImageDraw

In [20]:
def extract(tar_path, target_path = 'tmp', mode = 'r:gz', logger = None):
    """
    input:
        tar_path string:
            path of tar file
        target_path string:
            path of target file
        mode string:
            mode of tar 
    """
    try:
        tar = tarfile.open(tar_path, mode)
        file_names = tar.getnames()
        # make sure that only two files contain
        #assert(len(file_names) == 2)
        # make sure the name of the files is right
        #TODO
        confs = []
        results = []
        if logger:
            logger.info('start extracting data from file: '+tar_path)
        file_names = [li for li in file_names if li.split('.')[-1] == 'txt']
        # logging start info
        for file_name in file_names:
            f = tar.extractfile(file_name)
            if 'configuration' in file_name:
                # use the configuration filename to get the result filename
                result_filename = file_name[:-18]+'.txt'
                # make sure the result file exist
                if result_filename not in file_names:
                    if logger:
                        logger.info('result file not found')
                    continue
                # extract the configuration file 
                f = tar.extractfile(file_name)
                conf = extract_configuration(file_name, f)
                # extract the result file
                f = tar.extractfile(result_filename)
                result = extract_result(file_name, f)
                # if mismatch 
                if len(conf) == len(result):
                    confs.append(conf)
                    results.append(result)
                elif len(conf) > len(result):
                    conf = conf[:len(result)]
                    confs.append(conf)
                    results.append(result)
                else:
                    if logger:
                        logger.error('Data mismatch in:'+ file_name)
                    else:
                        print('Data mismatch in:'+ file_name)
        tar.close()
        # combine conf and result, works only when all the task have same long 
        conf = pd.concat(confs, axis = 0)
        result = pd.concat(results, axis = 0)
        target = pd.concat([conf, result], axis = 1)
        if logger:
            logger.info('end extracting')
        return target
    except:
        raise

def extract_configuration(filename, f):
    """
    input:
        filename string:
            the name of the file, only works when the filename doesn't contain other number
        f file object:
            the file object
    """
    conf_columns = ['Jobid', 'Rot', 'List of Coordinates', 'Shapely Polygon']
    conf = pd.DataFrame(columns = conf_columns)
    for row_index, line in enumerate(f):
        l = line.decode("utf-8").strip()
        l = re.sub(r'\s+', ' ', l)
        ws = l.split(' ')
        assert(len(ws)>=2)
        conf.loc[row_index, 'Jobid'] = re.sub('[a-zA-Z/.]', '', filename)+str(row_index)
        conf.loc[row_index, 'Rot'] = ws[0]
        tmp_coor = fix_coordinate(ws[1:])
        conf.loc[row_index, 'List of Coordinates'] = tmp_coor
        conf.loc[row_index, 'Shapely Polygon'] = Polygon(tmp_coor)
    return conf

def extract_result(filename, f):
    """
    input:
        filename string:
            the name of the file
        f file object:
            the file object    
    """
    result_columns = ['Metric1', 'Metric2', 'Metric3', 'Metric4']
    result = pd.DataFrame(columns = result_columns)
    for row_index, line in enumerate(f):
        l = line.decode("utf-8").strip()
        l = re.sub(r'\s+', ' ', l)
        ws = l.split(' ')
        if len(ws) != 6:
            print(ws)
        result.loc[row_index, 'Metric1'] = ws[2]
        result.loc[row_index, 'Metric2'] = ws[3]
        result.loc[row_index, 'Metric3'] = ws[4]
        result.loc[row_index, 'Metric4'] = ws[5]
    return result

def fix_coordinate(ws, logger = None):
    out = []
    out2 = []
    for w in ws:
        if w.count('.') == 1:
            out.append(float(w))
        elif w.count('.') > 1:
            # fix the problem
            tmp = []
            get_first(w, tmp)
            out.extend(tmp)
        else:
            # error??
            if logger:
                logger.error('coordinate error')
    if len(out)%2 != 0:
        logger.error('coordinate error')
    for i in range(0,len(out),2):
        out2.append((out[i],out[i+1]))
    return out2



# Main

In [23]:
# set logging
logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.FileHandler('../log/extract_data.log')
handler.setLevel(logging.INFO)
formater = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formater)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
#logger.info('This is a log info')

In [14]:
# 
# read filename
filenames = os.listdir('../data/data_from_sample_6/')
filenames = [li for li in filenames if li.split('.')[-1] == 'tgz']
preffix = '../data/data_from_sample_6/'
filenames = [preffix+li for li in filenames]

In [26]:
# extract file
data = []
for filename in filenames[:1000]:
    data.append(extract(filename, logger = logger))
data = pd.concat(data, axis = 0)

2019-09-25 17:34:47,571 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2535_results.tgz
2019-09-25 17:34:56,113 - __main__ - INFO - end extracting
2019-09-25 17:34:56,155 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2065_results.tgz
2019-09-25 17:35:03,723 - __main__ - INFO - end extracting
2019-09-25 17:35:03,759 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2913_results.tgz
2019-09-25 17:35:10,659 - __main__ - INFO - end extracting
2019-09-25 17:35:10,679 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2781_results.tgz
2019-09-25 17:35:16,729 - __main__ - INFO - end extracting
2019-09-25 17:35:16,776 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2871_results.tgz
2019-09-25 17:35:24,094 - __main__ - INFO - end extracting
2019-09-25 17:35:24,120 - __main__ - INFO - start extracting data from

2019-09-25 17:40:03,988 - __main__ - INFO - end extracting
2019-09-25 17:40:04,027 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2843_results.tgz
2019-09-25 17:40:10,118 - __main__ - INFO - end extracting
2019-09-25 17:40:10,138 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2465_results.tgz
2019-09-25 17:40:16,107 - __main__ - INFO - end extracting
2019-09-25 17:40:16,127 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2135_results.tgz
2019-09-25 17:40:23,381 - __main__ - INFO - end extracting
2019-09-25 17:40:23,403 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-221_results.tgz
2019-09-25 17:40:31,120 - __main__ - INFO - end extracting
2019-09-25 17:40:31,140 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2672_results.tgz
2019-09-25 17:40:37,181 - __main__ - INFO - end extracting
2019-09-25 1

2019-09-25 17:45:09,025 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2649_results.tgz
2019-09-25 17:45:17,951 - __main__ - INFO - end extracting
2019-09-25 17:45:17,977 - __main__ - INFO - start extracting data from file: ../data/data_from_sample_6/35126-2319_results.tgz


KeyboardInterrupt: 

In [30]:
data['string'] = data['List of Coordinates'].map(lambda x: str(x))

In [33]:
li_p = data['string'].unique()

In [32]:
data.head()

Unnamed: 0,Jobid,Rot,List of Coordinates,Shapely Polygon,Metric1,Metric2,Metric3,Metric4,len,string
0,35126-2535_10_0,0.0,"[(202.0, 0.0), (0.0, 0.0), (0.0, 40.0), (101.0...","POLYGON ((202 0, 0 0, 0 40, 101 87, 202 40, 20...",1.0,1.0,1.0,4.0,9,"[(202.0, 0.0), (0.0, 0.0), (0.0, 40.0), (101.0..."
1,35126-2535_10_1,90.0,"[(117.4, 156.1), (117.4, 108.0), (78.2, 108.0)...","POLYGON ((117.4 156.1, 117.4 108, 78.2 108, 78...",2.0,1.0,206.0,4.0,26,"[(117.4, 156.1), (117.4, 108.0), (78.2, 108.0)..."
2,35126-2535_10_2,90.0,"[(1.0, 123.3), (0.0, 122.3), (0.0, 93.3), (1.0...","POLYGON ((1 123.3, 0 122.3, 0 93.3, 1 92.3, 21...",3.0,1.0,365.0,4.0,54,"[(1.0, 123.3), (0.0, 122.3), (0.0, 93.3), (1.0..."
3,35126-2535_10_3,90.0,"[(137.0, 61.0), (136.0, 61.0), (136.0, 62.0), ...","POLYGON ((137 61, 136 61, 136 62, 136 62, 136 ...",4.0,1.0,491.0,4.0,29,"[(137.0, 61.0), (136.0, 61.0), (136.0, 62.0), ..."
4,35126-2535_10_4,0.0,"[(137.0, 61.0), (136.0, 61.0), (136.0, 62.0), ...","POLYGON ((137 61, 136 61, 136 62, 136 62, 136 ...",5.0,1.0,624.0,4.0,29,"[(137.0, 61.0), (136.0, 61.0), (136.0, 62.0), ..."


In [55]:
pols = []
for i in li_p:
    pols.append(data[data['string'] == i].iloc[[0]])

In [56]:
pols = pd.concat(pols).reset_index().iloc[:, 1:]

In [57]:
pols = pols[['string', 'List of Coordinates']]

In [54]:
def draw_poly(x):
    img = Image.new('L', (128, 350), 0)
    ImageDraw.Draw(img).polygon(x, outline = 1, fill = 255)
    mask = np.array(img)
    return mask

In [67]:
pols['matrix'] = pols['List of Coordinates'].map(lambda x: draw_poly(x))

In [68]:
pols

Unnamed: 0,string,List of Coordinates,matrix
0,"[(202.0, 0.0), (0.0, 0.0), (0.0, 40.0), (101.0...","[(202.0, 0.0), (0.0, 0.0), (0.0, 40.0), (101.0...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,"[(117.4, 156.1), (117.4, 108.0), (78.2, 108.0)...","[(117.4, 156.1), (117.4, 108.0), (78.2, 108.0)...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,"[(1.0, 123.3), (0.0, 122.3), (0.0, 93.3), (1.0...","[(1.0, 123.3), (0.0, 122.3), (0.0, 93.3), (1.0...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"[(137.0, 61.0), (136.0, 61.0), (136.0, 62.0), ...","[(137.0, 61.0), (136.0, 61.0), (136.0, 62.0), ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,"[(166.0, 112.0), (166.0, 0.0), (0.0, 0.0), (0....","[(166.0, 112.0), (166.0, 0.0), (0.0, 0.0), (0....","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
5,"[(170.0, 154.0), (0.0, 154.0), (0.0, 154.0), (...","[(170.0, 154.0), (0.0, 154.0), (0.0, 154.0), (...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
6,"[(122.0, 136.0), (122.0, 134.0), (120.0, 134.0...","[(122.0, 136.0), (122.0, 134.0), (120.0, 134.0...","[[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
7,"[(0.0, 4.0), (0.0, 158.0), (4.0, 158.0), (5.0,...","[(0.0, 4.0), (0.0, 158.0), (4.0, 158.0), (5.0,...","[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
8,"[(14.0, 8.0), (26.0, 8.0), (27.0, 7.0), (28.0,...","[(14.0, 8.0), (26.0, 8.0), (27.0, 7.0), (28.0,...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
9,"[(191.0, 68.0), (195.0, 64.0), (195.0, 16.0), ...","[(191.0, 68.0), (195.0, 64.0), (195.0, 16.0), ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


# create metrix from points

In [139]:
def draw_poly(x, inner = None):
    img = Image.new('L', (350, 128), 0)
    ImageDraw.Draw(img).polygon(x, outline = 1, fill = 255)
    if inner:
        ImageDraw.Draw(img).polygon(inner, outline = 1, fill = 0)
    mask = np.array(img)
    return mask, img

In [140]:
ratio = 1
with open('../data/points.txt', 'r') as f:
    masks = []
    for row_index, line in enumerate(f):
        #l = line.decode("utf-8").strip()
        l = line.strip()
        l = re.sub(r'\s+', ' ', l)
        ws = l.split(';')
        ws = [w.strip() for w in ws]
        ws = [(int(float(w.split(' ')[0])/ratio), int(float(w.split(' ')[1]))/ratio) if len(w.split(' ')) > 1 else '--' for w in ws ]
        index = -1
        inner = None
        try:
            index = ws.index('--')
        except:
            index = -1
        if index == -1:
            outer = ws
        else:
            outer = ws[:index]
            inner = ws[index+1:]
        if inner:
            mask, img = draw_poly(outer, inner)
        else:
            mask, img = draw_poly(outer)   
        masks.append(mask)