In [1]:
import traitlets
import os
import pandas as pd
import logging
import random
LOG_FILE = '../data/log/etl.log'

def initLog():
    handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes = 1024*1024, backupCount = 1000)
    fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
    formatter = logging.Formatter(fmt)   # 实例化formatter  
    handler.setFormatter(formatter)      # 为handler添加formatter  

    logger = logging.getLogger('tst')    # 获取名为tst的logger  
    logger.addHandler(handler)           # 为logger添加handler  
    logger.setLevel(logging.DEBUG)  
    return logger

logger = initLog()
logger.setLevel(logging.DEBUG)

In [2]:
import sys 
sys.path.append('libs/')
from table_v3 import TableChart

In [43]:
class CsvData(traitlets.HasTraits):
    
    _supportType = ['float32','float64','float', 'int','int8','int16','int32','int64', 'bool', 'datetime64','timedelta','category','object']
    _datapath = traitlets.Unicode()
    _samples = 10000
    _sep = ','
    _header = 0
    _codec = None
    #批次读取大小
    _chunksize = 1000
    _cachepath = '../data/cache/'
    _meta = None
    _df = pd.DataFrame()
    _data_change=traitlets.Float()
       
    def __init__(self,datapath,samples=10000,sep=',',header=0,codec='utf8', chunksize=1000,usecache=True,cachepath=None):
        if samples and isinstance(samples,int) and (samples>0):
            self._samples = samples
        if chunksize and isinstance(chunksize,int) and (chunksize>0):
            self._chunksize = chunksize
        if header and isinstance(header,int) and (header>0):
            self._header = header
        if isinstance(usecache,bool):
            self._usecache = usecache
        if sep:
            self._sep = sep
            print(self._sep,sep)
        if codec:
            self._codec = codec
        if os.path.isfile(datapath):
            self._datapath = datapath
            self._linetotal = self.estimateLen()
        if cachepath and os.path.isdir(cachepath):
            self._cachepath = cachepath

    def loadSamples(self,samples=None,usecache=True,cachename=None):
        logger.debug('begin loading sample')
        if isinstance(usecache,bool):
            self._usecache = usecache
        if not(self._usecache):
            self.takeSamples()
        else:
            if cachename :
                filename = cachename
            else:
                name = os.path.splitext(os.path.split(self._datapath)[1])[0]
                filename = os.path.join(self._cachepath,name+'.hd5')
            try:
                self._df = pd.read_hdf(filename)
                logger.info('success in loading cache from file:{}'.format(filename))
            except Exception as err:
                logger.exception('fail in loading cache from file:{}\nbegin taking {} samples,\nbecause '.format(filename,self._samples,str(err)))              
                self.takeSamples()
        self.dataRefreshed()
        return self

    
    def takeSamples(self,samples=None):
        logger.debug('begin taking sample')
        if samples and isinstance(samples,int) and (samples>0):
            self._samples = samples
        print(self)
        chunker = pd.read_csv(self._datapath,sep=self._sep,header=self._header,chunksize=self._chunksize,encoding=self._codec)
        sampleSize = round(self._samples / (self._linetotal / self._chunksize))
        for piece in chunker:
            self._df = pd.concat([self._df,piece.sample(n=sampleSize)])
        realSize = self._df.shape[0]
        last = self._samples - realSize
        self._df = pd.concat([self._df,piece.sample(n=last)])
        if self._usecache:
            self.cacheSamples()
        self.object2category()
        return self
                     
    def cacheSamples(self):
        logger.debug('begin caching sample')
        name = os.path.splitext(os.path.split(self._datapath)[1])[0]
        filename = os.path.join(self._cachepath,name+'.hd5')
        if os.access(self._cachepath, os.W_OK):
            if not(os.path.isfile(filename)) or os.access(filename,os.W_OK):
                self._df.to_hdf(filename,'default')
            else:
                raise Exception('can not create file:{}'.format(filename))
        return self
    
        
    def estimateLen(self):
        with open(self._datapath) as file:
            size = 0
            for i,line in enumerate(file):
                size = size +len(line)
                if i >150:
                    break
            lineSize = size /150
            fileSize = os.path.getsize(self._datapath)
            return int(fileSize / lineSize)

   
    #对object类型的列，若唯一值占比小于50%，则转化为category类型，以节约存储
    def object2category(self):
        num_total = self._df.shape[0]
        for col in self._df:
            if self._df[col].dtype == 'object':
                num_unique = len(self._df[col].unique())                
                if num_unique < num_total *0.5:
                    self._df[col] = self._df[col].astype('category')
        self.dataRefreshed()
        return self

    def convertType(self,col,typename):
        if typename not in self._supportType:
            raise Exception('type must be in list[{}]'.format(','.join( self._supportType)))
        if col not in self._df:
            raise Exception('col[{}] is not a column of the data'.format(col))
        self._df[col] = self._df[col].astype(typename,errors='ignore')
        self.dataRefreshed()
        return self
    
    @property
    def header(self):
        return self._df.columns.values.tolist()
    
    @header.setter
    def header(self,header=[]):
        self._df.columns = header
        self.dataRefreshed()
    
    @property
    def data(self):
        return self._df.values.tolist()
    
    #数据更新标识
    def dataRefreshed(self):
        self._data_change = random.random()
        

In [44]:
class EtlManager():
    
    _data = None
    _output = None
    _show = False
    
    def __init__(self,path):
        self._data = self.loadData(path)
        self._output = self.initTable()
        self._data.observe(self.eventDatachanged,'_data_change')
        
    def eventDatachanged(self,change):
        self.refreshOutput()
    
    def refreshOutput(self):
        self._output.model.header = self._data.header
        self._output.model.data = self._data.data
    
    def loadData(self,path,samples=30,chunksize=11,usecache=True):
        data = CsvData(path,samples=samples,sep=',',chunksize=chunksize,usecache=usecache)
        data.loadSamples()
        return data
    
    def initTable(self):
        return TableChart(self._data.header,self._data.data)
    
    def run(self):
        while(True):
            self.cmdPrompt()
        
    def cmdPrompt(self):
        pass

    def showTable(self,enable=True):
        if isinstance(enable,bool):
            self._show = enable
        else:
            raise Exception('display must be bool value')
        if self._show:
            self.show_handle = display(self._output)
        else:
            self._output.close()
            self._show = False

In [45]:
m = EtlManager('../data/iris.csv')
m.showTable()

, ,
<__main__.CsvData object at 0x7febe0bc5080>


A Jupyter Widget