In [11]:
# -*- coding: utf-8 -*-
#!/usr/bin/python
import matplotlib.pyplot as plt
import pandas as pd
import dateparser

In [12]:
def log_reader(in_filepath):
    """
    Read a StoRM log file into a list of lines (str).

    Receive the path (str) to an unstructured log file.
    Return a list with one element per line of the file, each stripped of
    leading and trailing whitespace (newline included).
    """
    # The context manager closes the file even when reading raises midway,
    # which a manual open()/close() pair does not guarantee.
    with open(in_filepath, "r") as input_file:
        return [line.strip() for line in input_file]

In [13]:
def log_tabler(listed_log):
    """
    Transform a log's list of lines (str) into a dictionary of columns.

    Receive a list where each element is one (StoRM) log line.
    Return a table (dict) with the keys 'DATE', 'TIMESTAMP', 'THREAD',
    'TYPE', 'TOKEN' and 'MESSAGE'; each value is a list holding one entry
    per input line.
    Raise IndexError when a line does not contain the expected fields.
    Print a progress message every 100000 lines and "END" once all lines
    have been parsed.
    """
    # NOTE: the original author flagged the timestamp column as unfinished.
    date, time_stamp, thread, tipe, token, message = [], [], [], [], [], []
    total = len(listed_log)
    for it, line in enumerate(listed_log):
        # The first 18 characters are stored as the date and also handed to
        # dateparser.storm_dtpars to build the timestamp.
        raw_date = line[:18]
        # One split serves both space-delimited columns: item 3 of a
        # 7-way split is the same token that a 4-way split yields.
        fields = line.split(" ", 7)
        date.append(raw_date)
        time_stamp.append(dateparser.storm_dtpars(raw_date))
        thread.append(fields[3])
        tipe.append(fields[6])
        # The token is the text between the first '[' and the following ']'.
        token.append(line.split("[", 1)[1].split("]", 1)[0])
        # The message is everything after the third ':' of the line.
        message.append(line.split(":", 3)[3].strip())
        if it % 100000 == 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(" parsed line {0} of {1} lines".format(it, total))

    # Reported after the loop: the counter stops at total - 1, so an
    # "it == total" check inside the loop can never fire.
    print("END")

    log_table = {'DATE': date, 'TIMESTAMP': time_stamp, 'THREAD': thread,
                 'TYPE': tipe, 'TOKEN': token, 'MESSAGE': message}
    return log_table

In [14]:
def csver(log_table,out_filepath):
    """
    Write a log table (dictionary) to a .csv file.

    Receive a table (dict) where each key is a log column, and
           a string with the output path and file name (no extension).
    Return None.
    Produce the structured file <out_filepath>.csv, written without the
    index column.
    """
    column_order = ['DATE', 'TIMESTAMP', 'TYPE', 'THREAD', 'TOKEN', 'MESSAGE']

    # Build the frame, then pick the columns in the fixed output order.
    ordered = pd.DataFrame.from_dict(log_table)[column_order]

    target = out_filepath + '.csv'
    ordered.to_csv(target, index=False)

In [15]:
def csver_small(log_table,out_filepath,start, end):
    """
    Write a slice of a log table (dictionary) to a .csv file.

    Receive a table (dict) where each key is a log column,
           a string with the output path and file name (no extension),
           the start and end bounds of the row slice to export.
    Return None.
    Produce the structured file <out_filepath>.csv holding only the rows
    in [start:end], written without the index column.
    """
    column_order = ['DATE', 'TIMESTAMP', 'TYPE', 'THREAD', 'TOKEN', 'MESSAGE']

    # Build the frame, then pick the columns in the fixed output order.
    ordered = pd.DataFrame.from_dict(log_table)[column_order]

    # Keep only the requested rows before writing.
    subset = ordered[slice(start, end)]
    target = out_filepath + '.csv'
    subset.to_csv(target, index=False)

In [16]:
def msger(log_table,filepath):
    """
    Write a log table (dictionary) to a msgpack file.

    Receive a table (dict) where each key is a log column, and
           a string with the output path and file name (no extension).
    Return None.
    Produce the msgpack file <filepath>.msg.
    """
    # NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and
    # removed in pandas 1.0, so this needs an older pandas release.
    column_order = ['DATE', 'TIMESTAMP', 'TYPE', 'THREAD', 'TOKEN', 'MESSAGE']
    ordered = pd.DataFrame.from_dict(log_table)[column_order]
    destination = filepath + '.msg'
    ordered.to_msgpack(destination)

In [17]:
def msger_small(log_table,filepath,start,end):
    """
    Write a slice of a log table (dictionary) to a msgpack file.

    Receive a table (dict) where each key is a log column,
           a string with the output path and file name (no extension),
           the start and end bounds of the row slice to export.
    Return None.
    Produce the msgpack file <filepath>_small.msg holding only the rows
    in [start:end].
    """
    # NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and
    # removed in pandas 1.0, so this needs an older pandas release.
    column_order = ['DATE', 'TIMESTAMP', 'TYPE', 'THREAD', 'TOKEN', 'MESSAGE']
    ordered = pd.DataFrame.from_dict(log_table)[column_order]
    subset = ordered[slice(start, end)]
    destination = filepath + '_small' + '.msg'
    subset.to_msgpack(destination)

#### ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [18]:
# Time the whole pipeline: read the raw log, parse every line into columns,
# then write rows 0-10000 to "<path>_small.msg".
%time msger_small(log_tabler(log_reader("log_non_strutturati/storm-frontend-server.log-20180901")),"log_non_strutturati/storm-frontend-server.log-20180901",0,10000)

 parsed line 0 of 16727293 lines
 parsed line 100000 of 16727293 lines
 parsed line 200000 of 16727293 lines
 parsed line 300000 of 16727293 lines
 parsed line 400000 of 16727293 lines
 parsed line 500000 of 16727293 lines
 parsed line 600000 of 16727293 lines
 parsed line 700000 of 16727293 lines
 parsed line 800000 of 16727293 lines
 parsed line 900000 of 16727293 lines
 parsed line 1000000 of 16727293 lines
 parsed line 1100000 of 16727293 lines
 parsed line 1200000 of 16727293 lines
 parsed line 1300000 of 16727293 lines
 parsed line 1400000 of 16727293 lines
 parsed line 1500000 of 16727293 lines
 parsed line 1600000 of 16727293 lines
 parsed line 1700000 of 16727293 lines
 parsed line 1800000 of 16727293 lines
 parsed line 1900000 of 16727293 lines
 parsed line 2000000 of 16727293 lines
 parsed line 2100000 of 16727293 lines
 parsed line 2200000 of 16727293 lines
 parsed line 2300000 of 16727293 lines
 parsed line 2400000 of 16727293 lines
 parsed line 2500000 of 16727293 lines
 

(16ML lines log)<br>
csver => CPU times: user 11min 48s, sys: 17.7 s, total: 12min 6s Wall time: 12min 10s <br> msger => CPU times: user 10min 44s, sys: 18.7 s, total: 11min 2s Wall time: 11min 10s
<br>
<span style='color:red'> Un grosso gap da risolvere è che, anche per creare log strutturati parziali, con il codice fatto in questo modo parsiamo prima tutto il log, **anche perché ci giochiamo facilmente 14 GB di RAM**, il che non è esattamente una cosa efficiente. Prima o poi questa cosa va cambiata. </span>

test per importazione

In [None]:
def msg_to_df(file_path):
    """
    Load a msgpack file through pandas.

    Receive the path (str) of the msgpack file to read.
    Return whatever pd.read_msgpack yields for that file.
    """
    # NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and
    # removed in pandas 1.0, so this needs an older pandas release.
    return pd.read_msgpack(file_path)

In [None]:
def csv_to_df(file_path):
    """
    Load a .csv file into a pandas DataFrame.

    Receive the path (str) of the .csv file to read.
    Return the DataFrame produced by pd.read_csv with default options.
    """
    return pd.read_csv(file_path)

In [None]:
# Time loading the msgpack dump back through msg_to_df.
%time a = msg_to_df("/home/gabriele/Documenti/storm-frontend-server.log-20180901.msg")

In [None]:
%time b = csv_to_df("/home/gabriele/Documenti/storm-frontend-server.log-20180901.csv",d)

In [None]:
# Clear the interactive namespace once the import tests are done.
%reset