In [23]:
import pandas as pd
import numpy as np
import csv
import itertools
from datetime import datetime
import glob
import matplotlib.pyplot as plt

In [24]:
def read_original_df(rute:str = 'dataset/Export_Time.csv') -> pd.DataFrame:
    list_rows = []
    list_rows_with_data = []
    format_string = '%m/%d/%Y %I:%M:%S %p'
    with open(rute, newline='\n') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter=',')
        for i,row in enumerate(csvreader):
            filters = row.get('Point Path').split('\\')
            if row.get('Samples') == '0' or row.get('Data') == None:
                list_rows_with_data.append(row)
            else:
                row['DTS_new'] = datetime.strptime(row.pop('DTS'), format_string)
                row['point'] =  filters[7]
                data = [row.pop('Data')]
                row['newData'] = row.pop(None)
                row['newData'] = list(itertools.chain(data, row['newData']))
                row['newData'] = [float(item) for item in row['newData']]
                list_rows.append(row)
    df = pd.DataFrame(list_rows)
    df = pd.concat([df.drop('newData', axis=1), pd.DataFrame(df['newData'].tolist())], axis=1)
    
    return list_rows

In [25]:
def chunk_df(list_rows: pd.DataFrame, chunks:int = 6) -> None:
    df_chunks:list[pd.DataFrame] = np.array_split(list_rows,chunks)
    for i,chunk in enumerate(df_chunks):
        chunk.to_csv(f'dataset/chunks/Export_Time_{i}.csv',index=False,sep=',')

In [26]:
def read_chunk_data(search_str:str = 'dataset/chunks/Export_Time_*.csv') -> pd.DataFrame:
    all_chunk_files = glob.glob(search_str)
    df_list = [pd.read_csv(chunk_file,sep=',') for chunk_file in all_chunk_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

In [27]:
def filter_df(df:pd.DataFrame,eq_comp:list[str] = ['CONTRAEJE'],point_vibr:list[str]=['7AV','7HV','7VV','8AV','8HV','8VV']):
    df_copy = df.copy()
    path_tag_df = df_copy['Point Path'].str.split( r'\\' , expand = True)
    # df['points'] = path_tag_df[7]
    df_copy = df_copy[(path_tag_df[6].isin(eq_comp))]
    print(df_copy['point'].unique())
    df_copy = df_copy[(path_tag_df[7].isin(point_vibr))]
    df_copy.reset_index(inplace=True,drop=True)
    df_copy.drop(columns=df_copy.loc[:,df_copy.isna().sum() == len(df_copy)].columns, axis=1, inplace=True)
    # columns = [int(x) if x.isdigit() else x for x in df_copy.columns]
    # df_copy.columns = columns
    return df_copy

In [28]:
def fft_freq(df_time:pd.DataFrame):
    df_freq = df_time.copy()
    filter_vibr = [str(i) for i in range(8192)]
    df_freq = df_freq.drop(columns=filter_vibr, axis = 1)
    
    freq_data = []
    for i,row in df_time.iterrows():
        y = np.array(df_time.loc[i,filter_vibr].tolist())
        # vibration_data = data['Vibration'].values
        vibration_data = y
        # Time parameters
        sampling_rate = row['Samples']/row['Max Time']  # Hz, example sampling rate
        n = len(vibration_data)
        time = np.arange(n) / sampling_rate

        # Perform FFT
        freq_domain = np.fft.fft(vibration_data)
        freq = np.fft.fftfreq(n, d=1/sampling_rate)

        x_freq = freq[:n//2]
        y_freq = np.abs(freq_domain)[:n//2] * 1/n

        freq_data.append(y_freq)

    df_freq = pd.concat([df_freq,pd.DataFrame(freq_data)],axis=1)
    columns = [str(x) for x in df_freq.columns]
    df_freq.columns = columns
    return df_freq


In [29]:
# # chunk raw data
# list_rows = read_original_df(rute = 'dataset/Export_Time.csv')
# chunk_df(list_rows,chunks = 15)

In [30]:
# # Work with original data
# df = read_original_df(rute = 'dataset/Export_Time.csv')

In [31]:
# Work with chunk data
df = read_chunk_data(search_str = 'dataset/chunks/Export_Time_*.csv')

In [32]:
df.head()

Unnamed: 0,Point Path,Unit,Detection,Channel,Samples,Max Time,Speed (Hz),Process Value,DTS_new,point,...,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767
0,\Hierarchy\MOLIENDA\MOLINO BOLAS 04\3224-MOLIN...,,Peak,1,8192,3.2,16.0,,2024-06-24 10:42:11,1HV,...,,,,,,,,,,
1,\Hierarchy\MOLIENDA\MOLINO BOLAS 04\3224-MOLIN...,,Peak,1,8192,3.2,16.0,,2024-06-17 11:19:20,1HV,...,,,,,,,,,,
2,\Hierarchy\MOLIENDA\MOLINO BOLAS 04\3224-MOLIN...,,Peak,1,8192,3.2,16.0,,2024-06-10 09:46:03,1HV,...,,,,,,,,,,
3,\Hierarchy\MOLIENDA\MOLINO BOLAS 04\3224-MOLIN...,,Peak,1,8192,3.2,16.0,,2024-06-03 07:45:13,1HV,...,,,,,,,,,,
4,\Hierarchy\MOLIENDA\MOLINO BOLAS 04\3224-MOLIN...,,Peak,1,8192,3.2,16.0,,2024-05-26 09:32:36,1HV,...,,,,,,,,,,


In [33]:
# df_time = filter_df(df,eq_comp = ['CONTRAEJE'],point_vibr=['7AV','7HV','7VV','7HE3','8AV','8HV','8VV','8HE3'])
# df_freq = fft_freq(df_time)
# df_freq.to_csv('dataset/new_data/df_freq.csv',index=False)
# df_time.to_csv('dataset/new_data/df_time.csv',index=False)

In [34]:
df_time = filter_df(df,eq_comp = ['CONTRAEJE'],point_vibr=['7HE3','8HE3'])
df_time.to_csv('dataset/new_data/df_time_filtered.csv',index=False)

['7HV' '7HBaja' '7HE3' '7Hac' '7VV' '7AV' '7AO' '8HV' '8HBaja' '8HE3'
 '8Hac' '8VV' '8AV' '8AO']


  df_copy = df_copy[(path_tag_df[7].isin(point_vibr))]


In [46]:
df_time.head()

Unnamed: 0,point,0,1,2,3,4,5,6,7,8,...,8182,8183,8184,8185,8186,8187,8188,8189,8190,8191
0,7HE3,0.011129,-0.028003,0.014197,0.010107,-0.009144,-0.062354,-0.097606,-0.109307,-0.048638,...,-0.075197,-0.020183,-0.041208,-0.062775,-0.089665,-0.11072,-0.085183,-0.070926,-0.02659,-0.065692
1,7HE3,-0.052365,-0.082886,-0.158905,-0.079438,-0.038264,-0.143786,-0.015176,-0.050076,-0.113802,...,0.622197,-0.00698,-0.003532,0.102809,0.231166,-0.060448,0.217431,0.206014,0.071413,0.008563
2,7HE3,-0.103386,-0.102507,-0.001723,-0.033255,-0.070711,-0.082751,-0.105073,-0.119925,-0.030179,...,0.194889,0.076652,-0.096742,0.002162,0.048336,-0.108905,-0.045066,0.105776,-0.027613,-0.133793
3,7HE3,0.026032,-0.078261,-0.088135,-0.124419,-0.066095,-0.004417,0.025252,-0.15201,-0.105309,...,-0.056717,-0.12312,-0.117002,0.08282,-0.018402,-0.05993,-0.095576,-0.132616,-0.107742,-0.006213
4,7HE3,-0.075893,-0.053834,-0.083611,0.003957,-0.17357,-0.224701,-0.10806,-0.024566,-0.109588,...,-0.197314,-0.147828,-0.041923,-0.089449,-0.125339,0.097168,-0.132078,-0.178585,-0.09756,-0.055088


In [35]:
df_freq = fft_freq(df_time)
df_freq.to_csv('dataset/new_data/df_freq_filtered.csv',index=False)

In [47]:
df_freq.head()

Unnamed: 0,point,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,7HE3,0.046683,0.000261,0.001387,0.001039,0.001908,0.000345,0.000647,0.001954,0.001656,...,0.000218,0.000134,0.000162,0.000268,0.00037,0.000379,0.000378,0.000234,0.000257,8.6e-05
1,7HE3,0.0542,0.000927,0.002023,0.000301,0.000899,0.000901,0.001141,0.002287,0.000891,...,0.000219,0.000382,0.000441,0.000173,0.00049,0.000334,0.000204,0.000385,0.000148,0.00026
2,7HE3,0.054417,0.001476,0.000771,0.001044,0.000865,0.000452,0.000915,0.000723,0.002879,...,0.000158,0.000122,0.000295,0.00017,0.000361,0.001054,0.000247,0.000356,0.000613,0.00013
3,7HE3,0.062481,0.003246,0.000756,0.000754,0.000627,0.000975,0.001108,0.001495,0.00014,...,0.000372,0.000105,0.000244,0.000448,0.000477,0.000742,0.000603,0.000542,0.000359,0.001301
4,7HE3,0.07212,0.002794,0.001215,0.001415,0.000158,0.001252,0.000604,0.000356,0.002434,...,0.000165,0.000345,0.001764,0.000329,0.000535,0.000534,0.000139,0.000184,0.000211,0.00011


In [36]:
df_time.drop(columns=['Point Path', 'Detection', 'Channel', 'Samples', 'Max Time','Speed (Hz)','DTS_new'], axis=1, inplace=True)
df_freq.drop(columns=['Point Path', 'Detection', 'Channel', 'Samples', 'Max Time','Speed (Hz)','DTS_new'], axis=1, inplace= True)

In [37]:
count = int(df_time.groupby('point').agg({'point': ['count']}).reset_index(drop = True).reset_index(drop=True).min())
df_time = df_time.groupby('point').head(int(count))

  count = int(df_time.groupby('point').agg({'point': ['count']}).reset_index(drop = True).reset_index(drop=True).min())


In [38]:
count = int(df_freq.groupby('point').agg({'point': ['count']}).reset_index(drop = True).reset_index(drop=True).min())
df_freq = df_freq.groupby('point').head(int(count))

  count = int(df_freq.groupby('point').agg({'point': ['count']}).reset_index(drop = True).reset_index(drop=True).min())


In [49]:
def transformData(df_input:pd.DataFrame):
    df = df_input.copy()
    columns = [int(x) if x.isdigit() else x for x in df.columns]
    len_vibra = max([x for x in list(columns) if isinstance(x, (int, float))])
    df.loc[df['point'] == '7HE3','target'] = 0
    df.loc[df['point'] == '8HE3','target'] = 1
    df['index'] = df.groupby(['point','target']).cumcount()
    df_time_bad = df[df['point'] == '7HE3']
    df_time_good = df[df['point'] == '8HE3']
    df_time_bad.loc[df['point'] == '7HE3','point'] = 'HE3'
    df_time_good.loc[df['point'] == '8HE3','point'] = 'HE3'

    filter_vibr = [str(i) for i in range(len_vibra)]
    f_pivoted_bad = df_time_bad.pivot(index=['index','target'], columns=['point'], values=filter_vibr)
    f_pivoted_good = df_time_good.pivot(index=['index','target'], columns=['point'], values=filter_vibr)
    f_pivoted_bad.columns = ['_'.join(col).strip() for col in f_pivoted_bad.columns.values]
    f_pivoted_good.columns = ['_'.join(col).strip() for col in f_pivoted_good.columns.values]
    df_time_done = pd.concat([f_pivoted_good,f_pivoted_bad], axis=0)
    df_time_done.reset_index(inplace=True)
    df_time_done.drop(columns=['index'], axis=1, inplace=True)
    return df_time_done

In [50]:
df_time_done = transformData(df_time)
df_freq_done = transformData(df_freq)

In [51]:
df_time_done.head()

Unnamed: 0,target,0_HE3,1_HE3,2_HE3,3_HE3,4_HE3,5_HE3,6_HE3,7_HE3,8_HE3,...,8181_HE3,8182_HE3,8183_HE3,8184_HE3,8185_HE3,8186_HE3,8187_HE3,8188_HE3,8189_HE3,8190_HE3
0,1.0,-0.040887,0.006277,0.004097,0.015771,0.096539,0.0409,-0.033063,-0.011656,0.002947,...,-0.016141,-0.024219,-0.005113,-0.002166,-0.029985,-0.076784,0.021538,0.021411,0.011795,-0.005754
1,1.0,0.005245,-0.194973,-0.274077,-0.037272,-0.020225,-0.20293,0.038206,-0.10831,-0.259723,...,-0.053941,-0.162916,-0.035087,-0.171003,-0.249967,-0.174718,0.04654,-0.153946,-0.060349,0.051795
2,1.0,-0.045757,-0.01614,0.031918,-0.025115,-0.047552,-0.052164,-0.010498,0.006793,0.040769,...,-0.066017,-0.09204,-0.001781,0.032433,-0.005752,-0.05204,-0.074219,-0.005752,0.00474,-0.020117
3,1.0,-0.087019,-0.0256,-0.114839,-0.141508,0.041534,-0.07536,-0.093169,0.022566,-0.112536,...,-0.024959,-0.128311,0.036542,-0.006516,-0.121265,-0.051635,-0.069458,-0.117659,-0.031888,0.036783
4,1.0,-0.067937,-0.078963,0.039743,-0.067937,-0.070118,0.069999,-0.076139,-0.039349,-0.047935,...,-0.06204,0.063465,-0.066657,-0.054728,-0.078061,-0.0455,-0.054086,0.035769,-0.064734,-0.106789


In [52]:
df_freq_done.head()

Unnamed: 0,target,0_HE3,1_HE3,2_HE3,3_HE3,4_HE3,5_HE3,6_HE3,7_HE3,8_HE3,...,4085_HE3,4086_HE3,4087_HE3,4088_HE3,4089_HE3,4090_HE3,4091_HE3,4092_HE3,4093_HE3,4094_HE3
0,1.0,0.01897,0.001095,0.001061,0.001539,0.000536,0.000401,0.000258,0.000753,0.000873,...,0.000122,0.000132,0.000311,0.00017,0.000242,4.6e-05,0.000166,0.000136,0.000124,0.000226
1,1.0,0.096649,0.002541,0.000904,0.000214,0.00083,0.000382,0.000969,0.000542,0.000974,...,0.000846,0.000761,0.000471,0.000286,0.000355,0.000781,0.00058,0.000717,0.000266,0.000606
2,1.0,0.030549,0.0011,0.001033,0.000455,0.000489,0.00033,0.000448,0.000284,0.000363,...,0.000215,0.000316,0.000163,0.000253,0.0002,0.000364,0.000333,0.000202,0.000373,0.000174
3,1.0,0.060288,0.001842,0.000554,0.000437,0.000636,0.000214,0.000661,0.000875,0.000901,...,0.000208,0.000671,0.000537,0.000307,7.7e-05,0.000673,0.000726,0.000581,0.000491,0.000779
4,1.0,0.044945,0.001578,0.001163,0.000463,0.000499,0.000682,0.000298,0.00028,0.00031,...,8.9e-05,0.000156,0.000395,0.000641,0.000265,8e-05,0.000139,7.6e-05,0.000158,8.4e-05


In [53]:
df_time_done.to_csv('dataset/new_data/df_time_done.csv',index=False)
df_freq_done.to_csv('dataset/new_data/df_freq_done.csv',index=False)

In [54]:
def compute_skewness(x):
    
    n = len(x)
    third_moment = np.sum((x - np.mean(x))**3) / n
    s_3 = np.std(x, ddof = 1) ** 3
    return third_moment/s_3

In [55]:
def compute_kurtosis(x):
    
    n = len(x)
    fourth_moment = np.sum((x - np.mean(x))**4) / n
    s_4 = np.std(x, ddof = 1) ** 4
    return fourth_moment / s_4 - 3

In [63]:
df_time.columns[1:5]

Index(['0', '1', '2', '3'], dtype='object')

In [122]:
df_time_done.head()

Unnamed: 0,target,0_HE3,1_HE3,2_HE3,3_HE3,4_HE3,5_HE3,6_HE3,7_HE3,8_HE3,...,8181_HE3,8182_HE3,8183_HE3,8184_HE3,8185_HE3,8186_HE3,8187_HE3,8188_HE3,8189_HE3,8190_HE3
0,1.0,-0.040887,0.006277,0.004097,0.015771,0.096539,0.0409,-0.033063,-0.011656,0.002947,...,-0.016141,-0.024219,-0.005113,-0.002166,-0.029985,-0.076784,0.021538,0.021411,0.011795,-0.005754
1,1.0,0.005245,-0.194973,-0.274077,-0.037272,-0.020225,-0.20293,0.038206,-0.10831,-0.259723,...,-0.053941,-0.162916,-0.035087,-0.171003,-0.249967,-0.174718,0.04654,-0.153946,-0.060349,0.051795
2,1.0,-0.045757,-0.01614,0.031918,-0.025115,-0.047552,-0.052164,-0.010498,0.006793,0.040769,...,-0.066017,-0.09204,-0.001781,0.032433,-0.005752,-0.05204,-0.074219,-0.005752,0.00474,-0.020117
3,1.0,-0.087019,-0.0256,-0.114839,-0.141508,0.041534,-0.07536,-0.093169,0.022566,-0.112536,...,-0.024959,-0.128311,0.036542,-0.006516,-0.121265,-0.051635,-0.069458,-0.117659,-0.031888,0.036783
4,1.0,-0.067937,-0.078963,0.039743,-0.067937,-0.070118,0.069999,-0.076139,-0.039349,-0.047935,...,-0.06204,0.063465,-0.066657,-0.054728,-0.078061,-0.0455,-0.054086,0.035769,-0.064734,-0.106789


In [96]:
df_time_features = pd.DataFrame()

In [98]:
df_time_features['max'] = df_time_done[df_time_done.columns[1:]].max(axis = 1)

In [100]:
df_time_features['min'] = df_time_done[df_time_done.columns[1:]].min(axis = 1)

In [97]:
df_time_features['mean'] = df_time_done[df_time_done.columns[1:]].mean(axis = 1)

In [105]:
df_time_features['std'] = df_time_done[df_time_done.columns[1:]].std(ddof=1,axis = 1)

In [106]:
df_time_features['rms'] = df_time_done[df_time_done.columns[1:]].apply(lambda x: np.sqrt(np.mean(x**2)), axis=1)

In [109]:
df_time_features['skewness'] = df_time_done[df_time_done.columns[1:]].apply(lambda x: compute_skewness(x), axis=1)

In [117]:
df_time_features['kurtosis'] = df_time_done[df_time_done.columns[1:]].apply(lambda x: compute_kurtosis(x), axis=1)

In [118]:
df_time_features['crest_factor'] = df_time_features['max'] / df_time_features['rms']

In [119]:
df_time_features['form_factor'] = df_time_features['rms'] / df_time_features['mean']

In [123]:
df_time_features['target'] = df_time_done['target']

In [124]:
df_time_features.head()

Unnamed: 0,mean,max,min,std,rms,skewness,kurtosis,crest_factor,form_factor,target
0,-0.018963,0.142822,-0.091016,0.029629,0.035176,0.754447,0.790247,4.06018,-1.855001,1.0
1,-0.096633,0.284885,-0.325495,0.107155,0.144287,0.131053,-0.892353,1.97443,-1.493142,1.0
2,-0.030544,0.15642,-0.132173,0.041449,0.051486,0.564729,0.138655,3.038136,-1.685602,1.0
3,-0.060276,0.225913,-0.218432,0.069759,0.09219,0.406804,-0.322002,2.450524,-1.529453,1.0
4,-0.044948,0.176934,-0.150896,0.048619,0.066211,0.632929,0.339129,2.672289,-1.473038,1.0


In [125]:
df_time_features.to_csv('dataset/new_data/df_time_features.csv',index=False)