This notebook extracts tables from the UT measurements report PDF and saves them as CSV files.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tabula

In [2]:
# Location of the UT measurements report, resolved relative to the parent
# of the current working directory.
UT_pdf = os.path.join(
    os.path.dirname(os.getcwd()),
    '../Data/Raw_data/UT Measurements_Report.pdf',
)

In [3]:
# TODO: extract p7_table by hand

In [4]:
# Extract the backscatter / attenuation tables from report pages 8, 29 and 34.
# The cells below use the results as: [0] PIPE backscatter,
# [1] TUBE attenuation, [2] PIPE attenuation.
tables_scatter_atten = tabula.read_pdf(
    UT_pdf,
    pages=[8, 29, 34],
    lattice=True,
)

In [5]:
# PIPE backscatter
# Transposing, resetting the index with drop=True and transposing back
# replaces the extracted column labels with integer positions (0, 1, ...).
table_0 = tables_scatter_atten[0].T.reset_index(drop=True).T
table_0 = table_0.rename(columns={0: 'Sample', 1: 'backscatter_avg'})
table_0.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/PIPE_ut_backscatter.csv')
)

In [6]:
# TUBE attenuation
# Rename the 'Unnamed: 0' column to 'Sample' (in place) before saving.
tube_attenuation = tables_scatter_atten[1]
tube_attenuation.rename(columns={'Unnamed: 0': 'Sample'}, inplace=True)
tube_attenuation.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/TUBE_ut_attenuation.csv')
)

In [7]:
# PIPE attenuation
# Rename the 'Unnamed: 0' column to 'Sample' (in place) before saving.
pipe_attenuation = tables_scatter_atten[2]
pipe_attenuation.rename(columns={'Unnamed: 0': 'Sample'}, inplace=True)
pipe_attenuation.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/PIPE_ut_attenuation.csv')
)

In [8]:
def clean_abs_col(df, suffix='', avg_pow_10=1000, std_pow_10=10000):
    '''
    Remove the useless row and calculate the final absorption data.

    The input frame is NOT modified: every step works on a new object, so
    the function can be called again on the same extracted table (for
    example when a notebook cell is re-run) and gives the same result.

    Args:
     - df: data extracted from tabula. It must have a 'Sample' column, a
           row labelled 0 (which is discarded), and the absorption columns
           either under their raw labels ('Value of internal friction Q-1',
           'Unnamed: 0') or already named 'Absorption_avg' /
           'Absorption_std'.
     - suffix: time window ('str'), appended to every output column name
     - avg_pow_10 : int to divide average value
     - std_pow_10 : int to divide SD value
    return: a new DataFrame indexed by 'Sample' with float columns
            'Absorption_avg<suffix>' and 'Absorption_std<suffix>'
    '''
    # rename / drop / astype each return a new frame, leaving `df` untouched.
    cleaned = (
        df.rename(columns={'Value of internal friction Q-1': 'Absorption_avg',
                           'Unnamed: 0': 'Absorption_std'})
          .drop(0)
          .astype({'Absorption_avg': 'float', 'Absorption_std': 'float'})
    )
    # Rescale the tabulated numbers by the caller-supplied divisors.
    cleaned['Absorption_avg'] = cleaned['Absorption_avg'] / avg_pow_10
    cleaned['Absorption_std'] = cleaned['Absorption_std'] / std_pow_10
    return cleaned.set_index('Sample').add_suffix(suffix)

In [9]:
def merge_abs(df1, df2, suffix='', avg_pow_10=1000, std_pow_10=10000):
    '''
    Stack two extracted table fragments and clean them with clean_abs_col.

    The column labels of each fragment are first moved into its first row.
    After stacking, the first row (df1's labels) is discarded, while df2's
    labels stay in the result as an ordinary row.
    NOTE(review): presumably df2 is a continuation of df1 whose first data
    row was read as a header - confirm against the PDF.

    Args:
     - df1: first fragment extracted from tabula
     - df2: second fragment extracted from tabula
     - suffix: time window ('str')
     - avg_pow_10 : int to divide average value
     - std_pow_10 : int to divide SD value
    return df
    '''
    # T -> reset_index(drop=False) -> T turns the column labels into row one
    # and leaves integer column labels 0, 1, 2 behind.
    fragments = [part.T.reset_index(drop=False).T for part in (df1, df2)]
    stacked = (
        pd.concat(fragments)
          .rename(columns={0: 'Sample',
                           1: 'Absorption_avg',
                           2: 'Absorption_std'})
          .iloc[1:]
          .reset_index(drop=True)
    )
    return clean_abs_col(stacked, suffix, avg_pow_10, std_pow_10)

In [10]:
# Extract the absorption tables; the cells below pick entries of the
# returned list by position.
tables = tabula.read_pdf(
    UT_pdf,
    pages=[54, 55, 56, 57, 58, 60, 61],
    lattice=True,
)

In [11]:
# TUBE absorption, 50 us time window
table_50BT = merge_abs(tables[1], tables[2], suffix='_50')
table_ut_50 = clean_abs_col(tables[0], suffix='_50')

# Save tube 50 us ('Sample' goes back from the index to a regular column)
tube_50 = pd.concat([table_50BT, table_ut_50])
tube_50 = tube_50.reset_index()
tube_50.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/TUBE_ut_absorption_50us.csv')
)
tube_50

Unnamed: 0,Sample,Absorption_avg_50,Absorption_std_50
0,B1,0.001264,0.000519
1,B2,0.002486,0.000816
2,B3,0.001168,0.000555
3,B4,0.002084,0.000648
4,B5,0.001141,0.000447
5,B6,0.001323,0.000574
6,B7,0.001908,0.000926
7,B8,0.000933,0.000304
8,As received,0.001257,0.00021
9,Fully ferritic,0.003377,0.000817


In [12]:
# TUBE absorption, 100 us time window
# This table is read from a CSV in Raw_data instead of the extracted PDF tables.
blind_tubes_100_csv = os.path.join(
    os.path.dirname(os.getcwd()),
    '../Data/Raw_data/UT_missing_100us_Blind tubes.csv',
)
table_100BT = pd.read_csv(blind_tubes_100_csv, header=None)
table_100BT = table_100BT.rename(columns={0: 'Sample',
                                          1: 'Absorption_avg',
                                          2: 'Absorption_std'})
table_100BT = clean_abs_col(table_100BT, '_100', avg_pow_10=10000, std_pow_10=10000)
table_ut_100 = clean_abs_col(tables[3], '_100', avg_pow_10=10000, std_pow_10=10000)

# Save tube 100 us
tube_100 = pd.concat([table_100BT, table_ut_100])
tube_100 = tube_100.reset_index()
tube_100.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/TUBE_ut_absorption_100us.csv')
)


In [13]:
# TUBE absorption, 200 us time window
table_ut_200 = clean_abs_col(tables[5], '_200', avg_pow_10=10000, std_pow_10=100000)
table_200BT = merge_abs(tables[6], tables[7], '_200', avg_pow_10=10000, std_pow_10=100000)

# Save tube 200 us
tube_200 = pd.concat([table_200BT, table_ut_200])
tube_200 = tube_200.reset_index()
tube_200.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/TUBE_ut_absorption_200us.csv')
)


In [14]:
# TUBE absorption, 500 us time window
table_ut_500 = clean_abs_col(tables[8], '_500', avg_pow_10=10000, std_pow_10=100000)
table_500BT = merge_abs(tables[9], tables[10], '_500', avg_pow_10=10000, std_pow_10=100000)

# Save tube 500 us
tube_500 = pd.concat([table_500BT, table_ut_500])
tube_500 = tube_500.reset_index()
tube_500.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/TUBE_ut_absorption_500us.csv')
)


In [15]:
# PIPE absorption, 50 us time window
pipe_ut_50 = clean_abs_col(tables[12], suffix='_50')
pipe_ut_50.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/PIPE_ut_absorption_50us.csv')
)

In [16]:
# PIPE absorption, 100 us time window
# The '_100' suffix is passed so the saved columns follow the same
# 'Absorption_*_<window>' naming as every other absorption table here.
# NOTE(review): the divisors are left at clean_abs_col's defaults
# (avg / 1000, std / 10000), whereas the TUBE 100 us tables use
# 10000 / 10000 - confirm the scale against the report.
pipe_ut_100 = clean_abs_col(tables[13], '_100')
pipe_ut_100.to_csv(
    os.path.join(os.path.dirname(os.getcwd()),
                 '../Data/Intermediate_data/PIPE_ut_absorption_100us.csv')
)