# Data Pipeline for C100 Cavity and Fault Classification 
*March 12, 2020* <br>

In [13]:
from datetime import datetime
# Wall-clock timestamp taken before any other work; the final cell subtracts
# it from datetime.now() to report how long the notebook took to execute.
startTime = datetime.now()

In [14]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
from scipy.stats import uniform, randint
from statsmodels.tsa.ar_model import AR
import warnings
from sklearn import preprocessing
from sklearn.externals import joblib

In [15]:
# Silence FutureWarnings globally for the rest of the notebook.
# NOTE(review): presumably aimed at the deprecation warning emitted by
# statsmodels' `AR` class (superseded by `AutoReg`) - confirm.
warnings.filterwarnings(action='ignore', category=FutureWarning)

def getStatARCoreffs(signals, maxLag, normalize):
    """Fit one AR model per column of ``signals`` and return all parameters as a single row.

    Parameters
    ----------
    signals : 2-D array, indexed as ``signals[:, i]``
        One signal per column. The array is NOT modified; each column is
        copied to float64 before it is scaled or fitted.
    maxLag : int
        Lag passed to ``AR(...).fit``. Every column contributes exactly
        ``maxLag + 1`` values to the output.
    normalize : truthy / falsy
        When truthy, each non-constant column is standardised with the
        module-level ``signalScaler`` before fitting.

    Returns
    -------
    pandas.DataFrame
        One row with ``n_columns * (maxLag + 1)`` values, column blocks in
        input order. A constant column yields a block of zeros. An input
        with zero columns yields a frame with zero columns.

    Notes
    -----
    Relies on two names defined elsewhere in the notebook: ``AR`` (imported
    from ``statsmodels.tsa.ar_model``) and the global ``signalScaler``.
    """
    n_params = maxLag + 1
    blocks = []

    for i in range(0, np.shape(signals)[1]):
        if np.size(np.unique(signals[:, i])) == 1:
            # A constant signal cannot be fitted; represent it by zeros.
            parameters = np.zeros(n_params, dtype=np.float64)
        else:
            # Work on a float64 copy so the caller's array is left untouched
            # and scaled values are never truncated by an integer dtype.
            column = np.array(signals[:, i], dtype=np.float64)

            # integrated normalizer for speed
            if normalize:
                column = np.squeeze(signalScaler.fit_transform(column.reshape(-1, 1)))

            model_fit = AR(column).fit(maxLag, ic=None)
            params = np.asarray(model_fit.params, dtype=np.float64)
            n_fit = params.shape[0]

            if n_fit < n_params:
                # Too few parameters: right-pad with zeros to the fixed width.
                parameters = np.pad(params, (0, n_params - n_fit),
                                    'constant', constant_values=0)
            else:
                # Exact or too many: keep the first maxLag + 1 values so every
                # column occupies the same number of feature slots.
                parameters = params[:n_params]

        blocks.append(parameters)

    # Single concatenation instead of growing an array inside the loop.
    coefficients = np.concatenate(blocks) if blocks else np.zeros(0, dtype=np.float64)

    return pd.DataFrame(coefficients).T

In [16]:
# --- Configuration ---------------------------------------------------------

# signalScaler is read as a global by getStatARCoreffs when normalize is set.
# featureScaler is created here but not used anywhere in this cell.
signalScaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
featureScaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)

# Zone name -> code lookup. Not referenced in this cell; kept for later use.
cav_dict = {'0L04': 'R04', '1L07': 'R17', '1L22': 'R1M', '1L23': 'R1N', '1L24': 'R1O', '1L25': 'R1P',
            '1L26': 'R1Q', '2L22': 'R2M', '2L23': 'R2N', '2L24': 'R2O', '2L25': 'R2P', '2L26': 'R2Q'}

# NOTE(review): absolute, machine-specific path - consider making it configurable.
mainPath = Path('D:/RF WAVEFORMS/')
module_path = mainPath / 'rfw-Fall-2019/waveform-data/rf'
label_dir = mainPath / 'rfw-Fall-2019/labeled-examples'  # was `dir`, which shadowed the builtin

# Full paths of the tab-separated label files to process.
filelist = [label_dir / 'example.txt']

N_CAVITIES = 8   # files expected per capture directory
AR_MAX_LAG = 5   # each signal contributes AR_MAX_LAG + 1 features

# Column suffixes assigned (in file order) to the 17 waveform columns that sit
# between 'Time' and the appended 'id' column of every cavity file.
WAVEFORM_SUFFIXES = ['IMES', 'QMES', 'GMES', 'PMES', 'IASK', 'QASK', 'GASK', 'PASK',
                     'CRFP', 'CRFPP', 'CRRP', 'CRRPP', 'GLDE', 'PLDE', 'DETA2_', 'CFQE2_', 'DFQES']

# Signals fed to the AR feature extractor: four per cavity, cavity-major order
# ("1_GMES", "1_GASK", "1_CRFP", "1_DETA2_", "2_GMES", ...).
norm_col = [f'{cavity}_{signal}'
            for cavity in range(1, N_CAVITIES + 1)
            for signal in ("GMES", "GASK", "CRFP", "DETA2_")]
sel_col = ["Time", "id"] + norm_col

# --- Feature extraction ----------------------------------------------------

k = 0               # running example id; counts every log row, including skipped ones
feature_rows = []   # one single-row DataFrame per successfully processed example

for data_file_path in filelist:
    log = pd.read_csv(data_file_path, sep='\t')

    for j in range(len(log)):
        k += 1

        # Parsed only to validate the timestamp format (raises ValueError if malformed).
        datetime.strptime(log.time[j], '%Y/%m/%d %H:%M:%S')

        if log.zone[j] == '0L04':
            print("Skipping 0L04.")
            continue

        # Capture directories are laid out as <zone>/<YYYY_MM_DD>/<HHMMSS>.<one char>
        date_part, time_part = log.time[j].split(" ", 1)
        date_format = date_part.replace("/", "_")
        time_format = time_part.replace(":", "")
        ct = os.path.join(module_path, log.zone[j], date_format, time_format + '.?')

        # Sorted so the chosen directory is deterministic if several match.
        dir1 = sorted(glob.glob(ct))
        if not dir1:
            print("Directory: " + ct + " has no files.")
            continue
        capture_dir = dir1[0]

        # os.listdir returns entries in arbitrary order, and the position in
        # this list decides which cavity number a file's columns receive.
        # Sorting makes that assignment deterministic.
        # NOTE(review): assumes file names sort in cavity order - confirm.
        cavity_files = sorted(os.listdir(capture_dir))
        if len(cavity_files) != N_CAVITIES:
            print("Directory does not contain data files for all 8 cavities in the zone.")
            continue

        frames = []
        for cavity, file_name in enumerate(cavity_files, start=1):
            df = pd.read_csv(os.path.join(capture_dir, file_name), sep='\t')

            tStep = df.Time[2] - df.Time[1]
            # NOTE(review): only a lower bound is enforced here.
            if tStep < 0.1:
                raise ValueError("Model assumes 0.20 ms sampling time.")

            df['id'] = k
            df.columns = ['Time'] + [f'{cavity}_{suffix}' for suffix in WAVEFORM_SUFFIXES] + ['id']
            frames.append(df)

        # Side-by-side join of the eight cavities, then drop the repeated
        # 'Time' / 'id' columns (the first occurrence is kept).
        module_df = pd.concat(frames, axis=1, sort=False)
        module_df = module_df.astype(np.float64)
        module_df = module_df.loc[:, ~module_df.columns.duplicated()]
        module_df = module_df[sel_col]

        feature_rows.append(
            getStatARCoreffs(module_df[norm_col].values, maxLag=AR_MAX_LAG, normalize=1)
        )

# One concat at the end replaces DataFrame.append inside the loop, which was
# removed in pandas 2.0 and copied the whole frame on every call.
cavity_df = pd.concat(feature_rows) if feature_rows else pd.DataFrame()

X_master = cavity_df

print("------------------------------------------------------------------------------------------------")
print("Number of training examples: {}".format(X_master.shape[0]))
print("Number of features: {}".format(X_master.shape[1]))

------------------------------------------------------------------------------------------------
Number of training examples: 1
Number of features: 192


In [17]:
# Load the two previously saved models from the working directory: one is used
# below to predict the cavity ID, the other to predict the fault type.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load .sav files from a trusted source.
# NOTE(review): `joblib` is imported from `sklearn.externals` at the top of the
# notebook; that import path is gone in newer scikit-learn releases (use the
# standalone `joblib` package) - confirm against the pinned environment.
RF_cavity = joblib.load('RF_CAVITY_03112020.sav')
RF_fault = joblib.load('RF_FAULT_03112020.sav')

### Cavity ID Prediction

In [18]:
# Predicted cavity label and per-class probabilities for every row of X_master.
cavityID = RF_cavity.predict(X_master)
cavityID_prob = RF_cavity.predict_proba(X_master)

# Only the first example is summarised below.
cavityID_str = cavityID.astype(str)[0]

# predict_proba columns are ordered like RF_cavity.classes_, so the predicted
# label must be translated to its column position; using the label itself as
# an index is only correct when the labels happen to be 0..n-1.
cavity_col = int(np.flatnonzero(RF_cavity.classes_ == cavityID[0])[0])
ID_confidence = float(cavityID_prob[0, cavity_col] * 100)

### Fault ID Prediction

In [19]:
# Rebuild a LabelEncoder without refitting: its class list is restored from a
# saved array so that integer fault predictions can be mapped back to names
# via le.inverse_transform in the next cell.
le = preprocessing.LabelEncoder()
le.classes_ = np.load('le_fault_classes.npy')
# Bare expression with a trailing ';' - evaluates the attribute and suppresses
# the cell's output; it has no other effect.
le.classes_;

In [20]:
# Predicted (encoded) fault label and per-class probabilities for every row.
cavityFault = RF_fault.predict(X_master)
cavityFault_prob = RF_fault.predict_proba(X_master)

# Map encoded labels back to fault names; only the first example is summarised.
cavityFault_name = le.inverse_transform(cavityFault)
cavityFault_name_str = cavityFault_name.astype(str)[0]

# predict_proba columns are ordered like RF_fault.classes_; look up the column
# of the predicted label rather than using the label itself as an index.
fault_col = int(np.flatnonzero(RF_fault.classes_ == cavityFault[0])[0])
fault_confidence = float(cavityFault_prob[0, fault_col] * 100)

### Summary

In [21]:
# One-line report: predicted cavity and fault name, each followed by its
# confidence in percent rounded to two decimals.
print(f"cavity {cavityID_str} ( {round(ID_confidence, 2)} ) "
      f"{cavityFault_name_str} ( {round(fault_confidence, 2)} )")

cavity 5 ( 83.07 ) Controls Fault ( 68.25 )


In [22]:
# Report total wall-clock time since startTime was recorded in the first cell.
elapsed = datetime.now() - startTime
print(f"Executing the notebook took: {elapsed} (h:mm:ss)")

Executing the notebook took: 0:00:01.125985 (h:mm:ss)
