<a href="https://colab.research.google.com/github/MarijaDragosevic/UUZOP-data-survey/blob/main/UZOP_data_survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Classification of undesirable events in oil well operation**
### Data survey using 3w dataset.
######Dataset should be uploaded into google drive. Main folder is named data with subfolders named 0, 1, 2, 3, 4, 5, 6, 7, 8.


In [11]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [12]:
pip install tsfresh




In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict
from natsort import natsorted
import tsfresh

  import pandas.util.testing as tm


In [15]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [14]:
data_path = Path('gdrive/MyDrive/', 'data')
events_names = {0: 'Normal',
                1: 'Abrupt Increase of BSW',
                2: 'Spurious Closure of DHSV',
                3: 'Severe Slugging',
                4: 'Flow Instability',
                5: 'Rapid Productivity Loss',
                6: 'Quick Restriction in PCK',
                7: 'Scaling in PCK',
                8: 'Hydrate in Production Line'
               }
columns = ['P-PDG',
           'P-TPT',
           'T-TPT',
           'P-MON-CKP',
           'T-JUS-CKP',
           'P-JUS-CKGL',
           'T-JUS-CKGL',
           'QGL',
           'class']


In [16]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

# Creating instances with paths to csv files

In [6]:
instances = list(class_and_file_generator(data_path, real=True, simulated=True, drawn=False))
real_instances = list(class_and_file_generator(data_path, real=True, simulated=False, drawn=False))
simulated_instances = list(class_and_file_generator(data_path, real=False, simulated=True, drawn=False))


In [7]:
instances_class = [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'REAL'} for c, p in real_instances] + \
                  [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'SIMULATED'} for c, p in simulated_instances] 
df_class = pd.DataFrame(instances_class)
df_class_count = df_class.groupby(['TYPE OF EVENT', 'SOURCE']).size().reset_index().pivot('SOURCE', 'TYPE OF EVENT', 0).fillna(0).astype(int).T
df_class_count = df_class_count.loc[natsorted(df_class_count.index.values)]
df_class_count = df_class_count[['REAL', 'SIMULATED']]
df_class_count['TOTAL'] = df_class_count.sum(axis=1)
df_class_count.loc['TOTAL'] = df_class_count.sum(axis=0)
df_class_count


SOURCE,REAL,SIMULATED,TOTAL
TYPE OF EVENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 - Normal,597,0,597
1 - Abrupt Increase of BSW,5,114,119
2 - Spurious Closure of DHSV,22,16,38
3 - Severe Slugging,32,74,106
4 - Flow Instability,344,0,344
5 - Rapid Productivity Loss,12,439,451
6 - Quick Restriction in PCK,6,215,221
7 - Scaling in PCK,4,0,4
8 - Hydrate in Production Line,3,81,84
TOTAL,1025,939,1964


# **Downsampling instances**

Paper said to downsample to 10 seconds but extraction was too long so I decided to downsample to 100 seconds.

The simulated events do not include the P-CKGL and TCKGL
sensor data, hence these features are removed from
the real well data.

On dataframe column class is added, it matches map where I found csv file.

In [14]:

for i in instances:
  name=str(i[1])
  c=name.split("/")
  name_new=name.replace("data","data_new")
  df = pd.read_csv(i[1]) 
  df['timestamp'] = pd.to_datetime(df['timestamp'])

  df_new = df.resample('100S', on='timestamp').mean()
  df_new = df_new.drop(['P-JUS-CKGL', 'T-JUS-CKGL', 'class'], axis=1)
  df_new=tsfresh.utilities.dataframe_functions.impute(df_new)
  df_new["class"]=c[3]
  df_new.to_csv(name_new,encoding='utf-8',)








KeyboardInterrupt: ignored

Downsampled data I chose to save on my google drive in map data_new in same way data was saved (with subfolders 0,1,2,3,4,5,6,7,8, depending on which class they belong to).

In [17]:
data_path_new = Path('gdrive/MyDrive/', 'data_new')
instances_new = list(class_and_file_generator(data_path_new, real=True, simulated=True, drawn=False))

# **Finding features from dataframe**

The TSFRESH
package is used for parallelized feature engineering as
it 1) contains a library of common features, while allowing
custom feature specification and 2) integrates well with scikitlearn.
The following features are calculated (for each window):


* The first to fourth moments (mean, variance, skewness,
kurtosis) of the time series and the absolute Fourier
transform of it
*   Miscellaneous features describing the distribution of the
data and how it changes: maximum, minimum, median,
quantiles, coefficient of variation, mean change, average
second derivative
*The coefficients of a linear and third degree polynomial
model. The linear model is fitted directly to the data,
while the polynomial is fitted as part

fc_params dictionary containes all wanted features.

In [18]:

fc_params = { 
    'mean': None,
    'variance': None,
    'skewness': None,
    'kurtosis': None,
    'fft_aggregated': [{'aggtype': 'centroid'},
  {'aggtype': 'variance'},
  {'aggtype': 'skew'},
  {'aggtype': 'kurtosis'}],
    'maximum': None,
    'minimum': None,
    'median': None,
    'quantile': [{'q': 0.1},
  {'q': 0.2},
  {'q': 0.3},
  {'q': 0.4},
  {'q': 0.6},
  {'q': 0.7},
  {'q': 0.8},
  {'q': 0.9}],
  'variation_coefficient': None,
  'mean_change': None,
'mean_second_derivative_central': None,

    'friedrich_coefficients': [
    {'coeff': 1, 'm': 3, 'r': 30},
    {'coeff': 3, 'm': 3, 'r': 30}],
}


# **Time windows subdivision**

### Windows are 300, 600 and 900 seconds.
The time series is subdivided into
windows, and in each window various features of arbitrary
complexity are calculated to describe dynamics within the
window. This division into time windows and feature calculations
reduce the problem to a supervised classification
problem. The choice of time window size is an important
hyperparameter as it controls the size of the new dataset and
the amount of information in the features.

In [23]:
size = 30
br=1
l=0
lista_klasa=[]
for i in instances_new:
  df= pd.read_csv(i[1])
  val = df['class'].values[0] 
  cut=len(df)%size
  df= df.iloc[:-cut]
  list_of_dfs = [df.loc[i:i+size-1,:] for i in range(0, len(df),size)]
  for dataf in list_of_dfs:
    dataf["id_features"]=br
    br+=1
  l+=len(list_of_dfs)
  if(len(list_of_dfs)!=0):
    data = pd.concat(list_of_dfs)
    lista_klasa.append(data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Feature extraction can't have class column but we need to save it for training and testing model. Saving code chunk is presented below.

### Feature extraction for 300 s window:

In [None]:
data = pd.concat(lista_klasa)
s30=data['class'].value_counts()
s30=list(s30.items())
s30

In [None]:
d=data
data1 = d.drop(['class'], axis=1)
data2=tsfresh.extract_features(data1, default_fc_parameters=fc_params, column_id="id_features",column_sort="timestamp", column_kind=None, column_value=None)
data2


Since extraction takes a lot of time, easiest way is to save results to google drive to use it again.

In [None]:
data2.to_csv('gdrive/MyDrive/data30.csv',encoding='utf-8',)


### Feature extraction for 600 s window:

In [27]:
size = 60
br=1
l=0
lista_klasa=[]
for i in instances_new:
  df= pd.read_csv(i[1])
  val = df['class'].values[0] 
  cut=len(df)%size
  df= df.iloc[:-cut]
  list_of_dfs = [df.loc[i:i+size-1,:] for i in range(0, len(df),size)]
  for dataf in list_of_dfs:
    dataf["id_features"]=br
    br+=1
  l+=len(list_of_dfs)
  if(len(list_of_dfs)!=0):
    data = pd.concat(list_of_dfs)
    lista_klasa.append(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [None]:
data60 = pd.concat(lista_klasa)
data60


In [None]:
s60=data60['class'].value_counts()
s60=list(s60.items())
s60

In [None]:
d=data60
 
data1 = d.drop(['class'], axis=1)

data3=tsfresh.extract_features(data1, default_fc_parameters=fc_params, column_id="id_features",column_sort="timestamp", column_kind=None, column_value=None)
data3

In [None]:
data3.to_csv('gdrive/MyDrive/data60.csv',encoding='utf-8',)

### Feature extraction for 900 s window:

In [None]:
size = 90
br=1
l=0
lista_klasa=[]
for i in instances_new:
  df= pd.read_csv(i[1])
  val = df['class'].values[0] 
  cut=len(df)%size
  df= df.iloc[:-cut]
  list_of_dfs = [df.loc[i:i+size-1,:] for i in range(0, len(df),size)]
  for dataf in list_of_dfs:
    dataf["id_features"]=br
    br+=1
  l+=len(list_of_dfs)
  if(len(list_of_dfs)!=0):
    data = pd.concat(list_of_dfs)
    lista_klasa.append(data)

In [None]:
data90 = pd.concat(lista_klasa)
data90

In [None]:
s90=data90['class'].value_counts()
s90=list(s90.items())
s90

In [None]:
d=data90
 
data1 = d.drop(['class'], axis=1)

data4=tsfresh.extract_features(data1, default_fc_parameters=fc_params, column_id="id_features",column_sort="timestamp", column_kind=None, column_value=None)
data4


In [None]:
data4.to_csv('gdrive/MyDrive/data90.csv',encoding='utf-8',)
data4

### Appending saved classes to new dataframes:

In [None]:
y30=[]
for y,x in s30:
    y30.append((y,int(x/30)))
y30.sort(key=lambda x:x[0])
y30

In [None]:
y60=[]
for y,x in s60:
    y60.append((y,int(x/60)))
y60.sort(key=lambda x:x[0])
y60

In [None]:
y90=[]
for y,x in s90:
    y90.append((y,int(x/90)))
y90.sort(key=lambda x:x[0])
y90

In [36]:
data_30 = pd.read_csv('gdrive/MyDrive/data30.csv',index_col=0)
data_30=tsfresh.utilities.dataframe_functions.impute(data_30)

data_60 = pd.read_csv('gdrive/MyDrive/data60.csv',index_col=0)
data_60=tsfresh.utilities.dataframe_functions.impute(data_60)

data_90 = pd.read_csv('gdrive/MyDrive/data90.csv',index_col=0)
data_90=tsfresh.utilities.dataframe_functions.impute(data_90)

In [45]:
#feature selection
from sklearn.feature_selection import VarianceThreshold
def get_low_variance_columns(dframe=None, columns=None,skip_columns=None, thresh=0.0, autoremove=True):
    try:
        # get list of all the original df columns
        all_columns = dframe.columns
        skip_columns = dframe.filter(regex="variance", axis=1)

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column)
                       for column
                       in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx]
                         for idx, _
                         in enumerate(remaining_columns)
                         if idx
                         in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns,
                                             feature_names))
        
        # remove the columns
        if autoremove:
            # remove the low-variance columns
            X_removed = vt.transform(X)
            
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed,
                                  columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index,
                              column=skip_columns[idx],
                              value=skipped_values[:, idx])
        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")
        pass

    return dframe

In [None]:
#removing low variance features

data_30_final=get_low_variance_columns(data_30)
data_60_final=get_low_variance_columns(data_60)
data_90_final=get_low_variance_columns(data_90)
x_train90_final


In [37]:
def add_y_to_dataframe(y,df):
  lista=[]

  for i in y:

    l=[i[0] for j in range(int(i[1]))]
    lista.append(l)

  lista=[item for sublist in lista for item in sublist]
  df['class'] = lista
  return df

In [88]:
data_30_complete= add_y_to_dataframe(y30,data_30_final)
data_60_complete= add_y_to_dataframe(y60,data_60_final)
data_90_complete= add_y_to_dataframe(y90,data_90_final)
data_90_complete

Unnamed: 0,P-PDG__mean,P-PDG__skewness,P-PDG__kurtosis,"P-PDG__fft_aggregated__aggtype_""centroid""","P-PDG__fft_aggregated__aggtype_""skew""","P-PDG__fft_aggregated__aggtype_""kurtosis""",P-PDG__maximum,P-PDG__minimum,P-PDG__median,P-PDG__quantile__q_0.1,P-PDG__quantile__q_0.2,P-PDG__quantile__q_0.3,P-PDG__quantile__q_0.4,P-PDG__quantile__q_0.6,P-PDG__quantile__q_0.7,P-PDG__quantile__q_0.8,P-PDG__quantile__q_0.9,P-PDG__variation_coefficient,P-PDG__mean_change,P-PDG__mean_second_derivative_central,P-PDG__friedrich_coefficients__coeff_1__m_3__r_30,P-PDG__friedrich_coefficients__coeff_3__m_3__r_30,P-TPT__mean,P-TPT__skewness,P-TPT__kurtosis,"P-TPT__fft_aggregated__aggtype_""centroid""","P-TPT__fft_aggregated__aggtype_""skew""","P-TPT__fft_aggregated__aggtype_""kurtosis""",P-TPT__maximum,P-TPT__minimum,P-TPT__median,P-TPT__quantile__q_0.1,P-TPT__quantile__q_0.2,P-TPT__quantile__q_0.3,P-TPT__quantile__q_0.4,P-TPT__quantile__q_0.6,P-TPT__quantile__q_0.7,P-TPT__quantile__q_0.8,P-TPT__quantile__q_0.9,P-TPT__variation_coefficient,...,"T-JUS-CKP__fft_aggregated__aggtype_""kurtosis""",T-JUS-CKP__maximum,T-JUS-CKP__minimum,T-JUS-CKP__median,T-JUS-CKP__quantile__q_0.1,T-JUS-CKP__quantile__q_0.2,T-JUS-CKP__quantile__q_0.3,T-JUS-CKP__quantile__q_0.4,T-JUS-CKP__quantile__q_0.6,T-JUS-CKP__quantile__q_0.7,T-JUS-CKP__quantile__q_0.8,T-JUS-CKP__quantile__q_0.9,T-JUS-CKP__variation_coefficient,T-JUS-CKP__mean_change,T-JUS-CKP__mean_second_derivative_central,T-JUS-CKP__friedrich_coefficients__coeff_1__m_3__r_30,T-JUS-CKP__friedrich_coefficients__coeff_3__m_3__r_30,QGL__mean,QGL__skewness,QGL__kurtosis,"QGL__fft_aggregated__aggtype_""centroid""","QGL__fft_aggregated__aggtype_""skew""","QGL__fft_aggregated__aggtype_""kurtosis""",QGL__maximum,QGL__minimum,QGL__median,QGL__quantile__q_0.1,QGL__quantile__q_0.2,QGL__quantile__q_0.3,QGL__quantile__q_0.4,QGL__quantile__q_0.6,QGL__quantile__q_0.7,QGL__quantile__q_0.8,QGL__quantile__q_0.9,QGL__variation_coefficient,QGL__mean_change,QGL__mean_second_derivative_central,QGL__friedrich_coefficients__coeff_1__m_3__r_30,QGL__friedrich_coefficients__coeff_3__m_3__r_30,class
0,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,8.047289e+06,1.569728,1.791741,0.209430,13.921445,217.569699,8254891.52,7981643.94,8019729.000,7995530.267,7.999780e+06,8003281.695,8014381.548,8025538.688,8052564.239,8094140.560,8144027.823,0.008272,...,137.432364,72.878464,71.479743,72.181487,71.842443,71.923001,71.995637,72.066213,72.263042,72.307883,72.389287,72.493241,0.003751,-0.006489,0.003782,56.723567,99490.969581,0.000000,0.000000,0.000000,5.358397,2.203276,8.369657,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.162008,0.000000e+00,0.000000e+00,1.184812,0.931431,0
1,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,8.462376e+06,-0.751345,-1.021325,0.365946,10.549927,125.736824,8655868.54,8121397.73,8528164.930,8135104.036,8.224494e+06,8378405.325,8506387.388,8596917.076,8618216.000,8627720.102,8638166.000,0.022203,...,97.776591,75.709354,71.842660,73.578294,72.701561,72.835463,73.038080,73.279281,74.204303,74.790719,75.040740,75.157834,0.013816,0.027039,0.001217,55.673250,101299.764945,0.000000,0.000000,0.000000,5.358397,2.203276,8.369657,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.162008,0.000000e+00,0.000000e+00,1.184812,0.931431,0
2,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,8.738280e+06,-0.434350,-1.346230,0.296373,11.734466,154.540656,8934772.84,8461425.06,8817712.000,8488073.223,8.579637e+06,8616610.080,8704056.930,8827245.504,8879749.236,8890959.662,8907958.981,0.018055,...,108.897219,77.443963,74.624176,75.676015,74.859714,75.048403,75.156496,75.287896,76.005946,76.300771,76.775136,77.228596,0.011210,0.017934,-0.000417,89.293746,172294.691156,0.000000,0.000000,0.000000,5.358397,2.203276,8.369657,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.162008,0.000000e+00,0.000000e+00,1.184812,0.931431,0
3,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,8.149675e+06,-0.474125,-0.853943,0.054390,26.813762,820.568651,8185347.57,8101901.99,8155372.300,8119477.000,8.125425e+06,8139426.000,8145284.090,8159376.000,8163593.244,8169204.248,8173624.825,0.002515,...,137.665629,73.865039,72.629985,73.215180,72.803346,72.971375,73.056215,73.101755,73.324702,73.377239,73.458520,73.639411,0.004012,-0.002336,-0.000752,-186.823910,-333789.581315,0.000000,0.000000,0.000000,5.358397,2.203276,8.369657,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.162008,0.000000e+00,0.000000e+00,1.184812,0.931431,0
4,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,8.489612e+06,-0.625365,-0.776408,0.131656,17.364460,335.890192,8558368.00,8378950.21,8506282.355,8408449.037,8.440038e+06,8478404.423,8478569.000,8517148.678,8525778.636,8530162.150,8534116.829,0.005752,...,112.665988,75.615463,74.072256,74.990301,74.475759,74.672700,74.803069,74.891228,75.049783,75.154451,75.244836,75.419871,0.004520,0.009418,-0.007445,26.617680,49972.391203,0.000000,0.000000,0.000000,5.358397,2.203276,8.369657,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.162008,0.000000e+00,0.000000e+00,1.184812,0.931431,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4104,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,0.000000e+00,0.000000,0.000000,0.111128,10.423495,124.400546,0.00,0.00,0.000,0.000,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,0.004540,...,129.605291,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004585,0.000000,0.000000,0.012063,-12.573590,0.009297,-0.025449,-1.582127,15.173004,0.395411,4.811017,0.020930,1.103973e-28,9.232502e-03,1.112550e-20,7.644093e-04,2.384986e-03,6.251639e-03,1.359231e-02,1.557993e-02,0.016816,0.017968,0.773141,-1.729755e-04,1.120796e-05,1.184812,0.931431,8
4105,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,0.000000e+00,0.000000,0.000000,0.111128,10.423495,124.400546,0.00,0.00,0.000,0.000,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,0.004540,...,129.605291,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004585,0.000000,0.000000,0.012063,-12.573590,0.003120,0.900122,-0.457048,20.724681,-0.131060,12.597405,0.012890,1.590806e-28,1.027773e-03,9.828700e-23,3.567477e-20,1.585547e-11,4.030692e-05,3.124987e-03,5.069541e-03,0.006911,0.008838,1.196435,6.796615e-23,-6.079014e-05,1.184812,0.931431,8
4106,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,0.000000e+00,0.000000,0.000000,0.111128,10.423495,124.400546,0.00,0.00,0.000,0.000,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,0.004540,...,129.605291,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004585,0.000000,0.000000,0.012063,-12.573590,0.000908,2.863100,8.498652,20.271356,0.092073,14.853909,0.012201,4.464235e-26,4.853610e-24,7.970210e-26,7.970210e-26,1.731983e-25,1.731983e-25,4.500418e-23,5.030951e-17,0.000456,0.004274,2.466756,-6.796537e-23,0.000000e+00,1.184812,0.931431,8
4107,0.0,0.0,0.0,0.059546,9.122214,97.000191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002023,0.0,0.0,2.219661e-07,5.077007e+07,0.000000e+00,0.000000,0.000000,0.111128,10.423495,124.400546,0.00,0.00,0.000,0.000,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,0.004540,...,129.605291,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004585,0.000000,0.000000,0.012063,-12.573590,0.001159,1.910885,2.595244,21.033264,-0.077626,14.008852,0.008733,1.188331e-27,1.688636e-20,1.188331e-27,9.266086e-26,4.758935e-25,1.219757e-22,3.902180e-19,1.822374e-04,0.002566,0.005243,1.915907,3.762726e-23,-1.723017e-15,1.184812,0.931431,8


#**Feature normalization**

Before classification, all features should be normalised by subtracting the
mean and scaling by the standard deviation of the data. This
should be performed only considering the training set, i.e. excluding the currently held out fold in kfolds and the test
data. This scaling is necessary, because the pressure data is
orders of magnitude larger than the temperature data, and some
classification algorithms assume that features have zeros mean
and unit variance.

In [52]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
def normalization(df):
  mapper = DataFrameMapper([(df.columns, StandardScaler())])
  scaled_features = mapper.fit_transform(df.copy(), 144)
  scaled_features_df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
  return scaled_features_df

Train and test data splitting and then applying normalization only to train data.

In [53]:
from sklearn.model_selection import train_test_split

def split_normalize_train(df):

  train, test = train_test_split(df, test_size=0.3)
  y_train=train['class']
  x_train=train.drop(['class'],axis=1)
  x_train=normalization(x_train)

  y_test=test['class']
  x_test=test.drop(['class'],axis=1)
  return x_train,y_train,x_test,y_test

In [89]:
x_train30,y_train30, x_test30,y_test30 =split_normalize_train(data_30_complete)
x_train60,y_train60, x_test60,y_test60 =split_normalize_train(data_60_complete)
x_train90,y_train90, x_test90,y_test90 =split_normalize_train(data_90_complete)

7547     5
12654    6
13426    8
4691     2
11210    6
        ..
11104    6
7164     5
1247     0
10212    5
6819     4
Name: class, Length: 9639, dtype: int64

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score



In [77]:
def comparison_of_classifiers(classifier,params,x_train,y_train,x_test,y_test):
  clf = GridSearchCV(classifier, params, cv = 5, scoring='f1_macro')
  clf.fit(x_train,y_train)
  best_params=clf.best_params_
  f1=f1_score(y_test,clf.predict(x_test),average='macro')
  accuracy=accuracy_score(y_test,clf.predict(x_test))
  return best_params,f1,accuracy




In [70]:
lr=[]
params=[{'C':[1e-7, 1e-6,1e-5, 1e-4, 1e-3, 1e-2,1e-1]}]
classifier=LogisticRegression(max_iter=1000)
bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train30,y_train30,x_test30,y_test30)
lr.append([bp,f1,accuracy])
bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train60,y_train60,x_test60,y_test60)
lr.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train90,y_train90,x_test90,y_test90)
lr.append([bp,f1,accuracy])

lr

[[{'C': 0.1}, 0.0833549404802866, 0.11353183248608086],
 [{'C': 0.1}, 0.08414499670102518, 0.11413612565445026],
 [{'C': 0.1}, 0.05328649196305752, 0.0746147607461476]]

In [90]:
bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train90,y_train90,x_test90,y_test90)
lr.append([bp,f1,accuracy])

ValueError: ignored

In [None]:
lr

In [71]:
svc=[]
params=[{'C':[1e-7, 1e-6,1e-5, 1e-4, 1e-3, 1e-2,1e-1]}]
classifier=SVC()

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train30,y_train30, x_test30,y_test30)
svc.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train60,y_train60, x_test60,y_test60)
svc.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train90,y_train90, x_test90,y_test90)
svc.append([bp,f1,accuracy])

bp

{'C': 1e-07}

In [73]:
svc

[[{'C': 1e-07}, 0.050183241712477095, 0.2916969256838538],
 [{'C': 1e-07}, 0.047193585337915234, 0.2696335078534031],
 [{'C': 1e-07}, 0.05901486988847584, 0.30900243309002434]]

In [72]:
dt=[]
params=[{'ccp_alpha' : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}]
classifier=DecisionTreeClassifier()

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train30,y_train30, x_test30,y_test30)
dt.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train60,y_train60, x_test60,y_test60)
dt.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train90,y_train90, x_test90,y_test90)
dt.append([bp,f1,accuracy])

dt

ValueError: ignored

In [None]:
rf=[]
params=[{'n_estimators': [50, 100, 150,175],'max_depth':[5, 7, 10], 'max_features': [5, 15]}]

classifier=RandomForestClassifier()

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train30,y_train30, x_test30,y_test30)
rf.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train60,y_train60, x_test60,y_test60)
rf.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train90,y_train90, x_test90,y_test90)
rf.append([bp,f1,accuracy])

rf

In [None]:
ada=[]
params=[{'base_estimator':[1, 3, 5], 'learning_rate':[0.01, 0.1, 1], 'n_estimators':[100, 250, 400, 550]}]

classifier=AdaBoostClassifier()

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train30_final,y_train30,x_test30,y_test30)
ada.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train60,y_train60, x_test60,y_test60)
ada.append([bp,f1,accuracy])

bp,f1,accuracy=comparison_of_classifiers(classifier,params,x_train90,y_train90, x_test90,y_test90)
ada.append([bp,f1,accuracy])

ada

In [None]:
classifier=DecisionTreeClassifier()
AdaBoostClassifier().get_params().keys()

