<a href="https://colab.research.google.com/github/MarcoParola/torre-clima/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive under /content/drive/ so the project data files can be read.
# force_remount=True requests a fresh mount even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from datetime import date
from datetime import datetime as dt
from sklearn.preprocessing import MinMaxScaler


# Project folder on the mounted Drive. The path is relative, so it resolves
# against the current working directory.
projectPath = 'drive/MyDrive/Colab Notebooks/leaning_tower/'
DATA_FILE = f'{projectPath}data/data.txt'
DATA1_FILE = f'{projectPath}data/data1.txt'

# Inclusive time window kept by the loading step.
START_DATE = '1993-08-01 00:00:00'
END_DATE = '2007-06-30 00:00:00'
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably the raw files' sentinel for a missing reading - confirm.
MISSING_VALUE_LABEL = 999999

# Channels kept by the preprocessing, mapped to their [lower, upper] bounds.
# The outlier step discards every reading that is <= lower or >= upper.
thresholds = {
    # Timestamp column: kept by the channel selection, carries no bounds.
    'Date time': 0,

    # Deformometer channels.
    'DEFORMOMETRO INTERNO': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO': [-0.5, 0.5],
    'DEFORMOMETRO SU SCALA': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.1': [-0.5, 0.5],
    'DEFORMOMETRO SU SCALA.1': [-0.5, 0.5],
    'DEFORMOMETRO SU SCALA.2': [-0.5, 0.5],
    'DEFORMOMETRO SU SCALA.3': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.2': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.3': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.4': [-0.5, 0.5],
    'DEFORMOMETRO SULLA SCALA': [-0.5, 0.5],
    'DEFORMOMETRO INTERNO.1': [-0.5, 0.5],
    'DEFORMOMETRO INTERNO.2': [-0.5, 0.5],
    'DEFORMOMETRO INTERNO.3': [-0.5, 0.5],
    'DEFORMOMETRO INTERNO.4': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.5': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.6': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.7': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.8': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.9': [-0.5, 0.5],
    'DEFORMOMETRO SULLE SCALE': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.10': [-0.5, 0.5],
    'DEFORMOMETRO INTERNO.5': [-0.5, 0.5],
    'DEFORMOMETRO INTERNO.6': [-0.5, 0.5],
    'DEFORMOMETRO ESTERNO.11': [-0.5, 0.5],

    # "TEL. 0 VERT. SUD" channels (wider upper bound than the other TEL. ones).
    'TEL. 0 VERT. SUD DIR. N-S': [-2100, 1800],
    'TEL. 0 VERT. SUD DIR. E-O': [-2100, 1800],

    # Weather channels.
    'Velocita vento istantanea': [0, 45],
    'Direzione vento istantanea': [0, 360],
    'Termometro Aria Piano 8': [-10, 42],
    'Irraggiamento Solare': [0, 1000],

    # Remaining "TEL." channels.
    'TEL. 4 VERT. NORD DIR. N-S': [-2100, 50],
    'TEL. 4 VERT. NORD DIR. E-O': [-2100, 50],
    'TEL. 2 VERT. NORD DIR. N-S': [-2100, 50],
    'TEL. 2 VERT. NORD DIR. E-O': [-2100, 50],
    'TEL. 0 VERT. NORD DIR. N-S': [-2100, 50],
    'TEL. 0 VERT. NORD DIR. E-O': [-2100, 50],
    'TEL. 4 VERT. SUD DIR. N-S': [-2100, 50],
    'TEL. 4 VERT. SUD DIR. E-O': [-2100, 50],
    'TEL. 2 VERT. SUD DIR. N-S': [-2100, 50],
    'TEL. 2 VERT. SUD DIR. E-O': [-2100, 50],
}

# Every thresholded channel, i.e. all keys except the timestamp column.
cols = [name for name in thresholds if name != "Date time"]


def load_dataset(file_mame, start=None, end=None):
    """Load a ';'-separated export and keep the rows inside a time window.

    The 'Date time' column is parsed to timestamps, rows outside
    [start, end] (both bounds inclusive) are dropped, and the last column
    of the file is discarded.

    Args:
        file_mame: Path or file-like object handed to ``pandas.read_csv``.
        start: Lower bound of the window, anything ``pandas.Timestamp``
            accepts. Defaults to the module-level ``START_DATE``.
        end: Upper bound of the window. Defaults to the module-level
            ``END_DATE``.

    Returns:
        A DataFrame that owns its data (a copy), so callers can assign to
        its columns without pandas chained-assignment warnings. The
        original row labels are kept; no index is set.
    """
    # Fall back to the notebook-wide window when no explicit bound is given.
    start = pd.Timestamp(START_DATE if start is None else start)
    end = pd.Timestamp(END_DATE if end is None else end)

    df = pd.read_csv(file_mame, sep=';')
    df['Date time'] = pd.to_datetime(df['Date time'])
    in_window = (df['Date time'] >= start) & (df['Date time'] <= end)

    # The former `df.set_index('Date time')` call discarded its result and
    # therefore never changed the frame; it is dropped so 'Date time'
    # visibly stays a regular column, as before.
    # NOTE(review): the last column is presumably an empty artifact of a
    # trailing ';' on every line - confirm against the raw files.
    return df.loc[in_window].iloc[:, :-1].copy()


def zscore(x, window=100):
    """Score each sample against the rolling window that precedes it.

    For every position the mean and the population standard deviation
    (ddof=0) of a `window`-sized rolling window are computed and shifted
    by one step, so a sample is compared with statistics that do not
    include the sample itself.

    Args:
        x: pandas Series of samples.
        window: number of samples in the rolling window.

    Returns:
        Series of (x - previous mean) / previous std. The first `window`
        positions are NaN because no complete preceding window exists.
    """
    trailing = x.rolling(window=window)
    previous_mean = trailing.mean().shift(1)
    previous_std = trailing.std(ddof=0).shift(1)
    return (x - previous_mean) / previous_std

# Data preprocessing

1.   Load data
2.   drop records before 1993-08-01 and after 2007-06-30
3.   channel selection
4.   convert data from string to float
5.   remove outliers by upper and lower thresholds

z-score standardization (zero mean, unit variance) of each channel

resampling to 3-hour means


In [None]:
# 1. 2. Load the raw export and keep only the records inside
# [START_DATE, END_DATE]. load_dataset performs exactly the steps that used
# to be repeated inline here.
df = load_dataset(DATA_FILE)

# 3. Channel selection: keep the columns that have an entry in `thresholds`.
# .copy() gives this cell a frame that owns its data, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df.columns.intersection(list(thresholds.keys()))].copy()

# 4. Convert the readings to numbers; unparsable values become NaN.
cols = [name for name in thresholds if name != "Date time"]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Outlier removal, channel by channel.
# NOTE(review): the bounds are inclusive, so readings exactly equal to a
# bound (e.g. 0 for wind speed, wind direction or solar irradiance) are
# discarded as well - confirm this is intended.
for col in cols:
    lower, upper = thresholds[col]
    df.loc[df[col] <= lower, col] = np.nan
    df.loc[df[col] >= upper, col] = np.nan
    # Also drop samples more than 3 rolling standard deviations away from
    # the mean of the 100 samples that precede them.
    df.loc[abs(zscore(df[col], 100)) > 3, col] = np.nan
    # Fill the gaps just created with the nearest valid sample.
    df[col] = df[col].interpolate(method='nearest')
df = df.ffill()  # useful if the last elements are NaN
df = df.bfill()  # useful if the first elements are NaN

# Average into 3-hour bins; 'Date time' becomes the index of the result.
df = df.resample('180min', on='Date time').mean()

for col in cols:
    # Bins without any sample are NaN after the mean; fill them first.
    df[col] = df[col].interpolate(method='nearest')
    # Standardize the channel to zero mean and unit (sample) standard deviation.
    df[col] = (df[col] - np.mean(df[col])) / df[col].std()

# Expose the bin timestamps as a regular column in addition to the index.
df['Date time'] = df.index

  exec(code_obj, self.user_global_ns, self.user_ns)


save preprocessed data

In [None]:
# Write the preprocessed frame (index included, pandas' default) to the project folder.
df.to_csv(f'{projectPath}preprocessed_data.csv')