This notebook adds the following features to the hard drive dataframe:
1. Moving average of the SMART values over the w most recent cycles
2. Standard deviation of the sensor values over the w most recent cycles
3. Velocity of change
4. Distance of each observation from the mean of the w most recent observations

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns

%matplotlib inline 


In [2]:
# Load the cleaned hard-drive table that the rest of the notebook extends.
cleaned_Sgate_4_hard_drive_df = pd.read_csv(
    'cleaned_Sgate_4_hard_drive_df.csv',
)

In [3]:
# read_csv leaves 'date' as strings; convert it so date arithmetic works later.
cleaned_Sgate_4_hard_drive_df['date'] = pd.to_datetime(
    cleaned_Sgate_4_hard_drive_df['date']
)

In [4]:
# Display the loaded frame to sanity-check its columns and the parsed dates.
cleaned_Sgate_4_hard_drive_df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_184_raw,smart_242_raw,smart_241_raw,smart_194_raw,smart_193_raw,smart_9_raw,smart_198_raw,smart_197_raw,smart_188_raw,smart_187_raw,smart_5_raw
0,2019-01-01,S3000A9T,ST4000DM000,4000787030016,0,0.0,6.998938e+10,1.169269e+10,26.0,129430.0,14574.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-02,S3000A9T,ST4000DM000,4000787030016,0,0.0,7.002360e+10,1.169393e+10,26.0,129430.0,14598.0,0.0,0.0,0.0,0.0,0.0
2,2019-01-03,S3000A9T,ST4000DM000,4000787030016,0,0.0,7.005824e+10,1.169529e+10,26.0,129430.0,14622.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-04,S3000A9T,ST4000DM000,4000787030016,0,0.0,7.009495e+10,1.169654e+10,27.0,129430.0,14646.0,0.0,0.0,0.0,0.0,0.0
4,2019-01-05,S3000A9T,ST4000DM000,4000787030016,0,0.0,7.012896e+10,1.169782e+10,26.0,129430.0,14670.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567964,2019-09-26,Z307XHRJ,ST4000DM000,4000787030016,0,0.0,1.321189e+11,2.176368e+10,31.0,191.0,18522.0,0.0,0.0,0.0,0.0,0.0
5567965,2019-09-27,Z307XHRJ,ST4000DM000,4000787030016,0,0.0,1.323207e+11,2.179283e+10,29.0,191.0,18554.0,0.0,0.0,0.0,0.0,0.0
5567966,2019-09-28,Z307XHRJ,ST4000DM000,4000787030016,0,0.0,1.324661e+11,2.181590e+10,30.0,191.0,18570.0,0.0,0.0,0.0,0.0,0.0
5567967,2019-09-29,Z307XHRJ,ST4000DM000,4000787030016,0,0.0,1.326156e+11,2.185715e+10,31.0,191.0,18602.0,0.0,0.0,0.0,0.0,0.0


We won't use `capacity_bytes` (the hard drives mostly have the same capacity) or `smart_9_raw` (the number of operational hours). `capacity_bytes` is dropped below; `smart_9_raw` stays in the dataframe but is left out of the feature lists.

In [5]:
# capacity_bytes is not used as a feature, so remove it.
cleaned_Sgate_4_hard_drive_df = cleaned_Sgate_4_hard_drive_df.drop(
    columns=['capacity_bytes']
)

In [6]:
# List the columns that remain after removing capacity_bytes.
cleaned_Sgate_4_hard_drive_df.columns

Index(['date', 'serial_number', 'model', 'failure', 'smart_184_raw',
       'smart_242_raw', 'smart_241_raw', 'smart_194_raw', 'smart_193_raw',
       'smart_9_raw', 'smart_198_raw', 'smart_197_raw', 'smart_188_raw',
       'smart_187_raw', 'smart_5_raw'],
      dtype='object')

In [7]:
# SMART attribute IDs that get engineered features, in dataframe column order.
# smart_9 is deliberately not listed, so no features are derived from it.
SMART_IDS = [184, 242, 241, 194, 193, 198, 197, 188, 187, 5]


def _smart_columns(template):
    """Return ``template`` filled in with every ID in ``SMART_IDS``, in order."""
    return [template.format(smart_id) for smart_id in SMART_IDS]


# Every list below is generated from SMART_IDS so that position k always refers
# to the same SMART attribute; the feature code pairs these lists with zip().

# Raw input columns.
mask = _smart_columns('smart_{}_raw')

# Rolling-mean columns for windows of 10, 20 and 30 rows.
mask_mean_rw_10 = _smart_columns('smart_{}_rw10_mean')
mask_mean_rw_20 = _smart_columns('smart_{}_rw20_mean')
mask_mean_rw_30 = _smart_columns('smart_{}_rw30_mean')

# Rolling-standard-deviation columns for the same windows.
mask_std_rw_10 = _smart_columns('smart_{}_rw10_std')
mask_std_rw_20 = _smart_columns('smart_{}_rw20_std')
mask_std_rw_30 = _smart_columns('smart_{}_rw30_std')

# Velocity (change per day) columns.
mask_vel = _smart_columns('smart_{}_vel')

# Difference between the raw value and the rolling mean, per window.
mask_from_mean_10 = _smart_columns('smart_{}_from_mean_10')
mask_from_mean_20 = _smart_columns('smart_{}_from_mean_20')
mask_from_mean_30 = _smart_columns('smart_{}_from_mean_30')

#### 1. Moving average of the SMART values over the w most recent cycles


In [8]:
def moving_average(mask, mask_mean, n, df=None):
    """Compute per-drive rolling means of the given columns.

    Rows are grouped by ``serial_number`` and a trailing window of ``n`` rows
    is averaged within each drive, so a window never spans two drives. The
    first ``n - 1`` rows of every drive are NaN because the window is not yet
    full (pandas' default ``min_periods``).

    Parameters
    ----------
    mask : list of str
        Source column names to average.
    mask_mean : list of str
        Output column names, positionally paired with ``mask``.
    n : int
        Window length, in rows.
    df : pandas.DataFrame, optional
        Frame to read from; defaults to the notebook-level
        ``cleaned_Sgate_4_hard_drive_df``. Its index must be unique.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``mask_mean``, carrying the same index and
        row order as the source frame so it can be concatenated column-wise.

    Raises
    ------
    ValueError
        If ``mask`` and ``mask_mean`` differ in length.
    """
    if len(mask) != len(mask_mean):
        raise ValueError('mask and mask_mean must have the same length')
    if df is None:
        df = cleaned_Sgate_4_hard_drive_df
    rolled = df.groupby(['serial_number'])[mask].rolling(window=n).mean()
    # The rolling result is ordered group by group and indexed by
    # (serial_number, original row label). Drop only the group level and put
    # the rows back in source order: a full reset_index() would renumber the
    # rows positionally and misalign them with the source frame whenever that
    # frame is not already sorted by serial_number.
    rolled = rolled.reset_index(level=0, drop=True).reindex(df.index)
    return rolled.rename(columns=dict(zip(mask, mask_mean)))

In [9]:
# Append the 10-row rolling means as new columns.
rw10_mean_features = moving_average(mask, mask_mean_rw_10, 10)
cleaned_Sgate_4_hard_drive_df = pd.concat(
    [cleaned_Sgate_4_hard_drive_df, rw10_mean_features],
    axis=1,
)

In [10]:
# Append the 20-row rolling means as new columns.
rw20_mean_features = moving_average(mask, mask_mean_rw_20, 20)
cleaned_Sgate_4_hard_drive_df = pd.concat(
    [cleaned_Sgate_4_hard_drive_df, rw20_mean_features],
    axis=1,
)

In [11]:
# Append the 30-row rolling means as new columns.
rw30_mean_features = moving_average(mask, mask_mean_rw_30, 30)
cleaned_Sgate_4_hard_drive_df = pd.concat(
    [cleaned_Sgate_4_hard_drive_df, rw30_mean_features],
    axis=1,
)

In [12]:
# Check that the rolling-mean columns were appended.
cleaned_Sgate_4_hard_drive_df.columns

Index(['date', 'serial_number', 'model', 'failure', 'smart_184_raw',
       'smart_242_raw', 'smart_241_raw', 'smart_194_raw', 'smart_193_raw',
       'smart_9_raw', 'smart_198_raw', 'smart_197_raw', 'smart_188_raw',
       'smart_187_raw', 'smart_5_raw', 'smart_184_rw10_mean',
       'smart_242_rw10_mean', 'smart_241_rw10_mean', 'smart_194_rw10_mean',
       'smart_193_rw10_mean', 'smart_198_rw10_mean', 'smart_197_rw10_mean',
       'smart_188_rw10_mean', 'smart_187_rw10_mean', 'smart_5_rw10_mean',
       'smart_184_rw20_mean', 'smart_242_rw20_mean', 'smart_241_rw20_mean',
       'smart_194_rw20_mean', 'smart_193_rw20_mean', 'smart_198_rw20_mean',
       'smart_197_rw20_mean', 'smart_188_rw20_mean', 'smart_187_rw20_mean',
       'smart_5_rw20_mean', 'smart_184_rw30_mean', 'smart_242_rw30_mean',
       'smart_241_rw30_mean', 'smart_194_rw30_mean', 'smart_193_rw30_mean',
       'smart_198_rw30_mean', 'smart_197_rw30_mean', 'smart_188_rw30_mean',
       'smart_187_rw30_mean', 'smart_5_rw

#### 2. Standard deviation of the sensor values over the w most recent cycles


In [13]:
def moving_std(mask, mask_std, n, df=None):
    """Compute per-drive rolling standard deviations of the given columns.

    Rows are grouped by ``serial_number`` and the sample standard deviation
    (pandas' default ``ddof=1``) of a trailing window of ``n`` rows is taken
    within each drive, so a window never spans two drives. The first
    ``n - 1`` rows of every drive are NaN because the window is not yet full.

    Parameters
    ----------
    mask : list of str
        Source column names.
    mask_std : list of str
        Output column names, positionally paired with ``mask``.
    n : int
        Window length, in rows.
    df : pandas.DataFrame, optional
        Frame to read from; defaults to the notebook-level
        ``cleaned_Sgate_4_hard_drive_df``. Its index must be unique.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``mask_std``, carrying the same index and
        row order as the source frame so it can be concatenated column-wise.

    Raises
    ------
    ValueError
        If ``mask`` and ``mask_std`` differ in length.
    """
    if len(mask) != len(mask_std):
        raise ValueError('mask and mask_std must have the same length')
    if df is None:
        df = cleaned_Sgate_4_hard_drive_df
    rolled = df.groupby(['serial_number'])[mask].rolling(window=n).std()
    # The rolling result is ordered group by group and indexed by
    # (serial_number, original row label). Drop only the group level and put
    # the rows back in source order: a full reset_index() would renumber the
    # rows positionally and misalign them with the source frame whenever that
    # frame is not already sorted by serial_number.
    rolled = rolled.reset_index(level=0, drop=True).reindex(df.index)
    return rolled.rename(columns=dict(zip(mask, mask_std)))

In [14]:
# Append the 10-row rolling standard deviations as new columns.
rw10_std_features = moving_std(mask, mask_std_rw_10, 10)
cleaned_Sgate_4_hard_drive_df = pd.concat(
    [cleaned_Sgate_4_hard_drive_df, rw10_std_features],
    axis=1,
)

In [15]:
# Append the 20-row rolling standard deviations as new columns.
rw20_std_features = moving_std(mask, mask_std_rw_20, 20)
cleaned_Sgate_4_hard_drive_df = pd.concat(
    [cleaned_Sgate_4_hard_drive_df, rw20_std_features],
    axis=1,
)

In [16]:
# Append the 30-row rolling standard deviations as new columns.
rw30_std_features = moving_std(mask, mask_std_rw_30, 30)
cleaned_Sgate_4_hard_drive_df = pd.concat(
    [cleaned_Sgate_4_hard_drive_df, rw30_std_features],
    axis=1,
)

In [17]:
# Check that the rolling-std columns were appended.
cleaned_Sgate_4_hard_drive_df.columns

Index(['date', 'serial_number', 'model', 'failure', 'smart_184_raw',
       'smart_242_raw', 'smart_241_raw', 'smart_194_raw', 'smart_193_raw',
       'smart_9_raw', 'smart_198_raw', 'smart_197_raw', 'smart_188_raw',
       'smart_187_raw', 'smart_5_raw', 'smart_184_rw10_mean',
       'smart_242_rw10_mean', 'smart_241_rw10_mean', 'smart_194_rw10_mean',
       'smart_193_rw10_mean', 'smart_198_rw10_mean', 'smart_197_rw10_mean',
       'smart_188_rw10_mean', 'smart_187_rw10_mean', 'smart_5_rw10_mean',
       'smart_184_rw20_mean', 'smart_242_rw20_mean', 'smart_241_rw20_mean',
       'smart_194_rw20_mean', 'smart_193_rw20_mean', 'smart_198_rw20_mean',
       'smart_197_rw20_mean', 'smart_188_rw20_mean', 'smart_187_rw20_mean',
       'smart_5_rw20_mean', 'smart_184_rw30_mean', 'smart_242_rw30_mean',
       'smart_241_rw30_mean', 'smart_194_rw30_mean', 'smart_193_rw30_mean',
       'smart_198_rw30_mean', 'smart_197_rw30_mean', 'smart_188_rw30_mean',
       'smart_187_rw30_mean', 'smart_5_rw

#### 3. Velocity of change (difference from the drive's previous observation divided by the days elapsed)

In [18]:
per_drive = cleaned_Sgate_4_hard_drive_df.groupby(['serial_number'])

# Change of every SMART column since the same drive's previous row;
# reset_index() moves the row labels into an 'index' column.
grouped_df = per_drive[mask].diff().reset_index()

# Whole days elapsed since the same drive's previous row.
date_gaps = per_drive[['date']].diff().reset_index()
day_diff = date_gaps.date.dt.days

# Row-wise division gives change per day. The first row of each drive is NaN
# because it has no predecessor. The 'index' column is divided as well.
div = grouped_df.divide(day_diff, axis=0)

In [19]:
# Inspect the per-day changes; the frame still carries the 'index' column
# created by reset_index().
div

Unnamed: 0,index,smart_184_raw,smart_242_raw,smart_241_raw,smart_194_raw,smart_193_raw,smart_198_raw,smart_197_raw,smart_188_raw,smart_187_raw,smart_5_raw
0,,,,,,,,,,,
1,1.0,0.0,34223912.0,1234391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,34636497.0,1358318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,36713608.0,1249222.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,34003260.0,1280948.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
5567964,5567964.0,0.0,187703483.0,22793000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5567965,5567965.0,0.0,201798278.0,29150392.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0
5567966,5567966.0,0.0,145384100.0,23075592.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5567967,5567967.0,0.0,149524212.0,41245984.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Swap each raw column name for its velocity name, paired by position.
velocity_names = dict(zip(mask, mask_vel))
div = div.rename(columns=velocity_names)


In [21]:
# Remove the leftover 'index' column so only the velocity columns remain.
div = div.drop(columns=['index'])
div

Unnamed: 0,smart_184_vel,smart_242_vel,smart_241_vel,smart_194_vel,smart_193_vel,smart_198_vel,smart_197_vel,smart_188_vel,smart_187_vel,smart_5_vel
0,,,,,,,,,,
1,0.0,34223912.0,1234391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,34636497.0,1358318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,36713608.0,1249222.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,34003260.0,1280948.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5567964,0.0,187703483.0,22793000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5567965,0.0,201798278.0,29150392.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0
5567966,0.0,145384100.0,23075592.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5567967,0.0,149524212.0,41245984.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Confirm that the velocity frame carries only the *_vel columns.
div.columns

Index(['smart_184_vel', 'smart_242_vel', 'smart_241_vel', 'smart_194_vel',
       'smart_193_vel', 'smart_198_vel', 'smart_197_vel', 'smart_188_vel',
       'smart_187_vel', 'smart_5_vel'],
      dtype='object')

In [23]:
# Append the velocity columns to the main frame.
frames_to_join = [cleaned_Sgate_4_hard_drive_df, div]
cleaned_Sgate_4_hard_drive_df = pd.concat(frames_to_join, axis=1)

In [24]:
# Check that the velocity columns were appended.
cleaned_Sgate_4_hard_drive_df.columns

Index(['date', 'serial_number', 'model', 'failure', 'smart_184_raw',
       'smart_242_raw', 'smart_241_raw', 'smart_194_raw', 'smart_193_raw',
       'smart_9_raw', 'smart_198_raw', 'smart_197_raw', 'smart_188_raw',
       'smart_187_raw', 'smart_5_raw', 'smart_184_rw10_mean',
       'smart_242_rw10_mean', 'smart_241_rw10_mean', 'smart_194_rw10_mean',
       'smart_193_rw10_mean', 'smart_198_rw10_mean', 'smart_197_rw10_mean',
       'smart_188_rw10_mean', 'smart_187_rw10_mean', 'smart_5_rw10_mean',
       'smart_184_rw20_mean', 'smart_242_rw20_mean', 'smart_241_rw20_mean',
       'smart_194_rw20_mean', 'smart_193_rw20_mean', 'smart_198_rw20_mean',
       'smart_197_rw20_mean', 'smart_188_rw20_mean', 'smart_187_rw20_mean',
       'smart_5_rw20_mean', 'smart_184_rw30_mean', 'smart_242_rw30_mean',
       'smart_241_rw30_mean', 'smart_194_rw30_mean', 'smart_193_rw30_mean',
       'smart_198_rw30_mean', 'smart_197_rw30_mean', 'smart_188_rw30_mean',
       'smart_187_rw30_mean', 'smart_5_rw

In [25]:
# Second name for the same object (an alias, not a copy).
# NOTE(review): `df` is not referenced in any later visible cell — confirm it is still needed.
df = cleaned_Sgate_4_hard_drive_df

#### 4. The distance of each observation from the mean of the w most recent observations

In [34]:
def dist_from_mean(mask, mask_mean, mask_from_mean, df=None):
    """Add "distance from rolling mean" columns to the dataframe in place.

    For every positional triple ``(raw, mean, out)`` taken from the three
    lists, column ``out`` is set to ``df[raw] - df[mean]``. The value is a
    signed difference: positive when the observation is above the mean.

    Parameters
    ----------
    mask : list of str
        Raw observation column names.
    mask_mean : list of str
        Rolling-mean column names, positionally paired with ``mask``.
    mask_from_mean : list of str
        Names of the columns to create, positionally paired with ``mask``.
    df : pandas.DataFrame, optional
        Frame to modify; defaults to the notebook-level
        ``cleaned_Sgate_4_hard_drive_df``.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If the three lists differ in length. ``zip`` would otherwise stop at
        the shortest list and silently skip the remaining features.
    """
    if not (len(mask) == len(mask_mean) == len(mask_from_mean)):
        raise ValueError(
            'mask, mask_mean and mask_from_mean must have the same length'
        )
    if df is None:
        df = cleaned_Sgate_4_hard_drive_df
    for raw, sm_mean, from_mean in zip(mask, mask_mean, mask_from_mean):
        df[from_mean] = df[raw] - df[sm_mean]
    

In [35]:
# Difference between each raw value and its 10-row rolling mean.
dist_from_mean(
    mask=mask,
    mask_mean=mask_mean_rw_10,
    mask_from_mean=mask_from_mean_10,
)

In [36]:
# Difference between each raw value and its 20-row rolling mean.
dist_from_mean(
    mask=mask,
    mask_mean=mask_mean_rw_20,
    mask_from_mean=mask_from_mean_20,
)

In [37]:
# Difference between each raw value and its 30-row rolling mean.
dist_from_mean(
    mask=mask,
    mask_mean=mask_mean_rw_30,
    mask_from_mean=mask_from_mean_30,
)

In [38]:
# Print the column index in two slices: the first 50 names, then the rest.
all_columns = cleaned_Sgate_4_hard_drive_df.columns
print(all_columns[:50])
print(all_columns[50:])

Index(['date', 'serial_number', 'model', 'failure', 'smart_184_raw',
       'smart_242_raw', 'smart_241_raw', 'smart_194_raw', 'smart_193_raw',
       'smart_9_raw', 'smart_198_raw', 'smart_197_raw', 'smart_188_raw',
       'smart_187_raw', 'smart_5_raw', 'smart_184_rw10_mean',
       'smart_242_rw10_mean', 'smart_241_rw10_mean', 'smart_194_rw10_mean',
       'smart_193_rw10_mean', 'smart_198_rw10_mean', 'smart_197_rw10_mean',
       'smart_188_rw10_mean', 'smart_187_rw10_mean', 'smart_5_rw10_mean',
       'smart_184_rw20_mean', 'smart_242_rw20_mean', 'smart_241_rw20_mean',
       'smart_194_rw20_mean', 'smart_193_rw20_mean', 'smart_198_rw20_mean',
       'smart_197_rw20_mean', 'smart_188_rw20_mean', 'smart_187_rw20_mean',
       'smart_5_rw20_mean', 'smart_184_rw30_mean', 'smart_242_rw30_mean',
       'smart_241_rw30_mean', 'smart_194_rw30_mean', 'smart_193_rw30_mean',
       'smart_198_rw30_mean', 'smart_197_rw30_mean', 'smart_188_rw30_mean',
       'smart_187_rw30_mean', 'smart_5_rw

In [39]:
# Write the feature-enriched dataframe to disk; index=False leaves the row
# labels out of the file.
output_csv_path = 'features_cleaned_Sgate_4_hard_drive_df.csv'
cleaned_Sgate_4_hard_drive_df.to_csv(output_csv_path, index=False)