# Calculate NA Value Ratio

## Summary
This document gives a function to calculate the na value ratio and it's description

### Imports
Imports should be grouped in the following order:
1. Magics

2. Alphabetical order
    
    A. standard librarby imports
    
    B. related 3rd party imports
    
    C. local application/library specific imports

In [1]:
# Magics
%matplotlib inline
# Do below if you want interactive matplotlib plot ()
# %matplotlib notebook

# Reload modules before executing user code
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

# Show version information for dependency modules
# https://github.com/jrjohansson/version_information
%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas

Software,Version
Python,3.5.2 64bit [MSC v.1900 64 bit (AMD64)]
IPython,5.1.0
OS,Windows 7 6.1.7601 SP1
numpy,1.11.1
scipy,0.18.1
matplotlib,1.5.3
pandas,0.18.1
Fri Dec 09 12:30:11 2016 W. Europe Standard Time,Fri Dec 09 12:30:11 2016 W. Europe Standard Time


In [1]:
# Standard library
import os
import sys
# sys.path.append('../src/')

# Third party imports
import pandas as pd
import math
import pandasql as pdsql
import numpy as np

# Local imports

### Load data

In [2]:
df = pd.read_pickle('../data/wss_n1')
df.head(5)

Unnamed: 0,itapudid,max1stdetectwssc,max1stdetectwssd,max1stdetectwsse,max1stdetectwssf,eventtime
0,163540005001DC915C983,,,,,2016-12-12 13:33:12
1,163540005001DC915C983,,,,,2016-12-12 13:38:10
2,163540005001DC915C983,,,,,2016-12-12 16:31:04
3,163540005001DC915C983,,,,,2017-01-29 08:56:22
4,163540006001DC915B4EF,,,,,2016-12-12 14:04:23


In [3]:
# This itapudid has 4 wss, and the poweroffevents from this itapudid are far less than the others. It is not representative so will be removed.
df = df[df.itapudid != '170540055001DC915C90E'] 

In [4]:
# This function is used to calculate the na value ratio of each itap. It will return a dataframe of na value ratio by itapudid
def cal_ratio(df_in):
    
    df1 = df_in.fillna(0); # replace all the na value data with 0 to do later calculation

    # Calculate the sum of wss speed by each position grouped by itapudid. This results are used to determined the sensor position of the itap.
    df_sum = df1.groupby('itapudid').sum().reset_index()
    # Rename the sum value for better looking
    df_sum = df_sum.rename(columns={"max1stdetectwssc": "sumc", "max1stdetectwssd": "sumd", "max1stdetectwsse": "sume", "max1stdetectwssf": "sumf"})
    df_sum = df_sum.set_index(['itapudid']) #reset the index by itapudid to get the speed of sensor by itapudid
    
    # Add a new column na_value
    df_in = pd.DataFrame(df_in, columns=['itapudid', 'max1stdetectwssc', 'max1stdetectwssd', 'max1stdetectwsse', 'max1stdetectwssf', 'eventtime', 'na_value'])
    df_in = df.reset_index(drop=True)
    
    # Calculate the number of na value each poweroffeevent
    for i in range(df_in.shape[0]):
        df_in.set_value(i,'na_value',cal_na(df_sum, df_in.get_value(i,'itapudid'), df_in.get_value(i,'max1stdetectwssc'),df_in.get_value(i,'max1stdetectwssd'),df_in.get_value(i,'max1stdetectwsse'),df_in.get_value(i,'max1stdetectwssf')))
    
    # Calculate the sum of na_value time by itapudid
    df_in = df_in[['itapudid','na_value']]
 
    df2 = df_in.groupby('itapudid').sum().reset_index()
    df3 = df_in.groupby('itapudid').count().reset_index()
    df3 = df3.rename(columns={"na_value": "count"})

    df_new = pd.merge(df2, df3, how='left', on='itapudid')
    df_new = pd.DataFrame(df_new, columns=['itapudid', 'na_value', 'count', 'na_value_ratio'])
    #calculate the na value ratio per itap
    for i in range(df_new.shape[0]):
        df_new.set_value(i,'na_value_ratio',(df_new.get_value(i,'na_value'))/(df_new.get_value(i,'count')*2))     
    df_new = df_new[['itapudid','na_value_ratio']]
    
    return df_new

In [5]:
# This function is used to calculate how many times na value appears in a single poweroffevent
def cal_na(df_sum, itapudid, wssc, wssd, wsse, wssf):
    nc = 0
    
    # This condition determine whether the sensors position are at C,D or not
    if ((df_sum.ix[itapudid, 'sumc'] > 0) | (df_sum.ix[itapudid, 'sumd'] > 0)):
        if math.isnan(wssc):
            nc += 1
        if math.isnan(wssd):
            nc += 1
    else:
        if math.isnan(wsse):
            nc += 1
        if math.isnan(wssf):
            nc += 1
    return nc

In [6]:
# Calculate the na value ratio
result = cal_ratio(df)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


In [7]:
result.to_pickle('../data/na_ratio_wss')

In [8]:
result

Unnamed: 0,itapudid,na_value_ratio
0,163540005001DC915C983,1.000000
1,163540006001DC915B4EF,1.000000
2,163540011001DC924C17D,0.004306
3,163540014001DC924D43A,0.018853
4,163540015001DC924C174,0.258778
5,163540017001DC924DAFD,0.007313
6,163540018001DC915C935,0.319157
7,163540020001DC924C70B,0.002775
8,163540021001DC9248028,0.999781
9,163540022001DC915C968,0.005491
