## Create an example data file

In [1]:
from extractor import Extractor, create_example

# Write the demo workbook that the rest of this notebook loads.
create_example( '../data/example.xlsx')

## Load file

In [2]:
# Point an Extractor at the example workbook created above.
ex = Extractor('../data/example.xlsx')
# Load the file (presumably parses the workbook into the extractor's table — confirm in extractor.py).
ex.read()
# Last expression of the cell, so the current data table is displayed.
ex.get_data()

Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,,
1,Scatter 1,P1,,0.121675,0.313068,1.76405,,
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,,
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,,
4,Scatter 1,P1,,1.49408,0.653619,1.76405,,
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088
7,Scatter 2,,P4,0.410599,,2.26975,1.46936,0.950088
8,Scatter 2,,P4,0.410599,,-1.45437,0.154947,0.950088
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088


## Extend features

### Using a transformation

A user-defined function can be passed to `apply_transform` to compute one column from another. Only entries that are NaN in the target column are filled; existing values are left untouched.

In [3]:
import numpy as np

# Derive a new column 'G' from column 'B' using the natural logarithm;
# rows where 'B' is NaN stay NaN in 'G'.
ex.apply_transform('B', 'G', np.log)

def linear_func(x):
    """Return ``x`` doubled.

    Works for anything that supports multiplication by an integer
    (plain numbers, NumPy arrays, pandas Series), so it can be handed
    to ``apply_transform`` as a column transform.
    """
    doubled = x * 2
    return doubled

# Fill 'C' from 'E' via linear_func (C = 2 * E); only rows whose 'C' is NaN are written.
ex.apply_transform('E', 'C', linear_func)

# Same transform from 'B' into 'E', restricted to rows 7, 8 and 10.
ex.apply_transform('B', 'E', linear_func, [7, 8, 10]) # row 10 already has a non-NaN 'E', so only rows 7 and 8 are filled

# Display the table with the new and updated columns.
ex.get_data()

Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B,G
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,-0.410317,,
1,Scatter 1,P1,,0.121675,0.313068,1.76405,0.626135,,
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,-1.70819,,
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,-5.10598,,
4,Scatter 1,P1,,1.49408,0.653619,1.76405,1.30724,,
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088,-0.0512
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088,-0.0512
7,Scatter 2,,P4,0.410599,1.900177,2.26975,1.46936,0.950088,-0.0512
8,Scatter 2,,P4,0.410599,1.900177,-1.45437,0.154947,0.950088,-0.0512
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088,-0.0512


### with curves

See which curves are available.

In [4]:
# Names of the curves known to the extractor.
print(ex.get_curves())
# For each curve, the pair of features it relates, in the same order as the
# names above (e.g. 'Curve 3' links 'A' and 'F').
print(ex.get_curve_relation())

['Curve 1', 'Curve 2', 'Curve 3']
[['C', 'D'], ['A', 'C'], ['A', 'F']]


Add a new feature 'F' using the third curve. Values are obtained by linear interpolation. The example curves are linear with no bias term.

In [5]:
# get_curves()[2] is the third curve, 'Curve 3', which relates 'A' and 'F'.
# NOTE(review): direction=0 appears to mean "first feature -> second feature" (A -> F),
# and ignore_property=True appears to skip matching on the Property columns — confirm in extractor.py.
ex.interpolate_from_curve(ex.get_curves()[2], direction=0, ignore_property=True)
ex.get_data()

Interpolated data exceeds the curve range.
Interpolated feature "F" from feature "A" using "Curve 3"
	Indexes [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B,G,F
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,-0.410317,,,1.764052
1,Scatter 1,P1,,0.121675,0.313068,1.76405,0.626135,,,1.764052
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,-1.70819,,,1.764052
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,-5.10598,,,1.764052
4,Scatter 1,P1,,1.49408,0.653619,1.76405,1.30724,,,1.764052
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088,-0.0512,0.864436
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088,-0.0512,-0.742165
7,Scatter 2,,P4,0.410599,1.900177,2.26975,1.46936,0.950088,-0.0512,2.0
8,Scatter 2,,P4,0.410599,1.900177,-1.45437,0.154947,0.950088,-0.0512,-1.454366
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088,-0.0512,0.045759


## Fill missing data

Restore the original data first.

In [6]:
# Drop the columns and values added in the previous cells and go back to the data as originally read.
ex.restore_data()
ex.get_data()

Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,,
1,Scatter 1,P1,,0.121675,0.313068,1.76405,,
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,,
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,,
4,Scatter 1,P1,,1.49408,0.653619,1.76405,,
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088
7,Scatter 2,,P4,0.410599,,2.26975,1.46936,0.950088
8,Scatter 2,,P4,0.410599,,-1.45437,0.154947,0.950088
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088


### with curves

The function `interpolate_from_curves` automatically applies the curves to interpolate missing values.

In [7]:
# Let the extractor choose and apply curves itself to fill missing values.
# NOTE(review): with ignore_property=False the curves are presumably matched against the
# Property columns before being applied — confirm in extractor.py.
ex.interpolate_from_curves(ignore_property=False)
ex.get_data()

Interpolated data exceeds the curve range.
Interpolated feature "F" from feature "A" using "Curve 3"
	Indexes [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Interpolated feature "C" from feature "D" using "Curve 1"
	Indexes [0, 1, 2, 3, 4]


Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B,F
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,0.761038,,1.764052
1,Scatter 1,P1,,0.121675,0.313068,1.76405,0.121675,,1.764052
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,0.443863,,1.764052
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,0.333674,,1.764052
4,Scatter 1,P1,,1.49408,0.653619,1.76405,1.49408,,1.764052
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088,0.864436
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088,-0.742165
7,Scatter 2,,P4,0.410599,,2.26975,1.46936,0.950088,2.0
8,Scatter 2,,P4,0.410599,,-1.45437,0.154947,0.950088,-1.454366
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088,0.045759


### with machine learning models

Take `RandomForestRegressor` from `sklearn` as an example: missing values are filled by training an ML model on the other features.

In [8]:
# Start again from the original data.
ex.restore_data()
# Fill the NaNs in 'B' with a random-forest model. remove_na_axis=1 drops the other
# features (columns) that contain NaN, which leaves 'D' and 'A' as predictors.
# NOTE(review): random forests are stochastic — check whether fill_na fixes a seed,
# otherwise the filled values and the R2 score may differ between runs.
ex.fill_na('B', criterion='RandomForest', remove_na_axis=1)
ex.get_data()

Fill NaN of feature B
	Predictors:  ['D', 'A'] 
	Training set [5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 
	Pred set [0, 1, 2, 3, 4]
	R2 score 0.91156.


Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,,1.03266
1,Scatter 1,P1,,0.121675,0.313068,1.76405,,1.39047
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,,1.03266
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,,1.03266
4,Scatter 1,P1,,1.49408,0.653619,1.76405,,1.39047
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088
7,Scatter 2,,P4,0.410599,,2.26975,1.46936,0.950088
8,Scatter 2,,P4,0.410599,,-1.45437,0.154947,0.950088
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088


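The parameter `remove_na_axis` selects which axis is dropped when NaNs appear in the other features: `1` drops the affected columns (features), as in the previous cell, while `0` drops the affected rows.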

In [9]:
# Start again from the original data.
ex.restore_data()
# remove_na_axis=0 drops rows instead of columns.
ex.fill_na('B', criterion='RandomForest', remove_na_axis=0) # all rows are removed, so 'B' is left unchanged
ex.get_data()

Fill NaN of feature B
	No NaN to be filled.


Unnamed: 0,Scatter,Property 1,Property 2,D,E,A,C,B
0,Scatter 1,P1,,0.761038,-0.205158,1.76405,,
1,Scatter 1,P1,,0.121675,0.313068,1.76405,,
2,Scatter 1,P1,,0.443863,-0.854096,1.76405,,
3,Scatter 1,P1,,0.333674,-2.55299,1.76405,,
4,Scatter 1,P1,,1.49408,0.653619,1.76405,,
5,Scatter 2,,P4,0.410599,,0.864436,-0.187184,0.950088
6,Scatter 2,,P4,0.410599,,-0.742165,1.53278,0.950088
7,Scatter 2,,P4,0.410599,,2.26975,1.46936,0.950088
8,Scatter 2,,P4,0.410599,,-1.45437,0.154947,0.950088
9,Scatter 2,,P4,0.410599,,0.0457585,0.378163,0.950088
