In [1]:
# Added by qinglin for Jupyter: reload imported modules automatically before
# each cell runs, so edits to library source take effect without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from asa.dataset import Dataset

# construct dataset

In [5]:
# Fixed seed: re-running this cell (or Restart Kernel -> Run All) regenerates
# exactly the same sample instead of fresh random numbers every time.
rng = np.random.default_rng(0)
n_samples = 1000

x = rng.normal(size=n_samples)                           # standard-normal variable
y = x**2 + rng.normal(size=n_samples)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_samples)   # log|x + y| plus unit-variance noise

In [6]:
# Stack the three samples as rows, then transpose so each variable becomes a column.
data = np.vstack((x, y, z)).T

In [7]:
# No names given: the summary shows the columns falling back to x0, x1, x2.
Dataset(data)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x0' 'x1' 'x2']
  Labels: ['x0' 'x1' 'x2']

In [8]:
# Explicit column names; the summary shows the labels taking the same values.
Dataset(data, names=['x', 'y', 'z'])

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

In [9]:
# Names and labels supplied separately, so each column's label differs from its name.
Dataset(data, names=['x', 'y', 'z'], labels=['x_label', 'y_label', 'z_label'])

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x_label' 'y_label' 'z_label']

In [10]:
# The same array wrapped in a DataFrame with named columns.
data_df = pd.DataFrame(data, columns=['x', 'y', 'z'])

In [11]:
# DataFrame input: the summary shows its column names reused as names and labels.
Dataset(data_df)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

In [12]:
# Show the raw NumPy array, for comparison with the Dataset's stored table in the next cell.
data

array([[-1.21893452,  2.29650891, -2.55720456],
       [-0.21943088,  1.29335721, -0.88632399],
       [ 0.85681375,  0.73257075,  0.27043271],
       ...,
       [-0.48422288, -0.48675169, -0.32625907],
       [ 1.06487691,  2.60082971,  0.76772253],
       [ 0.98862483,  0.97223495,  1.55633418]])

In [13]:
# .data returns the stored table; its values match the raw array displayed above.
Dataset(data_df).data

Unnamed: 0,x,y,z
0,-1.218935,2.296509,-2.557205
1,-0.219431,1.293357,-0.886324
2,0.856814,0.732571,0.270433
3,0.165836,-0.057274,-2.665473
4,-0.791382,1.290133,-1.231284
...,...,...,...
995,-0.823176,1.700775,0.277481
996,1.336907,1.820929,0.574967
997,-0.484223,-0.486752,-0.326259
998,1.064877,2.600830,0.767723


# get data by name

In [14]:
# Fixed seed so the equality checks in this section run against a reproducible sample.
rng = np.random.default_rng(1)
n_samples = 1000

x = rng.normal(size=n_samples)                           # standard-normal variable
y = x**2 + rng.normal(size=n_samples)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_samples)   # log|x + y| plus unit-variance noise

In [15]:
# One row per variable, transposed into an (n_samples, 3) table with one column per variable.
data = np.vstack((x, y, z)).T

In [16]:
# Names and labels are passed positionally here (both are ['x', 'y', 'z']).
# NOTE(review): assumes the positional order is (data, names, labels) — confirm against Dataset's signature.
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [17]:
# Indexing with a single column name returns that column's values.
np.array_equal(dataset['x'], x)

True

In [19]:
# A list of names selects several columns, returned as a 2-D block with one column per name.
np.array_equal(dataset[['x', 'y']], np.array([x, y]).T)

True

You can also select data by positional index, NumPy-style, and combine a row slice with a column name.

In [20]:
# [rows, column position]: every row of column 0 equals x.
np.array_equal(dataset[:, 0], x)

True

In [21]:
# A row slice combined with a column name: rows 3 and 4 of column 'x'.
np.array_equal(dataset[3:5, 'x'], x[3:5])

True

# summary

In [22]:
# Fixed seed so the summary statistics shown in this section are reproducible on re-run.
rng = np.random.default_rng(2)
n_samples = 1000

x = rng.normal(size=n_samples)                           # standard-normal variable
y = x**2 + rng.normal(size=n_samples)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_samples)   # log|x + y| plus unit-variance noise

In [23]:
# Build the (n_samples, 3) input table: variables stacked as rows, then transposed to columns.
data = np.vstack((x, y, z)).T

In [24]:
# Wrap the array, giving the three columns matching names and labels.
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [25]:
# The notebook repr of a Dataset is its summary (shape, names, labels).
dataset

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

In [26]:
# print() produces the same summary text as the repr above.
print(dataset)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']



In [28]:
# summary() with default arguments shows the same shape/names/labels overview.
dataset.summary()

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']



In [16]:
# stats_info=True appends per-column descriptive statistics (count, mean, std, min, quartiles, max).
dataset.summary(stats_info=True)

Dataset summary:
  Data shape: (1000, 3)
  Names: ['x' 'y' 'z']
  Labels: ['x' 'y' 'z']

                 x            y            z
count  1000.000000  1000.000000  1000.000000
mean     -0.025854     1.071247    -0.044440
std       1.015677     1.806471     1.518553
min      -3.838911    -3.227975    -5.862311
25%      -0.708563    -0.091099    -1.003846
50%      -0.015643     0.835123     0.123353
75%       0.667487     1.865543     1.014457
max       3.554479    14.383491     3.916127


# add col

In [37]:
# Fixed seed so the tables displayed in this section are reproducible on re-run.
rng = np.random.default_rng(3)
n_samples = 1000

x = rng.normal(size=n_samples)                           # standard-normal variable
y = x**2 + rng.normal(size=n_samples)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_samples)   # log|x + y| plus unit-variance noise

In [46]:
# Variables stacked row-wise and transposed, giving one column each for x, y and z.
data = np.vstack((x, y, z)).T

In [47]:
# Starting dataset for the column-adding examples: three columns named and labelled x, y, z.
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [48]:
# Fixed seed: the extra columns are regenerated identically on every run.
rng = np.random.default_rng(4)
n_samples = 1000

x2 = rng.normal(size=n_samples)   # new standard-normal column
y2 = 5 * x2                       # exact multiple of x2, easy to verify by eye in the table

# Two new columns side by side, shape (n_samples, 2).
data2 = np.array([x2, y2]).T

In [49]:
# Table before adding columns: only x, y and z.
dataset.data

Unnamed: 0,x,y,z
0,-1.137142,-0.974268,0.909596
1,1.029991,1.074175,0.626206
2,1.552315,2.349624,2.781325
3,-1.699347,3.685266,0.243586
4,0.214618,1.147696,0.111977
...,...,...,...
995,-0.550150,0.938486,-1.859269
996,-0.165516,-0.497314,1.159583
997,-0.209933,-2.624383,0.815311
998,0.198190,-0.476242,-2.123235


In [50]:
# Add both columns of data2 as 'x2' and 'y2'; the dataset is modified in place.
# NOTE(review): the second and third arguments are presumably names and labels (identical here) — confirm order.
dataset.add_col(data2, ['x2', 'y2'], ['x2', 'y2'])

In [51]:
# The table now has the extra columns x2 and y2 after x, y and z.
dataset.data

Unnamed: 0,x,y,z,x2,y2
0,-1.137142,-0.974268,0.909596,0.348718,1.743589
1,1.029991,1.074175,0.626206,-0.124680,-0.623402
2,1.552315,2.349624,2.781325,1.701707,8.508534
3,-1.699347,3.685266,0.243586,0.869031,4.345154
4,0.214618,1.147696,0.111977,-0.187036,-0.935178
...,...,...,...,...,...
995,-0.550150,0.938486,-1.859269,-0.116637,-0.583184
996,-0.165516,-0.497314,1.159583,0.875178,4.375890
997,-0.209933,-2.624383,0.815311,-0.774655,-3.873276
998,0.198190,-0.476242,-2.123235,-0.401572,-2.007860


In [52]:
# Labels were extended with the labels of the new columns.
dataset.labels

array(['x', 'y', 'z', 'x2', 'y2'], dtype='<U2')

In [53]:
# Names were extended with the names of the new columns.
dataset.names

array(['x', 'y', 'z', 'x2', 'y2'], dtype='<U2')

In [54]:
# A single 1-D array can be added with a plain string name and label.
dataset.add_col(x2, 'x3', 'x3')

In [55]:
# The new last column x3 holds the same values as x2.
dataset.data

Unnamed: 0,x,y,z,x2,y2,x3
0,-1.137142,-0.974268,0.909596,0.348718,1.743589,0.348718
1,1.029991,1.074175,0.626206,-0.124680,-0.623402,-0.124680
2,1.552315,2.349624,2.781325,1.701707,8.508534,1.701707
3,-1.699347,3.685266,0.243586,0.869031,4.345154,0.869031
4,0.214618,1.147696,0.111977,-0.187036,-0.935178,-0.187036
...,...,...,...,...,...,...
995,-0.550150,0.938486,-1.859269,-0.116637,-0.583184,-0.116637
996,-0.165516,-0.497314,1.159583,0.875178,4.375890,0.875178
997,-0.209933,-2.624383,0.815311,-0.774655,-3.873276,-0.774655
998,0.198190,-0.476242,-2.123235,-0.401572,-2.007860,-0.401572


In [56]:
# 'x3' now appears at the end of the names.
dataset.names

array(['x', 'y', 'z', 'x2', 'y2', 'x3'], dtype='<U2')

In [57]:
# Item assignment under a new name also adds a column.
dataset['x4'] = x

In [58]:
# 'x4' now appears at the end of the names.
dataset.names

array(['x', 'y', 'z', 'x2', 'y2', 'x3', 'x4'], dtype='<U2')

In [61]:
# The x4 column holds the same values as the x column.
dataset.data

Unnamed: 0,x,y,z,x2,y2,x3,x4
0,-1.137142,-0.974268,0.909596,0.348718,1.743589,0.348718,-1.137142
1,1.029991,1.074175,0.626206,-0.124680,-0.623402,-0.124680,1.029991
2,1.552315,2.349624,2.781325,1.701707,8.508534,1.701707,1.552315
3,-1.699347,3.685266,0.243586,0.869031,4.345154,0.869031,-1.699347
4,0.214618,1.147696,0.111977,-0.187036,-0.935178,-0.187036,0.214618
...,...,...,...,...,...,...,...
995,-0.550150,0.938486,-1.859269,-0.116637,-0.583184,-0.116637,-0.550150
996,-0.165516,-0.497314,1.159583,0.875178,4.375890,0.875178,-0.165516
997,-0.209933,-2.624383,0.815311,-0.774655,-3.873276,-0.774655,-0.209933
998,0.198190,-0.476242,-2.123235,-0.401572,-2.007860,-0.401572,0.198190


In [62]:
# Confirm programmatically that the assigned column equals the original x column.
np.array_equal(dataset['x4'], dataset['x'])

True

# add row

In [95]:
# Fixed seed so the tiny example table below is the same on every run.
rng = np.random.default_rng(5)
n_rows = 2  # kept small so the whole table is visible before and after appending

x = rng.normal(size=n_rows)                           # standard-normal variable
y = x**2 + rng.normal(size=n_rows)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_rows)   # log|x + y| plus unit-variance noise

# One column per variable, shape (n_rows, 3).
data = np.array([x, y, z]).T

In [100]:
# Starting dataset for the row-appending example: two rows, columns x, y, z.
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [101]:
# The two-row table before appending.
dataset.data

Unnamed: 0,x,y,z
0,1.233423,0.604179,-0.128928
1,-0.425753,-0.458585,-0.02392


In [102]:
# Append the rows of data to the dataset in place (here, the same two rows again).
dataset.add_row(data)

In [103]:
# Four rows now: the original two followed by the appended ones, indexed 0-3.
dataset.data

Unnamed: 0,x,y,z
0,1.233423,0.604179,-0.128928
1,-0.425753,-0.458585,-0.02392
2,1.233423,0.604179,-0.128928
3,-0.425753,-0.458585,-0.02392


# delete col

In [108]:
# Fixed seed so the tiny example table below is the same on every run.
rng = np.random.default_rng(6)
n_rows = 2  # kept small so the whole table is visible before and after deleting columns

x = rng.normal(size=n_rows)                           # standard-normal variable
y = x**2 + rng.normal(size=n_rows)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_rows)   # log|x + y| plus unit-variance noise

# One column per variable, shape (n_rows, 3).
data = np.array([x, y, z]).T

In [109]:
# Starting dataset for the column-deletion examples: columns x, y, z.
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [110]:
# The table before any column is deleted.
dataset.data

Unnamed: 0,x,y,z
0,-0.443916,0.493411,-4.114659
1,-1.62954,0.429475,0.724056


In [111]:
# Delete a column by position: index 1 is 'y'. The dataset is modified in place.
dataset.del_col(1)

In [112]:
# Only the x and z columns remain.
dataset.data

Unnamed: 0,x,z
0,-0.443916,-4.114659
1,-1.62954,0.724056


In [113]:
# Shape, names and labels all reflect the removed column.
dataset.summary()

Dataset summary:
  Data shape: (2, 2)
  Names: ['x' 'z']
  Labels: ['x' 'z']



In [114]:
# Columns can also be deleted by name.
dataset.del_col('x')

In [115]:
# Only the z column is left.
dataset.data

Unnamed: 0,z
0,-4.114659
1,0.724056


In [116]:
# The summary now reports a single column, 'z'.
dataset.summary()

Dataset summary:
  Data shape: (2, 1)
  Names: ['z']
  Labels: ['z']



# delete row

In [136]:
# Fixed seed so the tiny example table below is the same on every run.
rng = np.random.default_rng(7)
n_rows = 5  # kept small so every row is visible before and after deleting rows

x = rng.normal(size=n_rows)                           # standard-normal variable
y = x**2 + rng.normal(size=n_rows)                    # quadratic in x plus unit-variance noise
z = np.log(np.abs(x + y)) + rng.normal(size=n_rows)   # log|x + y| plus unit-variance noise

# One column per variable, shape (n_rows, 3).
data = np.array([x, y, z]).T

In [137]:
# Starting dataset for the row-deletion examples: five rows, columns x, y, z.
dataset = Dataset(data, ['x', 'y', 'z'], ['x', 'y', 'z'])

In [138]:
# The five-row table before any row is deleted.
dataset.data

Unnamed: 0,x,y,z
0,1.206616,0.507058,-0.204432
1,1.882319,3.982455,2.71117
2,1.653985,2.708242,1.677328
3,-0.733088,0.431928,-1.673366
4,0.066111,-0.32473,-2.248367


In [139]:
# Delete a single row by position (row 1); the dataset is modified in place.
dataset.del_row(1)

In [140]:
# Row 1 is gone and the remaining rows are renumbered 0-3.
dataset.data

Unnamed: 0,x,y,z
0,1.206616,0.507058,-0.204432
1,1.653985,2.708242,1.677328
2,-0.733088,0.431928,-1.673366
3,0.066111,-0.32473,-2.248367


In [141]:
# A list deletes several rows at once; positions refer to the current (already renumbered) table.
dataset.del_row([1, 2])


In [142]:
# Two rows remain, renumbered 0-1.
dataset.data

Unnamed: 0,x,y,z
0,1.206616,0.507058,-0.204432
1,0.066111,-0.32473,-2.248367
