In [73]:
import pandas as pd
import requests as re
import numpy as np

In [74]:
# Download the raw cloud data file. Note that `re` is the alias given to the
# `requests` package in the import cell, not the stdlib regex module.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# an error page be written to disk as if it were data.
cloud = re.get('https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/taylor/cloud.data',
               timeout=30)
cloud.raise_for_status()

In [75]:
# Persist the downloaded bytes. The context manager guarantees the handle is
# flushed and closed before the file is opened again for reading.
with open('clouds.data', 'wb') as out_file:
    bytes_written = out_file.write(cloud.content)

# Keep the byte count as the cell's displayed result.
bytes_written

209266

In [76]:
# Tokenise the raw file: every line becomes a list of its non-empty,
# single-space-separated pieces. Newline characters are not stripped here.
with open("clouds.data", 'r') as f:
    data = [list(filter(None, raw_line.split(' '))) for raw_line in f]

In [77]:
columns = ['visible_mean', 'visible_max', 'visible_min',
           'visible_mean_distribution', 'visible_contrast',
           'visible_entropy', 'visible_second_angular_momentum',
           'IR_mean', 'IR_max', 'IR_min']


def _parse_cloud_rows(rows, label):
    """Convert tokenised text rows into a float DataFrame tagged with a class.

    Parameters
    ----------
    rows : list of list of str
        One list of numeric tokens per observation, in the order of ``columns``.
    label : int or float
        Value stored (as a float) in the added ``class`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, the ten ``columns`` plus ``class``.

    Raises
    ------
    ValueError
        If a token cannot be converted to float.
    """
    # The tokens can still carry the line's trailing newline. The previous code
    # tried to remove it with replace('/n', ''), a typo for the newline escape
    # that never matched anything; strip() removes the whitespace explicitly.
    parsed = [[float(token.strip()) for token in row] for row in rows]
    frame = pd.DataFrame(parsed, columns=columns)
    frame['class'] = np.full(len(frame), float(label))
    return frame


# Line ranges of the two data sections inside the tokenised file.
first_cloud = _parse_cloud_rows(data[53:1077], label=0)

# NOTE(review): this slice holds 1023 rows while the first one holds 1024;
# confirm against the file whether the end bound should be 2106.
second_cloud = _parse_cloud_rows(data[1082:2105], label=1)

# `data` is rebound here from the list of token rows to the combined frame, so
# this cell cannot be re-run without first re-running the file-reading cell.
# concat keeps each frame's own index, so index labels repeat across classes.
data = pd.concat([first_cloud, second_cloud])

In [78]:
def additional_features(features, additional):
    """Add the requested derived columns to ``features`` and return it.

    Supported feature names and the columns they are computed from:

    * ``'log_entropy'``        -- natural log of ``visible_entropy``
    * ``'entropy_x_contrast'`` -- ``visible_contrast * visible_entropy``
    * ``'IR_range'``           -- ``IR_max - IR_min``
    * ``'IR_norm_range'``      -- ``(IR_max - IR_min) / IR_mean``

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the source columns listed above. It is modified in
        place; pass a copy if the original must stay untouched.
    additional : iterable of str
        Names of the derived features to add, in the order the new columns
        should be appended.

    Returns
    -------
    pandas.DataFrame
        The same ``features`` object, with one new column per requested name.

    Raises
    ------
    ValueError
        If any requested name is not supported. All names are checked before
        the first column is written, so ``features`` is left unchanged.
    """
    builders = {
        'log_entropy': lambda df: df['visible_entropy'].apply(np.log),
        'entropy_x_contrast': lambda df: df['visible_contrast'].multiply(df['visible_entropy']),
        'IR_range': lambda df: df['IR_max'] - df['IR_min'],
        'IR_norm_range': lambda df: (df['IR_max'] - df['IR_min']).divide(df['IR_mean']),
    }

    # Materialise once so a one-shot iterable can be validated and then applied.
    requested = list(additional)

    # Validate everything up front: a bad name late in the list must not leave
    # the caller's frame with only some of the new columns added.
    unknown = [name for name in requested
               if not (isinstance(name, str) and name in builders)]
    if unknown:
        raise ValueError(
            'This additional feature cannot be added: {}'.format(
                ', '.join(repr(name) for name in unknown)))

    for name in requested:
        features[name] = builders[name](features)
    return features

In [79]:
# Display the combined frame; the rendered output abbreviates the middle rows.
data

Unnamed: 0,visible_mean,visible_max,visible_min,visible_mean_distribution,visible_contrast,visible_entropy,visible_second_angular_momentum,IR_mean,IR_max,IR_min,class
0,3.0000,140.0000,43.5000,0.0833,862.8417,0.0254,3.8890,163.0000,240.0000,213.3555,0.0
1,3.0000,135.0000,41.9063,0.0790,690.3291,0.0259,3.8340,167.0000,239.0000,213.7188,0.0
2,2.0000,126.0000,21.0586,0.0406,308.3583,0.0684,3.1702,174.0000,240.0000,227.5859,0.0
3,4.0000,197.0000,77.4805,0.0890,874.4709,0.0243,3.9442,155.0000,239.0000,197.2773,0.0
4,7.0000,193.0000,88.8398,0.0884,810.1126,0.0223,3.9318,150.0000,236.0000,186.0195,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1018,16.9689,58.6755,40.9075,3.0208,18.2875,0.1375,2.1754,-60.6180,20.7812,-16.1559,1.0
1019,6.0048,57.3856,23.0036,5.1917,48.4833,0.0828,2.6890,-24.0380,33.7879,19.0423,1.0
1020,5.3599,48.1413,13.9600,1.5125,9.3375,0.3567,1.5600,-78.0507,35.0668,14.7995,1.0
1021,7.9397,48.7863,20.7169,1.7500,7.8167,0.2291,1.7824,-79.7594,30.7739,-11.1662,1.0


In [80]:
# Take the first five rows (by position) as a small sample to build a fixture from.
data_subset = data.iloc[:5]

In [81]:
# Raw cell values of the sample; presumably the source of the literals used to
# rebuild the frame by hand in df_in_values.
data_subset.values

array([[3.0, 140.0, 43.5, 0.0833, 862.8417, 0.0254, 3.889, 163.0, 240.0,
        213.3555, 0.0],
       [3.0, 135.0, 41.9063, 0.079, 690.3291, 0.0259, 3.834, 167.0,
        239.0, 213.7188, 0.0],
       [2.0, 126.0, 21.0586, 0.0406, 308.3583, 0.0684, 3.1702, 174.0,
        240.0, 227.5859, 0.0],
       [4.0, 197.0, 77.4805, 0.089, 874.4709, 0.0243, 3.9442, 155.0,
        239.0, 197.2773, 0.0],
       [7.0, 193.0, 88.8398, 0.0884, 810.1126, 0.0223, 3.9318, 150.0,
        236.0, 186.0195, 0.0]])

In [82]:
# Row labels of the sample (needed to rebuild an identical frame).
data_subset.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [83]:
# Column labels of the sample (needed to rebuild an identical frame).
data_subset.columns

Index(['visible_mean', 'visible_max', 'visible_min',
       'visible_mean_distribution', 'visible_contrast', 'visible_entropy',
       'visible_second_angular_momentum', 'IR_mean', 'IR_max', 'IR_min',
       'class'],
      dtype='object')

In [84]:
# Check whether the index carries a name; no output is rendered, i.e. it is None.
data_subset.index.name

In [85]:
# Check whether the columns axis carries a name; no output is rendered, i.e. it is None.
data_subset.columns.name

In [86]:
# Hand-written copy of the five sample rows, spelled out as literals so the
# fixture can be built without the downloaded file.
df_in_columns = [
    'visible_mean',
    'visible_max',
    'visible_min',
    'visible_mean_distribution',
    'visible_contrast',
    'visible_entropy',
    'visible_second_angular_momentum',
    'IR_mean',
    'IR_max',
    'IR_min',
    'class',
]

df_in_index = list(range(5))

# One inner list per row, values in the order of df_in_columns.
df_in_values = [
    [3.0, 140.0, 43.5, 0.0833, 862.8417, 0.0254, 3.889, 163.0, 240.0, 213.3555, 0.0],
    [3.0, 135.0, 41.9063, 0.079, 690.3291, 0.0259, 3.834, 167.0, 239.0, 213.7188, 0.0],
    [2.0, 126.0, 21.0586, 0.0406, 308.3583, 0.0684, 3.1702, 174.0, 240.0, 227.5859, 0.0],
    [4.0, 197.0, 77.4805, 0.089, 874.4709, 0.0243, 3.9442, 155.0, 239.0, 197.2773, 0.0],
    [7.0, 193.0, 88.8398, 0.0884, 810.1126, 0.0223, 3.9318, 150.0, 236.0, 186.0195, 0.0],
]

df_in = pd.DataFrame(data=df_in_values, index=df_in_index, columns=df_in_columns)

In [87]:
# Confirm the hand-built frame is identical to the sample taken from the parsed data.
df_in.equals(data_subset)

True

In [88]:
# additional_features writes its new columns into the frame it receives, so
# hand it a copy: df_in keeps its original columns and the earlier
# df_in.equals(data_subset) check still holds if that cell is re-run.
df_out = additional_features(df_in.copy(), ['log_entropy', 'entropy_x_contrast', 'IR_range', 'IR_norm_range'])
df_out

Unnamed: 0,visible_mean,visible_max,visible_min,visible_mean_distribution,visible_contrast,visible_entropy,visible_second_angular_momentum,IR_mean,IR_max,IR_min,class,log_entropy,entropy_x_contrast,IR_range,IR_norm_range
0,3.0,140.0,43.5,0.0833,862.8417,0.0254,3.889,163.0,240.0,213.3555,0.0,-3.673006,21.916179,26.6445,0.163463
1,3.0,135.0,41.9063,0.079,690.3291,0.0259,3.834,167.0,239.0,213.7188,0.0,-3.653512,17.879524,25.2812,0.151384
2,2.0,126.0,21.0586,0.0406,308.3583,0.0684,3.1702,174.0,240.0,227.5859,0.0,-2.682382,21.091708,12.4141,0.071345
3,4.0,197.0,77.4805,0.089,874.4709,0.0243,3.9442,155.0,239.0,197.2773,0.0,-3.717279,21.249643,41.7227,0.269179
4,7.0,193.0,88.8398,0.0884,810.1126,0.0223,3.9318,150.0,236.0,186.0195,0.0,-3.803169,18.065511,49.9805,0.333203


In [89]:
# Expected result, built independently of additional_features: each base row
# is extended with the four derived values computed by plain scalar arithmetic
# (log of entropy, contrast * entropy, IR max - min, and that range / IR mean).
df_true = pd.DataFrame(
    [
        base + [
            np.log(base[5]),
            base[4] * base[5],
            base[8] - base[9],
            (base[8] - base[9]) / base[7],
        ]
        for base in [
            [3.0, 140.0, 43.5, 0.0833, 862.8417, 0.0254, 3.889, 163.0, 240.0, 213.3555, 0.0],
            [3.0, 135.0, 41.9063, 0.079, 690.3291, 0.0259, 3.834, 167.0, 239.0, 213.7188, 0.0],
            [2.0, 126.0, 21.0586, 0.0406, 308.3583, 0.0684, 3.1702, 174.0, 240.0, 227.5859, 0.0],
            [4.0, 197.0, 77.4805, 0.089, 874.4709, 0.0243, 3.9442, 155.0, 239.0, 197.2773, 0.0],
            [7.0, 193.0, 88.8398, 0.0884, 810.1126, 0.0223, 3.9318, 150.0, 236.0, 186.0195, 0.0],
        ]
    ],
    index=list(range(5)),
    columns=[
        'visible_mean', 'visible_max', 'visible_min',
        'visible_mean_distribution', 'visible_contrast', 'visible_entropy',
        'visible_second_angular_momentum', 'IR_mean', 'IR_max', 'IR_min',
        'class',
        'log_entropy', 'entropy_x_contrast', 'IR_range', 'IR_norm_range',
    ],
)

In [90]:
# Confirm the function's output matches the independently computed expectation.
df_true.equals(df_out)

True

In [91]:
def test_additional_features():
    """Happy path: all four derived features are appended with correct values.

    Builds a five-row fixture, computes the expected derived values row by row
    with scalar arithmetic, and compares the result of ``additional_features``
    against that expectation.
    """
    df_in_values = [[3.000000e+00, 1.400000e+02, 4.350000e+01, 8.330000e-02,
                     8.628417e+02, 2.540000e-02, 3.889000e+00, 1.630000e+02,
                     2.400000e+02, 2.133555e+02, 0.000000e+00],
                    [3.000000e+00, 1.350000e+02, 4.190630e+01, 7.900000e-02,
                     6.903291e+02, 2.590000e-02, 3.834000e+00, 1.670000e+02,
                     2.390000e+02, 2.137188e+02, 0.000000e+00],
                    [2.000000e+00, 1.260000e+02, 2.105860e+01, 4.060000e-02,
                     3.083583e+02, 6.840000e-02, 3.170200e+00, 1.740000e+02,
                     2.400000e+02, 2.275859e+02, 0.000000e+00],
                    [4.000000e+00, 1.970000e+02, 7.748050e+01, 8.900000e-02,
                     8.744709e+02, 2.430000e-02, 3.944200e+00, 1.550000e+02,
                     2.390000e+02, 1.972773e+02, 0.000000e+00],
                    [7.000000e+00, 1.930000e+02, 8.883980e+01, 8.840000e-02,
                     8.101126e+02, 2.230000e-02, 3.931800e+00, 1.500000e+02,
                     2.360000e+02, 1.860195e+02, 0.000000e+00]]

    df_in_index = [0, 1, 2, 3, 4]

    df_in_columns = ['visible_mean', 'visible_max', 'visible_min',
                     'visible_mean_distribution', 'visible_contrast', 'visible_entropy',
                     'visible_second_angular_momentum', 'IR_mean', 'IR_max', 'IR_min',
                     'class']

    new_columns = ['log_entropy', 'entropy_x_contrast', 'IR_range', 'IR_norm_range']

    df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)

    # Expected rows are derived from the fixture itself, so the input literals
    # exist in one place only and cannot drift apart from the expectation.
    # Positions: 4 = visible_contrast, 5 = visible_entropy,
    #            7 = IR_mean, 8 = IR_max, 9 = IR_min.
    expected_rows = [
        row + [np.log(row[5]),
               row[4] * row[5],
               row[8] - row[9],
               (row[8] - row[9]) / row[7]]
        for row in df_in_values
    ]
    df_true = pd.DataFrame(expected_rows, index=df_in_index,
                           columns=df_in_columns + new_columns)

    df_test = additional_features(df_in, new_columns)

    # assert_frame_equal reports which column/value differs on failure and
    # compares floats with a small tolerance rather than bit-for-bit.
    pd.testing.assert_frame_equal(df_test, df_true)

In [92]:
def test_additional_features_unhappy():
    """Unhappy path: an unsupported feature name must raise ValueError."""
    df_in_values = [[3.000000e+00, 1.400000e+02, 4.350000e+01, 8.330000e-02,
                     8.628417e+02, 2.540000e-02, 3.889000e+00, 1.630000e+02,
                     2.400000e+02, 2.133555e+02, 0.000000e+00],
                    [3.000000e+00, 1.350000e+02, 4.190630e+01, 7.900000e-02,
                     6.903291e+02, 2.590000e-02, 3.834000e+00, 1.670000e+02,
                     2.390000e+02, 2.137188e+02, 0.000000e+00],
                    [2.000000e+00, 1.260000e+02, 2.105860e+01, 4.060000e-02,
                     3.083583e+02, 6.840000e-02, 3.170200e+00, 1.740000e+02,
                     2.400000e+02, 2.275859e+02, 0.000000e+00],
                    [4.000000e+00, 1.970000e+02, 7.748050e+01, 8.900000e-02,
                     8.744709e+02, 2.430000e-02, 3.944200e+00, 1.550000e+02,
                     2.390000e+02, 1.972773e+02, 0.000000e+00],
                    [7.000000e+00, 1.930000e+02, 8.883980e+01, 8.840000e-02,
                     8.101126e+02, 2.230000e-02, 3.931800e+00, 1.500000e+02,
                     2.360000e+02, 1.860195e+02, 0.000000e+00]]

    df_in_index = [0, 1, 2, 3, 4]

    df_in_columns = ['visible_mean', 'visible_max', 'visible_min',
                     'visible_mean_distribution', 'visible_contrast', 'visible_entropy',
                     'visible_second_angular_momentum', 'IR_mean', 'IR_max', 'IR_min',
                     'class']

    df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)

    # Plain try/except/else rather than pytest.raises: pytest is not imported
    # in the notebook's import cell, so referencing it here would fail with a
    # NameError before the behaviour under test was ever exercised.
    try:
        additional_features(df_in, ['wrong_feature'])
    except ValueError:
        pass
    else:
        raise AssertionError('additional_features accepted an unknown feature name')