In [109]:
import numpy as np
import pandas as pd
import pytest

In [5]:
df = pd.read_csv('../../data/models/df_cleaned.csv')
df.head(3)

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,8.87,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,3.93,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,1.65,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i


# Select features and one hot encode non-tag features

In [121]:
def one_hot_encode(df, trail_id, tag_features, non_tag_features, response):
    # Get column names of categorical and numerical variables
    df_features = df[non_tag_features]
    cat_names = df_features.select_dtypes(include='object').columns
    num_names = df_features.select_dtypes(include=np.number).columns

    # Encode categorical variables
    enc_columns = pd.get_dummies(df_features[cat_names], drop_first=True)
    enc_columns.columns = [i.replace(' ', '_') for i in enc_columns.columns]

    # Concatenate encoded columns to numerical columns, and tag features
    df_enc = pd.concat([df[trail_id], df_features[num_names], enc_columns,
                        df[tag_features], df[response]], axis=1)

    return df_enc

# One hot encode tags and save unique tags
One hot encode `features` and `activities` columns.

In [122]:
def list_to_text(ls, path):
    textfile = open(path, "w")
    for element in ls:
        textfile.write(element + "\n")
    textfile.close()

In [123]:
def expand_column(df, column_name, path):
    """"""
    # Clean and split the elements by comma
    split_series = [i.strip('[]') for i in df[column_name]]
    split_series = pd.Series([i.replace("\'", "").replace("-", "_").split(', ') for i in split_series])

    # Save the column's unique values
    column_list = list(split_series)
    flat_list = [item for sublist in column_list for item in sublist]
    flat_list = [x for x in flat_list if x != '']
    unique_val = np.unique(flat_list)
    list_to_text(unique_val, path)

    # Create dummy dataframe
    df_split = split_series.str.join('|').str.get_dummies()
    df_split.columns = column_name + '_' + df_split.columns

    # Join dummies with dataframe
    df_expand = df.join(df_split)

    # Drop original column
    df_expand = df_expand.drop(columns=column_name)

    return df_expand

# Create response variable for classification
Bin `difficulty_rating` feature into three classes.

In [124]:
def bin_feature(df, cut_labels, cut_bins, num_col_name, bin_col_name):
    """
    cut_labels (list)
    cut_bins (list)
    """
    df[bin_col_name] = pd.cut(df[num_col_name], bins=cut_bins, labels=cut_labels)
    df.drop(columns=num_col_name, inplace=True)
    
    return df

# Add features

In [125]:
trail_id = 'trail_id'
tag_features = ['features', 'activities']
non_tag_features = ['length', 'elevation_gain', 'route_type']
response = 'difficulty_rating'

cut_labels = ['easy', 'moderate', 'hard']
cut_bins = [0, 2, 3, 7]

In [187]:
# Select features and one hot encode non-tag categorical variables
df_ohe = one_hot_encode(df, trail_id, tag_features, non_tag_features, response)

# Create dummy variable for each tag
df_expand = expand_column(df_ohe, 'features', '../../data/models/features.txt')
df_expand = expand_column(df_expand, 'activities', '../../data/models/activities.txt')

# Create response variable
#df_cleaned = bin_feature(df_expand, cut_labels, cut_bins, 'difficulty_rating', 'difficulty_class')

In [14]:
df_cleaned['difficulty_class'].value_counts()

moderate    1441
hard         973
easy         880
Name: difficulty_class, dtype: int64

In [29]:
df_cleaned.to_csv('../../data/models/df_featurize.csv', index=False)

# Create test input

In [261]:
data_subset = df.sample(3, random_state=0)
data_subset

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
1572,10037153,Waterton Valley Trail,Glacier National Park,Babb,Montana,United States,"{'lat': 48.95776, 'lng': -113.89222}",3.5294,18.47,1148.7912,7,out and back,2.0,4.5,3,"['dogs-no', 'forest', 'lake', 'river', 'wild-f...","['backpacking', 'camping', 'hiking', 'nature-t...",i
33,10265905,South Kaibab Trail to Ooh Aah Point,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05309, 'lng': -112.08387}",28.8685,1.65,210.9216,3,out and back,3.0,5.0,455,"['dogs-no', 'views', 'wildlife']","['birding', 'hiking', 'nature-trips', 'walking']",i
1475,10038226,Apgar Lookout Trail,Glacier National Park,Columbia Falls,Montana,United States,"{'lat': 48.50434, 'lng': -114.02082}",15.4183,6.49,566.928,5,out and back,3.0,4.0,220,"['dogs-no', 'forest', 'river', 'views', 'wild-...","['birding', 'hiking', 'nature-trips']",i


In [262]:
data_subset.values

array([[10037153, Waterton Valley Trail, Glacier National Park, Babb,
        Montana, United States, {'lat': 48.95776, 'lng': -113.89222},
        3.5294, 18.47, 1148.7912, 7, out and back, 2.0, 4.5, 3,
        ['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife'],
        ['backpacking', 'camping', 'hiking', 'nature-trips'], i],
       [10265905, South Kaibab Trail to Ooh Aah Point,
        Grand Canyon National Park, Grand Canyon, Arizona, United States,
        {'lat': 36.05309, 'lng': -112.08387}, 28.8685, 1.65, 210.9216, 3,
        out and back, 3.0, 5.0, 455, ['dogs-no', 'views', 'wildlife'],
        ['birding', 'hiking', 'nature-trips', 'walking'], i],
       [10038226, Apgar Lookout Trail, Glacier National Park,
        Columbia Falls, Montana, United States,
        {'lat': 48.50434, 'lng': -114.02082}, 15.4183, 6.49, 566.928, 5,
        out and back, 3.0, 4.0, 220,
        ['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife'],
        ['birding', '

In [263]:
data_subset.index

Int64Index([1572, 33, 1475], dtype='int64')

In [264]:
data_subset.columns

Index(['trail_id', 'name', 'area_name', 'city_name', 'state_name',
       'country_name', '_geoloc', 'popularity', 'length', 'elevation_gain',
       'difficulty_rating', 'route_type', 'visitor_usage', 'avg_rating',
       'num_reviews', 'features', 'activities', 'units'],
      dtype='object')

In [265]:
data_subset.index.name

In [266]:
data_subset.columns.name

In [267]:
df_in_values = [[10037153, 'Waterton Valley Trail', 'Glacier National Park',
        'Babb', 'Montana', 'United States',
        "{'lat': 48.95776, 'lng': -113.89222}", 3.5294, 18.47, 1148.7912,
        7, 'out and back', 2.0, 4.5, 3,
        "['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife']",
        "['backpacking', 'camping', 'hiking', 'nature-trips']", 'i'],
       [10265905, 'South Kaibab Trail to Ooh Aah Point',
        'Grand Canyon National Park', 'Grand Canyon', 'Arizona',
        'United States', "{'lat': 36.05309, 'lng': -112.08387}", 28.8685,
        1.65, 210.9216, 3, 'out and back', 3.0, 5.0, 455,
        "['dogs-no', 'views', 'wildlife']",
        "['birding', 'hiking', 'nature-trips', 'walking']", 'i'],
       [10038226, 'Apgar Lookout Trail', 'Glacier National Park',
        'Columbia Falls', 'Montana', 'United States',
        "{'lat': 48.50434, 'lng': -114.02082}", 15.4183, 6.49, 566.928,
        5, 'out and back', 3.0, 4.0, 220,
        "['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife']",
        "['birding', 'hiking', 'nature-trips']", 'i']]

df_in_index = [1572, 33, 1475]

df_in_columns = ['trail_id', 'name', 'area_name', 'city_name', 'state_name',
       'country_name', '_geoloc', 'popularity', 'length', 'elevation_gain',
       'difficulty_rating', 'route_type', 'visitor_usage', 'avg_rating',
       'num_reviews', 'features', 'activities', 'units']

df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)

In [268]:
df_in.equals(data_subset)

True

## Create test for `one_hot_encode`

In [86]:
df_out = one_hot_encode(df_in, 'trail_id', ['features', 'activities'], ['length', 'elevation_gain', 'route_type'], 'difficulty_rating')
df_out

Unnamed: 0,trail_id,length,elevation_gain,features,activities,difficulty_rating
1572,10037153,18.47,1148.7912,"['dogs-no', 'forest', 'lake', 'river', 'wild-f...","['backpacking', 'camping', 'hiking', 'nature-t...",7
33,10265905,1.65,210.9216,"['dogs-no', 'views', 'wildlife']","['birding', 'hiking', 'nature-trips', 'walking']",3
1475,10038226,6.49,566.928,"['dogs-no', 'forest', 'river', 'views', 'wild-...","['birding', 'hiking', 'nature-trips']",5


In [92]:
np.set_printoptions(formatter={'all':lambda x: str(x)})

In [98]:
df_out.values

array([[10037153, 18.47, 1148.7912,
        ['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife'],
        ['backpacking', 'camping', 'hiking', 'nature-trips'], 7],
       [10265905, 1.65, 210.9216, ['dogs-no', 'views', 'wildlife'],
        ['birding', 'hiking', 'nature-trips', 'walking'], 3],
       [10038226, 6.49, 566.928,
        ['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife'],
        ['birding', 'hiking', 'nature-trips'], 5]], dtype=object)

In [99]:
df_out.index

Int64Index([1572, 33, 1475], dtype='int64')

In [100]:
df_out.columns

Index(['trail_id', 'length', 'elevation_gain', 'features', 'activities',
       'difficulty_rating'],
      dtype='object')

In [101]:
df_out.columns.name

In [102]:
df_out.index.name

In [103]:
df_true = pd.DataFrame(
    [[10037153, 18.47, 1148.7912,
        "['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife']",
        "['backpacking', 'camping', 'hiking', 'nature-trips']", 7],
       [10265905, 1.65, 210.9216, "['dogs-no', 'views', 'wildlife']",
        "['birding', 'hiking', 'nature-trips', 'walking']", 3],
       [10038226, 6.49, 566.928,
        "['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife']",
        "['birding', 'hiking', 'nature-trips']", 5]],
index=[1572, 33, 1475],
columns=['trail_id', 'length', 'elevation_gain', 'features', 'activities',
       'difficulty_rating'])

In [104]:
df_true.equals(df_out)

True

In [105]:
def test_one_hot_encode():
    
    # Define input dataframe
    df_in_values = [[10037153, 'Waterton Valley Trail', 'Glacier National Park',
        'Babb', 'Montana', 'United States',
        "{'lat': 48.95776, 'lng': -113.89222}", 3.5294, 18.47, 1148.7912,
        7, 'out and back', 2.0, 4.5, 3,
        "['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife']",
        "['backpacking', 'camping', 'hiking', 'nature-trips']", 'i'],
       [10265905, 'South Kaibab Trail to Ooh Aah Point',
        'Grand Canyon National Park', 'Grand Canyon', 'Arizona',
        'United States', "{'lat': 36.05309, 'lng': -112.08387}", 28.8685,
        1.65, 210.9216, 3, 'out and back', 3.0, 5.0, 455,
        "['dogs-no', 'views', 'wildlife']",
        "['birding', 'hiking', 'nature-trips', 'walking']", 'i'],
       [10038226, 'Apgar Lookout Trail', 'Glacier National Park',
        'Columbia Falls', 'Montana', 'United States',
        "{'lat': 48.50434, 'lng': -114.02082}", 15.4183, 6.49, 566.928,
        5, 'out and back', 3.0, 4.0, 220,
        "['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife']",
        "['birding', 'hiking', 'nature-trips']", 'i']]
    df_in_index = [1572, 33, 1475]

    df_in_columns = ['trail_id', 'name', 'area_name', 'city_name', 'state_name',
           'country_name', '_geoloc', 'popularity', 'length', 'elevation_gain',
           'difficulty_rating', 'route_type', 'visitor_usage', 'avg_rating',
           'num_reviews', 'features', 'activities', 'units']

    df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)

    # Define expected output
    df_true = pd.DataFrame(
    [[10037153, 18.47, 1148.7912,
        "['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife']",
        "['backpacking', 'camping', 'hiking', 'nature-trips']", 7],
       [10265905, 1.65, 210.9216, "['dogs-no', 'views', 'wildlife']",
        "['birding', 'hiking', 'nature-trips', 'walking']", 3],
       [10038226, 6.49, 566.928,
        "['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife']",
        "['birding', 'hiking', 'nature-trips']", 5]],
    index=[1572, 33, 1475],
    columns=['trail_id', 'length', 'elevation_gain', 'features', 'activities',
           'difficulty_rating'])
    
    # Create test output
    df_test = one_hot_encode(df_in, 'trail_id', ['features', 'activities'], ['length', 'elevation_gain', 'route_type'], 'difficulty_rating')
    
    assert df_test.equals(df_true)

In [106]:
test_one_hot_encode()

Unhappy path: feature doesn't exsist

In [107]:
def test_one_hot_encode_key_error():
    # Define input dataframe
    df_in_values = [[10037153, 'Waterton Valley Trail', 'Glacier National Park',
        'Babb', 'Montana', 'United States',
        "{'lat': 48.95776, 'lng': -113.89222}", 3.5294, 18.47, 1148.7912,
        7, 'out and back', 2.0, 4.5, 3,
        "['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife']",
        "['backpacking', 'camping', 'hiking', 'nature-trips']", 'i'],
       [10265905, 'South Kaibab Trail to Ooh Aah Point',
        'Grand Canyon National Park', 'Grand Canyon', 'Arizona',
        'United States', "{'lat': 36.05309, 'lng': -112.08387}", 28.8685,
        1.65, 210.9216, 3, 'out and back', 3.0, 5.0, 455,
        "['dogs-no', 'views', 'wildlife']",
        "['birding', 'hiking', 'nature-trips', 'walking']", 'i'],
       [10038226, 'Apgar Lookout Trail', 'Glacier National Park',
        'Columbia Falls', 'Montana', 'United States',
        "{'lat': 48.50434, 'lng': -114.02082}", 15.4183, 6.49, 566.928,
        5, 'out and back', 3.0, 4.0, 220,
        "['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife']",
        "['birding', 'hiking', 'nature-trips']", 'i']]
    df_in_index = [1572, 33, 1475]
    df_in_columns = ['trail_id', 'name', 'area_name', 'city_name', 'state_name',
           'country_name', '_geoloc', 'popularity', 'length', 'elevation_gain',
           'difficulty_rating', 'route_type', 'visitor_usage', 'avg_rating',
           'num_reviews', 'features', 'activities', 'units']
    df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)
    
    # Define expected output
    with pytest.raises(KeyError):
        one_hot_encode(df_in, 'trail_id', ['features', 'activities'], ['length', 'elevation_gain', 'route_type'], 'error')

In [110]:
test_one_hot_encode_key_error()

Unhappy path: not a dataframe

In [114]:
def test_one_hot_encode_non_df():
    df_in = 'I am not a dataframe'
    
    with pytest.raises(TypeError):
        one_hot_encode(df_in, 'trail_id', ['features', 'activities'], ['length', 'elevation_gain', 'route_type'], 'difficulty_rating')

In [115]:
test_one_hot_encode_non_df()

## Create test for `bin_feature`

In [271]:
data_subset = df_expand.sample(3, random_state=0).astype(str)
data_subset

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,difficulty_rating,features_ada,features_beach,features_cave,features_city_walk,...,activities_rock_climbing,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking
1572,10037153,18.47,1148.7912,1,0,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,10265905,1.65,210.9216,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1475,10038226,6.49,566.928,1,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [272]:
data_subset.values

array([[10037153, 18.47, 1148.7912, 1, 0, 7, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [10265905, 1.65, 210.9216, 1, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [10038226, 6.49, 566.928, 1, 0, 5, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=object)

In [273]:
data_subset.index

Int64Index([1572, 33, 1475], dtype='int64')

In [274]:
data_subset.columns

Index(['trail_id', 'length', 'elevation_gain', 'route_type_out_and_back',
       'route_type_point_to_point', 'difficulty_rating', 'features_ada',
       'features_beach', 'features_cave', 'features_city_walk',
       'features_dogs', 'features_dogs_leash', 'features_dogs_no',
       'features_forest', 'features_historic_site', 'features_hot_springs',
       'features_kids', 'features_lake', 'features_partially_paved',
       'features_paved', 'features_rails_trails', 'features_river',
       'features_strollers', 'features_views', 'features_waterfall',
       'features_wild_flowers', 'features_wildlife', 'activities_backpacking',
       'activities_bike_touring', 'activities_birding', 'activities_camping',
       'activities_canoeing', 'activities_cross_country_skiing',
       'activities_fishing', 'activities_fly_fishing', 'activities_hiking',
       'activities_horseback_riding', 'activities_ice_climbing',
       'activities_mountain_biking', 'activities_nature_trips',
       'activ

In [245]:
data_subset.index.name

In [246]:
data_subset.columns.name

In [275]:
df_in_values = [[10037153, 18.47, 1148.7912, 1, 0, 7, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [10265905, 1.65, 210.9216, 1, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [10038226, 6.49, 566.928, 1, 0, 5, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
df_in_index = [1572, 33, 1475]
df_in_columns = ['trail_id', 'length', 'elevation_gain', 'route_type_out_and_back',
       'route_type_point_to_point', 'difficulty_rating', 'features_ada',
       'features_beach', 'features_cave', 'features_city_walk',
       'features_dogs', 'features_dogs_leash', 'features_dogs_no',
       'features_forest', 'features_historic_site', 'features_hot_springs',
       'features_kids', 'features_lake', 'features_partially_paved',
       'features_paved', 'features_rails_trails', 'features_river',
       'features_strollers', 'features_views', 'features_waterfall',
       'features_wild_flowers', 'features_wildlife', 'activities_backpacking',
       'activities_bike_touring', 'activities_birding', 'activities_camping',
       'activities_canoeing', 'activities_cross_country_skiing',
       'activities_fishing', 'activities_fly_fishing', 'activities_hiking',
       'activities_horseback_riding', 'activities_ice_climbing',
       'activities_mountain_biking', 'activities_nature_trips',
       'activities_off_road_driving', 'activities_paddle_sports',
       'activities_rails_trails', 'activities_road_biking',
       'activities_rock_climbing', 'activities_scenic_driving',
       'activities_sea_kayaking', 'activities_skiing',
       'activities_snowboarding', 'activities_snowshoeing',
       'activities_surfing', 'activities_trail_running', 'activities_walking',
       'activities_whitewater_kayaking']
df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)

In [276]:
df_in

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,difficulty_rating,features_ada,features_beach,features_cave,features_city_walk,...,activities_rock_climbing,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking
1572,10037153,18.47,1148.7912,1,0,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,10265905,1.65,210.9216,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1475,10038226,6.49,566.928,1,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [277]:
data_subset

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,difficulty_rating,features_ada,features_beach,features_cave,features_city_walk,...,activities_rock_climbing,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking
1572,10037153,18.47,1148.7912,1,0,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,10265905,1.65,210.9216,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1475,10038226,6.49,566.928,1,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [278]:
df_in.equals(data_subset)

False

In [279]:
df_out = bin_feature(df_in, ['easy', 'moderate', 'hard'], [0, 2, 3, 7], 'difficulty_rating', 'difficulty_class')
df_out

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,features_ada,features_beach,features_cave,features_city_walk,features_dogs,...,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking,difficulty_class
1572,10037153,18.47,1148.7912,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,hard
33,10265905,1.65,210.9216,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,moderate
1475,10038226,6.49,566.928,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,hard


In [280]:
df_out.values

array([[10037153, 18.47, 1148.7912, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, hard],
       [10265905, 1.65, 210.9216, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, moderate],
       [10038226, 6.49, 566.928, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, hard]], dtype=object)

In [281]:
df_out.index

Int64Index([1572, 33, 1475], dtype='int64')

In [282]:
df_out.columns

Index(['trail_id', 'length', 'elevation_gain', 'route_type_out_and_back',
       'route_type_point_to_point', 'features_ada', 'features_beach',
       'features_cave', 'features_city_walk', 'features_dogs',
       'features_dogs_leash', 'features_dogs_no', 'features_forest',
       'features_historic_site', 'features_hot_springs', 'features_kids',
       'features_lake', 'features_partially_paved', 'features_paved',
       'features_rails_trails', 'features_river', 'features_strollers',
       'features_views', 'features_waterfall', 'features_wild_flowers',
       'features_wildlife', 'activities_backpacking',
       'activities_bike_touring', 'activities_birding', 'activities_camping',
       'activities_canoeing', 'activities_cross_country_skiing',
       'activities_fishing', 'activities_fly_fishing', 'activities_hiking',
       'activities_horseback_riding', 'activities_ice_climbing',
       'activities_mountain_biking', 'activities_nature_trips',
       'activities_off_road_drivin

In [283]:
df_out.columns.name

In [284]:
df_out.index.name

In [288]:
df_true = pd.DataFrame(
    [[10037153, 18.47, 1148.7912, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, hard],
       [10265905, 1.65, 210.9216, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, moderate],
       [10038226, 6.49, 566.928, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, hard]],
index=[1572, 33, 1475],
columns=['trail_id', 'length', 'elevation_gain', 'route_type_out_and_back',
       'route_type_point_to_point', 'features_ada', 'features_beach',
       'features_cave', 'features_city_walk', 'features_dogs',
       'features_dogs_leash', 'features_dogs_no', 'features_forest',
       'features_historic_site', 'features_hot_springs', 'features_kids',
       'features_lake', 'features_partially_paved', 'features_paved',
       'features_rails_trails', 'features_river', 'features_strollers',
       'features_views', 'features_waterfall', 'features_wild_flowers',
       'features_wildlife', 'activities_backpacking',
       'activities_bike_touring', 'activities_birding', 'activities_camping',
       'activities_canoeing', 'activities_cross_country_skiing',
       'activities_fishing', 'activities_fly_fishing', 'activities_hiking',
       'activities_horseback_riding', 'activities_ice_climbing',
       'activities_mountain_biking', 'activities_nature_trips',
       'activities_off_road_driving', 'activities_paddle_sports',
       'activities_rails_trails', 'activities_road_biking',
       'activities_rock_climbing', 'activities_scenic_driving',
       'activities_sea_kayaking', 'activities_skiing',
       'activities_snowboarding', 'activities_snowshoeing',
       'activities_surfing', 'activities_trail_running', 'activities_walking',
       'activities_whitewater_kayaking', 'difficulty_class'])

In [289]:
df_true.equals(df_out)

False

## Create test for `expand_column`

In [150]:
data_subset = df_ohe.sample(3, random_state=0)
data_subset

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,features,activities,difficulty_rating
1572,10037153,18.47,1148.7912,1,0,"['dogs-no', 'forest', 'lake', 'river', 'wild-f...","['backpacking', 'camping', 'hiking', 'nature-t...",7
33,10265905,1.65,210.9216,1,0,"['dogs-no', 'views', 'wildlife']","['birding', 'hiking', 'nature-trips', 'walking']",3
1475,10038226,6.49,566.928,1,0,"['dogs-no', 'forest', 'river', 'views', 'wild-...","['birding', 'hiking', 'nature-trips']",5


In [151]:
data_subset.values

array([[10037153, 18.47, 1148.7912, 1, 0,
        ['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife'],
        ['backpacking', 'camping', 'hiking', 'nature-trips'], 7],
       [10265905, 1.65, 210.9216, 1, 0, ['dogs-no', 'views', 'wildlife'],
        ['birding', 'hiking', 'nature-trips', 'walking'], 3],
       [10038226, 6.49, 566.928, 1, 0,
        ['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife'],
        ['birding', 'hiking', 'nature-trips'], 5]], dtype=object)

In [152]:
data_subset.index

Int64Index([1572, 33, 1475], dtype='int64')

In [153]:
data_subset.columns

Index(['trail_id', 'length', 'elevation_gain', 'route_type_out_and_back',
       'route_type_point_to_point', 'features', 'activities',
       'difficulty_rating'],
      dtype='object')

In [154]:
data_subset.index.name

In [155]:
data_subset.columns.name

In [156]:
df_in_values = [[10037153, 18.47, 1148.7912, 1, 0,
        "['dogs-no', 'forest', 'lake', 'river', 'wild-flowers', 'wildlife']",
        "['backpacking', 'camping', 'hiking', 'nature-trips']", 7],
       [10265905, 1.65, 210.9216, 1, 0, "['dogs-no', 'views', 'wildlife']",
        "['birding', 'hiking', 'nature-trips', 'walking']", 3],
       [10038226, 6.49, 566.928, 1, 0,
        "['dogs-no', 'forest', 'river', 'views', 'wild-flowers', 'wildlife']",
        "['birding', 'hiking', 'nature-trips']", 5]]

df_in_index = [1572, 33, 1475]

df_in_columns = ['trail_id', 'length', 'elevation_gain', 'route_type_out_and_back',
       'route_type_point_to_point', 'features', 'activities',
       'difficulty_rating']

df_in = pd.DataFrame(df_in_values, index=df_in_index, columns=df_in_columns)

In [157]:
df_in == data_subset

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,features,activities,difficulty_rating
1572,True,True,True,True,True,True,True,True
33,True,True,True,True,True,True,True,True
1475,True,True,True,True,True,True,True,True


In [158]:
df_in.equals(data_subset)

False

In [163]:
df_out = expand_column(df_in, 'features', '../../data/models/features.txt')
df_out

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,activities,difficulty_rating,features_dogs_no,features_forest,features_lake,features_river,features_views,features_wild_flowers,features_wildlife
1572,10037153,18.47,1148.7912,1,0,"['backpacking', 'camping', 'hiking', 'nature-t...",7,,,,,,,
33,10265905,1.65,210.9216,1,0,"['birding', 'hiking', 'nature-trips', 'walking']",3,,,,,,,
1475,10038226,6.49,566.928,1,0,"['birding', 'hiking', 'nature-trips']",5,,,,,,,


In [162]:
df_out.values

array([[10037153, 18.47, 1148.7912, 1, 0,
        ['backpacking', 'camping', 'hiking', 'nature-trips'], 7, nan,
        nan, nan, nan, nan, nan, nan],
       [10265905, 1.65, 210.9216, 1, 0,
        ['birding', 'hiking', 'nature-trips', 'walking'], 3, nan, nan,
        nan, nan, nan, nan, nan],
       [10038226, 6.49, 566.928, 1, 0,
        ['birding', 'hiking', 'nature-trips'], 5, nan, nan, nan, nan,
        nan, nan, nan]], dtype=object)

In [159]:
expand_column(df_ohe, 'features', '../../data/models/features.txt')

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,activities,difficulty_rating,features_ada,features_beach,features_cave,...,features_lake,features_partially_paved,features_paved,features_rails_trails,features_river,features_strollers,features_views,features_waterfall,features_wild_flowers,features_wildlife
0,10020048,8.87,1161.8976,1,0,"['birding', 'camping', 'hiking', 'nature-trips...",5,0,0,0,...,0,0,0,0,1,0,1,1,1,1
1,10236086,3.93,507.7968,1,0,"['birding', 'camping', 'hiking', 'nature-trips...",3,0,0,0,...,0,0,0,0,0,0,1,0,1,1
2,10267857,1.65,81.9912,1,0,"['hiking', 'walking']",1,0,0,0,...,0,1,0,0,0,0,1,0,0,1
3,10236076,1.92,119.7864,0,0,"['birding', 'hiking', 'nature-trips', 'trail-r...",1,0,0,0,...,1,0,0,0,0,0,1,0,1,1
4,10236082,16.92,1124.7120,1,0,"['birding', 'fishing', 'hiking', 'nature-trips...",5,0,0,0,...,1,0,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3289,10236006,0.46,32.9184,1,0,"['hiking', 'walking']",1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3290,10008302,11.43,1105.8144,0,0,"['birding', 'hiking', 'nature-trips']",5,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3291,10236001,16.09,1171.9560,1,0,"['backpacking', 'camping', 'hiking']",5,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3292,10258707,0.18,3.9624,1,0,"['hiking', 'walking']",1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
