In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the cleaned trail dataset and preview its first three rows
cleaned_csv_path = '../../data/models/df_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
df.head(3)

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units,lat,lng
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,60.18852,-149.63156
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i,63.73049,-148.91968
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,60.18879,-149.631


# Select features and one-hot encode the categorical non-tag features

In [22]:
def one_hot_encode(df, trail_id, tag_features, non_tag_features, response):
    """Build the modelling frame with categorical non-tag features dummy-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing every column named by the other arguments.
    trail_id : label or list of labels
        Identifier column(s); placed first in the result.
    tag_features : list of labels
        Tag columns, copied through unchanged.
    non_tag_features : list of labels
        Candidate predictors. Numeric columns are kept as they are,
        object-dtype columns are dummy-encoded with the first level dropped,
        and columns of any other dtype are left out of the result.
    response : label or list of labels
        Target column(s); placed last in the result.

    Returns
    -------
    pandas.DataFrame
        Columns in the order: id, numeric features, dummy columns (spaces in
        their names replaced by underscores), tag columns, response.
    """
    selected = df[non_tag_features]

    # Partition the selected predictors by dtype
    numeric_cols = selected.select_dtypes(include=np.number).columns
    categorical_cols = selected.select_dtypes(include='object').columns

    # Dummy-encode the categoricals and make the generated names space-free
    dummies = pd.get_dummies(selected[categorical_cols], drop_first=True)
    dummies = dummies.rename(columns=lambda label: label.replace(' ', '_'))

    # Stitch the pieces together side by side
    pieces = [
        df[trail_id],
        selected[numeric_cols],
        dummies,
        df[tag_features],
        df[response],
    ]
    return pd.concat(pieces, axis=1)

# One hot encode tags and save unique tags
One-hot encode the `features` and `activities` tag columns, and save each column's unique tags to a text file.

In [23]:
def list_to_text(ls, path):
    """Write the elements of ``ls`` to a text file, one element per line.

    Parameters
    ----------
    ls : iterable of str
        Values to write; each is followed by a newline character.
    path : str or path-like
        Destination file. It is opened in write mode, so any existing
        content is replaced.

    Raises
    ------
    TypeError
        If an element cannot be concatenated with a string.
    """
    # The context manager closes the handle even when a write fails part-way
    with open(path, "w") as textfile:
        for element in ls:
            textfile.write(element + "\n")

In [24]:
def expand_column(df, column_name, path):
    """Replace a stringified tag-list column with one indicator column per tag.

    Each cell of ``df[column_name]`` is expected to be a string that looks like
    a Python list of quoted tags, e.g. ``"['dogs-no', 'views']"``. Brackets and
    single quotes are removed, hyphens become underscores, and the text is
    split on ``', '``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tag column. It is not modified. Its index is used to
        align the indicator columns, so it should not contain duplicates.
    column_name : str
        Name of the tag column to expand; also used as the prefix
        (``<column_name>_<tag>``) of the generated columns.
    path : str
        File to which the sorted unique tags are written, one per line.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column_name`` and with the indicator columns appended.
    """
    # Clean every cell and split it into a list of tags. The series is built on
    # df's own index so that the index-based join below lines rows up correctly
    # even when df does not carry a default RangeIndex.
    tag_lists = pd.Series(
        [
            cell.strip('[]').replace("\'", "").replace("-", "_").split(', ')
            for cell in df[column_name]
        ],
        index=df.index,
        dtype=object,
    )

    # Save the column's unique values (an empty cell yields '' which is skipped)
    all_tags = [tag for tags in tag_lists for tag in tags if tag != '']
    unique_val = np.unique(all_tags)
    list_to_text(unique_val, path)

    # Create the indicator frame; '|' is the separator str.get_dummies splits on
    df_split = tag_lists.str.join('|').str.get_dummies()
    df_split.columns = column_name + '_' + df_split.columns

    # Join the indicators onto the frame and drop the original column
    df_expand = df.join(df_split)
    df_expand = df_expand.drop(columns=column_name)

    return df_expand

# Create response variable for classification
Bin the `difficulty_rating` feature into three classes: `easy`, `moderate`, and `hard`.

In [25]:
def bin_feature(df, cut_labels, cut_bins, num_col_name, bin_col_name):
    """Bin a numeric column into labelled classes and drop the numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``num_col_name``. It is left unmodified; the work is
        done on a copy.
    cut_labels : list
        Label for each bin; passed to ``pandas.cut`` as ``labels``.
    cut_bins : list
        Bin edges; passed to ``pandas.cut`` as ``bins``.
    num_col_name : str
        Numeric column to bin. It is removed from the result.
    bin_col_name : str
        Name of the new column holding the bin labels.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``bin_col_name`` added and ``num_col_name`` dropped.
    """
    # Work on a copy so the caller's frame keeps its original columns
    binned_df = df.copy()
    binned_df[bin_col_name] = pd.cut(binned_df[num_col_name], bins=cut_bins, labels=cut_labels)

    return binned_df.drop(columns=num_col_name)

# Add features

In [26]:
# Column roles passed to the featurization helpers
trail_id = 'trail_id'
response = 'difficulty_rating'
non_tag_features = ['length', 'elevation_gain', 'route_type']
tag_features = ['features', 'activities']

# Bin edges for the response and the class label assigned to each bin
cut_bins = [0, 2, 3, 7]
cut_labels = ['easy', 'moderate', 'hard']

In [27]:
# Keep the id, predictors, tags and response; dummy-encode the categorical predictors
df_ohe = one_hot_encode(df, trail_id, tag_features, non_tag_features, response)

# Expand each tag column into indicator columns, saving its unique tags alongside
tag_outputs = [
    ('features', '../../data/models/features.txt'),
    ('activities', '../../data/models/activities.txt'),
]
df_expand = df_ohe
for tag_column, tags_path in tag_outputs:
    df_expand = expand_column(df_expand, tag_column, tags_path)

# Bin the numeric difficulty rating into the response classes
df_cleaned = bin_feature(df_expand, cut_labels, cut_bins, 'difficulty_rating', 'difficulty_class')

In [28]:
# Show how many trails fall into each difficulty class
df_cleaned['difficulty_class'].value_counts()

moderate    1453
hard         975
easy         885
Name: difficulty_class, dtype: int64

In [29]:
# Save the featurized frame; index=False keeps the row index out of the file
featurized_csv_path = '../../data/models/df_featurize.csv'
df_cleaned.to_csv(featurized_csv_path, index=False)