In [379]:
import pandas as pd
from sklearn_pandas import DataFrameMapper

In [380]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [467]:
df = pd.read_csv('./listings_summary.csv.zip')

In [468]:
target = 'price'

### Clean Target

In [469]:
from coerce_target import convert_to_num, remove_zeros_in_target, drop_other_price_cols

In [470]:
price = convert_to_num(df[target])

In [471]:
df_coerced_price = df.assign(price = price)

In [472]:
pruned_df = remove_zeros_in_target(df_coerced_price, target)

In [473]:
log_price = np.log(pruned_df['price'])

In [474]:
df_log_price = pruned_df.assign(log_price = log_price)

In [475]:
clipped_df = df_log_price[df_log_price['log_price'] < 6]

In [481]:
# pruned_df.shape

In [482]:
# clipped_df.shape

In [483]:
removed_price_cols_df = drop_other_price_cols(clipped_df).drop(columns = ['price', 'log_price'])

In [484]:
removed_price_cols_df.shape

(22441, 93)

In [489]:
y = clipped_df['price']


log_y = clipped_df['log_price']
log_y.shape, y.shape

((22441,), (22441,))

### Remove All Same

In [490]:
from remove_all_same import find_all_same, drop_same_cols, same_cols

In [491]:
all_same_cols = find_all_same(removed_price_cols_df)

In [492]:
df_removed_all_same = drop_same_cols(all_same_cols, removed_price_cols_df)

### Coerce Num Cols

In [493]:
from convert_nums import from_price_and_percent_steps, find_almost_num_cols

In [494]:
almost_num_cols = find_almost_num_cols(df_removed_all_same)

In [495]:
percent_cols = almost_num_cols[:1]
price_cols = almost_num_cols[1:]
price_percent_steps = from_price_and_percent_steps(price_cols, percent_cols)

In [496]:
to_number_mapper = DataFrameMapper(price_percent_steps, df_out = True)

In [497]:
converted_nums_df = to_number_mapper.fit_transform(df_removed_all_same)

In [498]:
df_removed_all_same[converted_nums_df.columns] = converted_nums_df

### Convert Booleans

In [499]:
from convert_bools import boolean_mapper, top_val_bool_mapper, cols_to_remove

In [500]:
coerced_num_df = df_removed_all_same.copy()
true_bool_df = boolean_mapper.fit_transform(coerced_num_df)

In [501]:
converted_bools_df = df_removed_all_same.copy()
converted_bools_df[true_bool_df.columns] = true_bool_df

In [502]:
almost_bools_coerced = top_val_bool_mapper.fit_transform(converted_bools_df)

In [503]:
almost_bools_df = converted_bools_df.copy()
almost_bools_df[almost_bools_coerced.columns] = almost_bools_coerced

In [504]:
all_bools_coerced_df = almost_bools_df.drop(columns = cols_to_remove)

### Coerce Categories

In [505]:
from convert_cats import cat_cols, reduce_cat_df, one_hot_steps

In [506]:
reduced_cat_df = reduce_cat_df(all_bools_coerced_df, cat_cols)

In [507]:
one_hot_mapper = DataFrameMapper(one_hot_steps, df_out = True)

In [508]:
cat_one_hot_df = one_hot_mapper.fit_transform(all_bools_coerced_df)

In [509]:
cat_df = all_bools_coerced_df.drop(columns = cat_cols)
cat_df[cat_one_hot_df.columns] = cat_one_hot_df

### Coerce DateTimes

In [510]:
from coerce_date_lib import to_date_cols, add_datepart, date_cols

In [511]:
date_cols_df = to_date_cols(cat_df, date_cols)

In [512]:
[add_datepart(date_cols_df, col) for col in date_cols_df.columns]

[None, None, None, None, None]

In [513]:
date_cols_df[:2]

Unnamed: 0,last_scrapedYear,last_scrapedMonth,last_scrapedWeek,last_scrapedDay,last_scrapedDayofweek,last_scrapedDayofyear,last_scrapedIs_month_end,last_scrapedIs_month_start,last_scrapedIs_quarter_end,last_scrapedIs_quarter_start,...,last_reviewDay,last_reviewDayofweek,last_reviewDayofyear,last_reviewIs_month_end,last_reviewIs_month_start,last_reviewIs_quarter_end,last_reviewIs_quarter_start,last_reviewIs_year_end,last_reviewIs_year_start,last_reviewElapsed
0,2018,11,45,7,2,311,False,False,False,False,...,28.0,6.0,301.0,False,False,False,False,False,False,1540684800
1,2018,11,45,7,2,311,False,False,False,False,...,1.0,0.0,274.0,False,True,False,True,False,False,1538352000


In [514]:

replaced_dt_df = cat_df.drop(columns = ['last_scraped', 'host_since', 
'calendar_last_scraped', 'first_review','last_review'])

In [515]:
replaced_dt_df[date_cols_df.columns] = date_cols_df

### Replace Null Values

In [516]:
from handle_na_vals import build_null_mapper, has_na_cols

In [517]:
cols_with_na = has_na_cols(replaced_dt_df)

is_null_mapper = build_null_mapper(replaced_dt_df, cols_with_na)

In [518]:
dt_transformed_df = is_null_mapper.fit_transform(replaced_dt_df)


In [519]:
dt_transformed_df.isna().any(axis = 0).any()

False

In [520]:
df_with_is_na = replaced_dt_df.drop(columns = cols_with_na)

In [521]:
replaced_dt_df[dt_transformed_df.columns] = dt_transformed_df

In [522]:
dt_transformed_df.isna().any(axis = 0).any()

False

### Split and Train

In [523]:
log_y = np.log(y)

In [546]:
from sklearn.model_selection import train_test_split

In [547]:
X_train, X_test, y_train, y_test = train_test_split(dt_transformed_df, y, random_state = 1, test_size = .4)

In [548]:
X_validate, X_test, y_validate, y_test = train_test_split(X_test, y_test, random_state = 1, test_size = .5)

In [549]:
X_train.shape, X_test.shape, X_validate.shape

((13464, 70), (4489, 70), (4488, 70))

In [574]:
from sklearn.linear_model import LinearRegression

In [575]:
model = LinearRegression()

In [576]:
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [577]:
model.score(X_validate, y_validate)

0.34788054943396907

### Permutations

In [533]:
from eli5.sklearn import PermutationImportance
import eli5
import numpy as np

perm = PermutationImportance(model).fit(X_validate, np.log(y_validate))

exp_df = eli5.explain_weights_df(perm, feature_names = list(X_train.columns))

In [599]:
top_cols = exp_df[:75]

In [600]:
top_feats = top_cols['feature'].values

In [601]:
model = LinearRegression()
model.fit(X_train[top_feats], y_train)
model.score(X_validate[top_feats], y_validate)

0.34788054943397073

### Cols to Examine

* amenities and license