In [0]:
import pandas as pd
import sklearn
import joblib
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Load the dataset (published Google Sheet, CSV export) to establish some baselines.
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vR6LzWx6lS1FOS0Fl2QFF1CJeNrhsH0MxLmqEp8vbYj0Z-zDRm5Xwu4PQkP9WbCSokITS4UwcF9hCQ3/pub?output=csv')

# Tidy the ugly column names into lowercase, underscore-separated identifiers.
# regex=False forces literal matching: '.', '(' and ')' are regex
# metacharacters, and the default of `regex` differs between pandas versions.
# The '__' -> '_' collapse is deliberately a single pass; later cells rely on
# the exact names this produces (e.g. 'domestic_beer_0_5_liter_bottle_').
df.columns = (
    df.columns
    .str.replace(' - ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace(',', '_', regex=False)
    .str.replace('.', '_', regex=False)
    .str.replace('(', '_', regex=False)
    .str.replace(')', '_', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('__', '_', regex=False)
    .str.lower()
)

# Train / validation / test split: 80% / 10% / 10%.
# Shuffle once with a fixed seed, then cut by position. Plain iloc slicing is
# used instead of np.split(DataFrame, ...), which relies on the deprecated
# DataFrame.swapaxes in recent pandas releases.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

def clean(X):
    """Return a cleaned copy of one data split.

    Steps, in order:
      1. Drop 'cost_of_living_plus_rent', which would leak the target.
      2. In 'domestic_beer_0_5_liter_bottle_', treat 0 as missing and record
         that in a boolean '<column>_MISSING' indicator column.
      3. Fill remaining NaNs in numeric columns with that column's mean,
         computed on the frame passed in.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'cost_of_living_plus_rent' and
        'domestic_beer_0_5_liter_bottle_'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    KeyError
        If either required column is absent.
    """
    # Work on a copy: keeps the caller's frame intact and prevents
    # SettingWithCopyWarning when X is a slice of a larger frame.
    X = X.copy()

    # drop the column that will leak data into the model
    X = X.drop(columns='cost_of_living_plus_rent')

    cols_with_zeros = ['domestic_beer_0_5_liter_bottle_']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        X[col+'_MISSING'] = X[col].isnull()

    # Fill NaNs with the column means. numeric_only=True restricts the means to
    # numeric columns so text columns no longer make mean() raise on current
    # pandas; columns without a mean are simply left as they are.
    # NOTE(review): the means come from the split being cleaned, not from the
    # training data -- confirm that is acceptable for val/test.
    X = X.fillna(X.mean(numeric_only=True))

    return X

# Run every split through the same cleaning routine.
train, val, test = [clean(split) for split in (train, val, test)]

# The cost-of-living index is the target.
target = 'cost_of_living_index'

# Hand-picked predictors.
# (An automatic selection -- all numeric columns plus non-numeric columns with
# cardinality <= 60 -- was sketched here earlier but is not used.)
features = [
    'rent_index',
    'restaurant_price_index',
    'groceries_index',
    'apartment_city_center_1bed',
    'apartment_utilities',
    'internet_monthly',
]

# Arrange each split into an X feature matrix and a y target vector.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [2]:
# Two-step model: standardise the features, then fit a linear regression.
pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Fit on the training split only; left as the cell's last expression so the
# fitted pipeline is displayed.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [3]:
from joblib import dump

# Persist the fitted pipeline (compressed) to 'pipeline.joblib' in the working directory.
dump(value=pipeline, filename='pipeline.joblib', compress=True)

['pipeline.joblib']

In [4]:
# Print the library versions in requirements.txt format. The distribution name
# is spelled out because it can differ from the import name ('scikit-learn'
# vs 'sklearn').
for package, module in (
    ('pandas', pd),
    ('scikit-learn', sklearn),
    ('joblib', joblib),
    ('numpy', np),
):
    print(f'{package}=={module.__version__}')



pandas==0.25.3
scikit-learn==0.22.1
joblib==0.14.1
numpy==1.17.5


In [0]:
def predict(rent_index, restaurant_price_index, groceries_index, apartment_city_center_1bed, apartment_utilities, internet_monthly):
    """Predict the cost-of-living value for a single observation.

    The six arguments are placed into a one-row DataFrame whose columns are
    named after the parameters, in the same order, and passed to the
    module-level ``pipeline``.

    Returns
    -------
    str
        'Cost of Living <prediction>' with the prediction formatted to four
        decimal places.
    """
    names = [
        'rent_index',
        'restaurant_price_index',
        'groceries_index',
        'apartment_city_center_1bed',
        'apartment_utilities',
        'internet_monthly',
    ]
    values = [
        rent_index,
        restaurant_price_index,
        groceries_index,
        apartment_city_center_1bed,
        apartment_utilities,
        internet_monthly,
    ]

    # One row, columns in the order listed above.
    row = pd.DataFrame(data=[values], columns=names)

    # pipeline.predict returns one prediction per row; take the only one.
    estimate = pipeline.predict(row)[0]
    return f'Cost of Living {estimate:.4f}'


In [7]:
# Example prediction; keyword arguments make each input value self-describing.
predict(
    rent_index=10,
    restaurant_price_index=15,
    groceries_index=25,
    apartment_city_center_1bed=450,
    apartment_utilities=95,
    internet_monthly=100,
)

'Cost of Living 28.6015'