### BOSTON HOUSE PRICES - IRONHACK COMPETITION

In [1]:
import pandas as pd
import numpy as np
import timeit
import pickle
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

#### NOTES:
1. Call your transformer file 'tranformer.pkl', your encoder file 'encoder.pkl' and your model 'model.pkl'.
2. This is just the skeleton, you'll need to adjust it accordingly. Watch out! Check if the order of transformed data and encoded data is correct, you don't want to concatenate stuff out of order.
3. Test your function on a couple of rows to make sure everything runs fine and that you have the same predictions when you test it in your original notebook.
4. Deliver a folder with your name and, inside the folder, this notebook with your changes and pickle files with model, transformer (if you have it) and encoder.

#### Your code Here:

In [2]:
def data_cleaner(data):
    """
    This function will take in a pandas dataframe,
    apply cleaning and return the cleaned data.
    """
    # your cleaning steps here #
    data.columns = [column.replace(' ', '') for column in data.columns]
    data.drop(columns=['id', 'index'], inplace=True)
    data['data']=data['data'].astype('datetime64')
    data['waterfront'] = data['waterfront'].astype('object')
    data['view'] = data['view'].astype('object')
    data['zip_code'] = data['zip_code'].astype('object')
    data.drop(columns=['lon','lat', 'data', 'condition'], inplace=True)
    # Condition is dropped because it's redundant with 'grade'
    print(data.columns)

    return data

#### Thre rest is pretty much done, but might need some twiking from your side:

In [3]:
def data_transformer(data):
    """
    This function will take in cleaned data, already with
    the correct types and apply the transformations.
    """
    # if you did not scale your y, it's likely that the split can be done here
    X = data.drop('price', axis=1) # assuming you named your target variable as 'price'
    y = data['price']
    
    numericals = X.select_dtypes(np.number).to_numpy()
    categoricals = X.select_dtypes(np.object)
    
    num_col_names = X.select_dtypes(np.number).columns
    cat_col_names = X.select_dtypes(np.object).columns
    
    try:
        with open('transformer.pkl', 'rb') as file: 
            transformer = pickle.load(file)
            
        numericals = transformer.transform(numericals)
    
    except:
        print("Transformation didn't work or transformer was not found!")

    try:
        with open('encoder.pkl', 'rb') as file: 
            encoder = pickle.load(file)
        
        categoricals = encoder.transform(categoricals)
    
    except:
        print("Encoding didn't work or encoder was not found!")
        
    # make sure the concatenation is in the same order as the one you used to train your model
    X_transformed = np.concatenate([numericals, categoricals], axis=1)
    
    X_transformed = pd.DataFrame(X_transformed)
    X_transformed.columns = num_col_names.append(cat_col_names)
    
    ###
    # if you have scaled y, you will do the X-y split here:
    ###
    return X_transformed, y


def validation(X, y):
    
    with open('model.pkl', 'rb') as file: 
        model = pickle.load(file) 
        
    try:
        y_pred = model.predict(X)
    except:
        y_pred = model.predict(X.to_numpy())
    
    try:
        mae = round(mean_absolute_error(y, y_pred), 2)
        rmse = round(mean_squared_error(y, y_pred, squared=False), 2)
        r2 = round(r2_score(y, y_pred), 2)
    except:
        mae = round(mean_absolute_error(y.to_array(), y_pred), 2)
        rmse = round(mean_squared_error(y.to_array(), y_pred, squared=False), 2)
        r2 = round(r2_score(y.to_array(), y_pred), 2)
    
    print('--------------')
    print('MAE:', mae)
    print('RMSE:', rmse)
    print('R2 Score:', r2)
    
    return y_pred


def get_predictions(data):
    cleaned_data = data_cleaner(data)
    transformed_data, y = data_transformer(cleaned_data)
    predictions = validation(transformed_data, y)
    return predictions

### Evaluation & Predictions

In [4]:
validation_data = pd.read_csv('train_boston.csv') 

In [5]:
predictions = get_predictions(validation_data)

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'grade', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'zip_code', 'sqft_living15', 'sqft_lot15',
       'price'],
      dtype='object')
Transformation didn't work or transformer was not found!


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 0 dimension(s)