In [30]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Let rendered DataFrames show up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100

# NOTE(review): the line below looks like a Procfile-style entry for serving the
# app with gunicorn + uvicorn workers; it is not used by this notebook — confirm.
# web: gunicorn -w 4 -k uvicorn.workers.UvicornWorker App.app:app

In [31]:
# Location of the raw house-sales CSV. The path used to be hardcoded to one
# developer's home directory; set KC_HOUSE_DATA_CSV to point at the file on
# another machine. When the variable is unset the original path is used, so
# existing runs behave exactly as before.
data_path = os.environ.get(
    'KC_HOUSE_DATA_CSV',
    '/home/maksonvinicio/Documents/GitHub/Price-houses/Data/kc_house_data.csv',
)

# Raw, unmodified table; later cells derive new frames from it.
data = pd.read_csv(data_path)

In [32]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [33]:
# Columns left out of the modelling table: the record id, the sale date and
# the location fields.
columns = [
    'id',
    'date',
    'zipcode',
    'lat',
    'long',
]

# `columns=` already selects the column axis, so no separate `axis` is needed.
# A new frame is returned; `data` itself is not modified.
data_clean = data.drop(columns=columns)

In [34]:
# Number of missing values in each remaining column.
(
    data_clean
    .isna()
    .sum()
)

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [35]:
# Target is the `price` column; every other remaining column is a feature.
y = data_clean['price']
X = data_clean.drop(columns=['price'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [18]:
# Single-step pipeline wrapping a LightGBM regressor (100 trees, fixed seed).
lgbm_regressor = LGBMRegressor(n_estimators=100, random_state=42)
pipe = Pipeline(steps=[('lgbm', lgbm_regressor)])

In [20]:
# Train on the training split, then show the score on the validation split.
# `fit` returns the fitted pipeline itself, so the two calls can be chained.
pipe.fit(X_train, y_train).score(X_val, y_val)

0.7334086183664953

In [23]:
# Predict on the validation split and report three error measures.
y_pred = pipe.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
# RMSE is the square root of the mean squared error.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
r2 = r2_score(y_val, y_pred)

for label, value in (("MAE: ", mae), ('RMSE: ', rmse), ('R2: ', r2)):
    print(label, value)

MAE:  121055.90791039284
RMSE:  196180.71546365684
R2:  0.7334086183664953


In [25]:
# Destination for the serialized pipeline. The path used to be hardcoded to one
# developer's home directory; set PRICE_HOUSES_MODEL_PATH to write elsewhere.
# When the variable is unset the original location is used.
model_path = os.environ.get(
    'PRICE_HOUSES_MODEL_PATH',
    '/home/maksonvinicio/Documents/GitHub/Price-houses/Models/pipe.pkl',
)

# The context manager flushes and closes the file even if pickling raises.
# The previous bare open() never closed its handle explicitly.
with open(model_path, 'wb') as model_file:
    pickle.dump(pipe, model_file)