In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.compose import ColumnTransformer
from sklearn import set_config

In [2]:
df = pd.read_pickle('final_df')
df.head()

Unnamed: 0,longitude,Postal_Code,Unit_Type,Address,Rent,latitude,Home_Type,Bedroom_Num,Bathroom_Num,Area_SqFt,...,public_housing,subsidized_properties,building_age,racial_diversity_index,asian,black,hispanic,white,other_races,absolute_diff
0,-73.905143,11378.0,Apartment for rent,"6155 60th Pl, Maspeth, NY 11378",2650.0,40.719532,APARTMENT,3.0,1.0,1375.0,...,0.0,13.0,80.223531,0.69,0.338447,0.016259,0.33789,0.283209,0.024194,106.93
1,-73.7671,11361.0,House for rent,"217th St, Bayside, NY 11361",1250.0,40.76907,HOUSE- SINGLE FAMILY,0.0,1.0,550.0,...,0.0,0.0,72.417602,0.61,0.362974,0.017587,0.109906,0.477049,0.032484,712.32
2,-73.996217,10001.0,Apartment for rent,"247 W 26th St APT 5B, New York, NY 10001",2500.0,40.747067,HOUSE- MULTI FAMILY,1.0,1.0,500.0,...,2353.0,66.14,92.95986,0.56,0.135238,0.055244,0.144715,0.631512,0.033291,1045.57
3,-73.997605,10001.0,Apartment for rent,"358 W 30th St APT 2B, New York, NY 10001",2350.0,40.750397,HOUSE- MULTI FAMILY,0.0,1.0,,...,2353.0,66.14,92.95986,0.56,0.135238,0.055244,0.144715,0.631512,0.033291,1195.57
4,-74.001715,10001.0,Apartment for rent,"420 W 25th St APT 7K, New York, NY 10001",5900.0,40.748087,HOUSE- MULTI FAMILY,1.0,1.0,893.0,...,2353.0,66.14,92.95986,0.56,0.135238,0.055244,0.144715,0.631512,0.033291,2354.43


In [3]:
# REMOVING NON NUMERICAL NUMBERS

X = df.drop(columns = ['Address', 'Rent'])
y = df.Rent
X.shape, y.shape

((5011, 45), (5011,))

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5011 entries, 0 to 5146
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   longitude                          5011 non-null   float64
 1   Postal_Code                        5011 non-null   float64
 2   Unit_Type                          5011 non-null   object 
 3   latitude                           5011 non-null   float64
 4   Home_Type                          5011 non-null   object 
 5   Bedroom_Num                        5011 non-null   float64
 6   Bathroom_Num                       4946 non-null   float64
 7   Area_SqFt                          2544 non-null   float64
 8   Neighborhood                       5011 non-null   object 
 9   boro_names                         5011 non-null   object 
 10  median_rent                        5011 non-null   float64
 11  born_in_new_york_state             5011 non-null   float

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=47)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3507, 45), (1504, 45), (3507,), (1504,))

In [7]:
set_config(display='diagram')

### Model 1 - Simple Linear Regression

In [8]:
numerical_columns = X.drop(columns = ['Unit_Type','Home_Type','Neighborhood','boro_names']).columns
categorical_columns = ['Unit_Type','Home_Type','Neighborhood','boro_names']

In [9]:
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [10]:
preprocesser = ColumnTransformer(
    transformers = [
        ('numerical', categorical_preprocessor, categorical_columns),
        ('categorical', numerical_preprocessor, numerical_columns)])

In [11]:
model = make_pipeline(preprocesser,
                      LinearRegression())

In [12]:
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)

In [14]:
mean_absolute_error(y_test, y_pred)

266.7682845744681

In [15]:
r2_score(y_test, y_pred) 

0.7877969808039117

### Model 2 - Ridge Regression

In [58]:
lasso_model = Pipeline(steps=[
    ('Preprocess', preprocesser),
    ('PCA', PCA()),
    ('Lasso Model', Lasso())])

In [59]:
n_features_to_test = np.arange(1, 11)
alpha_to_test = 2.0**np.arange(-6, +6)

In [60]:
params = {'reduce_dim__n_components': n_features_to_test, 
          'regressor__alpha': alpha_to_test}

In [61]:
lasso_model.fit(X_train, y_train) 

In [62]:
y_pred = lasso_model.predict(X_test)

In [63]:
mean_absolute_error(y_test, y_pred)

267.33166603358967

In [64]:
r2_score(y_test, y_pred)

0.7855582483251918

### Model 3 - Random Forest

In [44]:
random_forest = Pipeline(steps=[
    ('Preprocess', preprocesser),
    ('Random Forest', RandomForestRegressor(random_state=47))])

In [50]:
n_est = [int(n) for n in np.logspace(start=1, stop=3, num=20)]
grid_params = {
        'randomforestregressor__n_estimators': n_est,
}

In [65]:
rf_grid_cv = GridSearchCV(random_forest, param_grid=grid_params, cv=5, n_jobs=-1) 

#### Next 
- XG Boost
- SVM
- Use Cross Validation