In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor

In [3]:
train_df = pd.read_csv('Datasets/SE_rents2018_train.csv', index_col=0)
test_df = pd.read_csv('Datasets/SE_rents2018_test1.csv', index_col=0)
submit1_df = pd.read_csv('Datasets/SE_rents2018_test2.csv', index_col=0)
# submit2_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test3.csv', index_col=0)

In [4]:
submit1_df.describe()

Unnamed: 0,building_id,bedrooms,bathrooms,size_sqft,addr_zip,addr_lat,addr_lon,bin,bbl,floor_count,...,has_washer_dryer,has_garage,has_roofdeck,has_concierge,has_pool,has_garden,has_childrens_playroom,rent,no_fee,floornumber
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,0.0,2000.0,1737.0
mean,1688109.0,1.688,1.2265,833.983,10684.9935,40.726794,-73.957842,2361318.0,2278957000.0,9.7909,...,0.2705,0.1625,0.2545,0.197,0.0725,0.1385,0.0885,,0.525,6.048647
std,4611790.0,1.094648,0.509242,492.945515,589.143729,0.056451,0.042068,1223083.0,1181451000.0,11.957536,...,0.444329,0.369001,0.435689,0.397832,0.259379,0.34551,0.284092,,0.499499,7.612185
min,178.0,0.0,0.0,0.0,10001.0,40.575849,-74.166298,1000000.0,1000158000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
25%,64915.75,1.0,1.0,600.0,10023.0,40.690963,-73.985497,1055487.0,1011630000.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2.0
50%,260830.0,2.0,1.0,800.0,11103.0,40.726669,-73.961599,3021102.0,3009320000.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,3.0
75%,727756.8,2.0,1.0,1000.0,11217.0,40.766491,-73.938455,3200882.0,3049853000.0,9.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,1.0,6.0
max,18772160.0,6.0,5.0,5139.0,11693.0,40.909842,-73.750141,5166556.0,5016020000.0,90.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,63.0


In [5]:
print("Rent is Null: ",submit1_df['rent'].isnull().sum())
print("size_sqft is 0: ",(submit1_df['size_sqft']==0).sum())
print("Doesnt have a bed rooms: ",(submit1_df['bedrooms']==0).sum())
print("Doesnt have bathroom: ",(submit1_df['bathrooms']==0).sum())

Rent is Null:  2000
size_sqft is 0:  177
Doesnt have a bed rooms:  253
Doesnt have bathroom:  1


In [6]:
print("Null value in data set: ", train_df.isnull().sum())
print("\n\nNumber of data: ", train_df.shape)
print("Rent is Null: ",train_df['rent'].isnull().sum())
print("size_sqft is 0: ",(train_df['size_sqft']==0).sum())
print("Doesnt have a bed rooms: ",(train_df['bedrooms']==0).sum())
print("Doesnt have bathroom: ",(train_df['bathrooms']==0).sum())

Null value in data set:  addr_unit                   91
building_id                  0
bedrooms                     0
bathrooms                    0
size_sqft                    0
created_at                   0
addr_street                  0
addr_city                    0
addr_zip                     0
addr_lat                     0
addr_lon                     0
bin                          1
bbl                          0
floor_count                  0
year_built                 403
min_to_subway              126
has_doorman                  0
has_elevator                 0
has_fireplace                0
has_dishwasher               0
is_furnished                 0
has_gym                      0
allows_pets                  0
has_washer_dryer             0
has_garage                   0
has_roofdeck                 0
has_concierge                0
has_pool                     0
has_garden                   0
has_childrens_playroom       0
rent                         0
no_fee        

### Original Test Set Without Modification
#### Feature:
1. bedrooms
2. year_built
3. bathrooms
4. min_to_subway
5. size_sqft
6. no_fee
7. has_doorman

In [7]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



2701143.227707072

### Impute "size_sqrt" with mean
#### Feature:
1. bedrooms
2. year_built
3. bathrooms
4. min_to_subway
5. size_sqft
6. no_fee
7. has_doorman

In [8]:
train_df['size_sqft'].replace(0, np.nan, inplace= True)

mean_value=train_df['size_sqft'].mean()
train_df['size_sqft']=train_df['size_sqft'].fillna(mean_value)

In [9]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



2449004.457535808

## Impute "min_to_subway" with mean + imputed "size_sqrt"
#### Feature:
1. bedrooms
2. year_built
3. bathrooms
4. min_to_subway
5. size_sqft
6. no_fee
7. has_doorman

In [10]:
#Impute "min_to_subway"
train_df['min_to_subway'].replace(0, np.nan, inplace= True)
mean_value=train_df['min_to_subway'].mean()
train_df['min_to_subway']=train_df['min_to_subway'].fillna(mean_value)

In [11]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



2520210.2518225485

### Add "floornumber" feature +imputed "min_to_subway" and "size_sqft"
#### Feature:
1. bedrooms
2. year_built
3. bathrooms
4. min_to_subway
5. size_sqft
6. no_fee
7. has_doorman
8. floornumber

In [12]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman', 'floornumber'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



2285899.437293435

In [13]:
#Impute "floornumber"
train_df['floornumber'].replace(0, np.nan, inplace= True)
mean_value=train_df['floornumber'].mean()
train_df['floornumber']=train_df['floornumber'].fillna(mean_value)

In [14]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman', 'floornumber'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



2413456.579933031

### Add addr_lat, addr_lon
#### Feature:
1. bedrooms
2. year_built
3. bathrooms
4. min_to_subway
5. size_sqft
6. no_fee
7. has_doorman
8. floornumber
9. addr_lat
10. addr_lon

In [15]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman', 'floornumber', 'addr_lat', 'addr_lon'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



1764292.3148026385

### Impute 'bedrooms' with mean

In [16]:
#Impute "bedrooms"
train_df['bedrooms'].replace(0, np.nan, inplace= True)
mean_value=train_df['bedrooms'].mean()
train_df['bedrooms']=train_df['bedrooms'].fillna(mean_value)

In [17]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman', 'floornumber', 'addr_lat', 'addr_lon'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



1815886.329065073

In [18]:
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman', 'floornumber', 
    'addr_lat', 'addr_lon', 'addr_zip', 'has_gym',
    'has_washer_dryer', 'has_garage'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)

test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])



1768734.1212864988