# Importing Libraries

In [1]:
import numpy                 as np
import pandas                as pd
import matplotlib.pyplot     as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LinearRegression

# Dataset

In [2]:
# Read the StreetEasy rental listings; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='StreetEasy.csv')

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(3539, 18)

In [4]:
# Preview the first five rows with the original CSV column names.
df.head()

Unnamed: 0,rental_id,rent,bedrooms,bathrooms,size_sqft,min_to_subway,floor,building_age_yrs,no_fee,has_roofdeck,has_washer_dryer,has_doorman,has_elevator,has_dishwasher,has_patio,has_gym,neighborhood,borough
0,1545,2550,0.0,1,480,9,2.0,17,1,1,0,0,1,1,0,1,Upper East Side,Manhattan
1,2472,11500,2.0,2,2000,4,1.0,96,0,0,0,0,0,0,0,0,Greenwich Village,Manhattan
2,2919,4500,1.0,1,916,2,51.0,29,0,1,0,1,1,1,0,0,Midtown,Manhattan
3,2790,4795,1.0,1,975,3,8.0,31,0,0,0,1,1,1,0,1,Greenwich Village,Manhattan
4,3946,17500,2.0,2,4800,3,4.0,136,0,0,0,1,1,1,0,1,Soho,Manhattan


## Renaming

In [5]:
# Map each raw StreetEasy column to the display label used in the rest of the
# notebook. Renaming by name (instead of assigning a positional list to
# df.columns) keeps every label attached to the right data even if the CSV's
# column order changes.
# NOTE(review): "No. Beedrooms" and "No Borook" (the no_fee flag) look like
# typos, but the feature-selection cell indexes by these exact labels, so they
# are kept verbatim here.
COLUMN_LABELS = {
    "rental_id":        "ID",
    "rent":             "Rent",
    "bedrooms":         "No. Beedrooms",
    "bathrooms":        "No. Bathrooms",
    "size_sqft":        "Area (sqft)",
    "min_to_subway":    "Minutes to Subway",
    "floor":            "Floor",
    "building_age_yrs": "Building Age (Years)",
    "no_fee":           "No Borook",
    "has_roofdeck":     "Roof",
    "has_washer_dryer": "Washer/Dryer",
    "has_doorman":      "Doorman",
    "has_elevator":     "Elevator",
    "has_dishwasher":   "Dishwasher",
    "has_patio":        "Patio",
    "has_gym":          "Gym",
    "neighborhood":     "Neighborhood",
    "borough":          "Borough",
}
df = df.rename(columns=COLUMN_LABELS)

# rename() ignores keys it cannot find, so verify the result explicitly.
# Checking the *new* labels keeps this cell safe to re-run after the rename.
missing_labels = set(COLUMN_LABELS.values()) - set(df.columns)
assert not missing_labels, f"Columns missing after rename: {sorted(missing_labels)}"

In [6]:
# Preview again to confirm the new column labels are in place.
df.head()

Unnamed: 0,ID,Rent,No. Beedrooms,No. Bathrooms,Area (sqft),Minutes to Subway,Floor,Building Age (Years),No Borook,Roof,Washer/Dryer,Doorman,Elevator,Dishwasher,Patio,Gym,Neighborhood,Borough
0,1545,2550,0.0,1,480,9,2.0,17,1,1,0,0,1,1,0,1,Upper East Side,Manhattan
1,2472,11500,2.0,2,2000,4,1.0,96,0,0,0,0,0,0,0,0,Greenwich Village,Manhattan
2,2919,4500,1.0,1,916,2,51.0,29,0,1,0,1,1,1,0,0,Midtown,Manhattan
3,2790,4795,1.0,1,975,3,8.0,31,0,0,0,1,1,1,0,1,Greenwich Village,Manhattan
4,3946,17500,2.0,2,4800,3,4.0,136,0,0,0,1,1,1,0,1,Soho,Manhattan


## Removing Non-numerical Values

### Dataset Of Independent Variables

In [7]:
# Predictor columns fed to the model: every column except ID, Rent,
# Neighborhood and Borough.
feature_columns = [
    "No. Beedrooms",
    "No. Bathrooms",
    "Area (sqft)",
    "Minutes to Subway",
    "Floor",
    "Building Age (Years)",
    "No Borook",
    "Roof",
    "Washer/Dryer",
    "Doorman",
    "Elevator",
    "Dishwasher",
    "Patio",
    "Gym",
]
X = df[feature_columns]

In [8]:
# Same number of rows as df, one column per selected feature.
X.shape

(3539, 14)

In [9]:
# Preview the first five rows of the feature matrix.
X.head()

Unnamed: 0,No. Beedrooms,No. Bathrooms,Area (sqft),Minutes to Subway,Floor,Building Age (Years),No Borook,Roof,Washer/Dryer,Doorman,Elevator,Dishwasher,Patio,Gym
0,0.0,1,480,9,2.0,17,1,1,0,0,1,1,0,1
1,2.0,2,2000,4,1.0,96,0,0,0,0,0,0,0,0
2,1.0,1,916,2,51.0,29,0,1,0,1,1,1,0,0
3,1.0,1,975,3,8.0,31,0,0,0,1,1,1,0,1
4,2.0,2,4800,3,4.0,136,0,0,0,1,1,1,0,1


### Dataset of Dependent Variables

In [10]:
# Target: the rent column. Selecting with a list keeps y as a one-column
# DataFrame rather than a Series.
y = df.loc[:, ['Rent']]

In [11]:
# One target column, with the same row count as X.
y.shape

(3539, 1)

In [12]:
# Preview the first five target values.
y.head()

Unnamed: 0,Rent
0,2550
1,11500
2,4500
3,4795
4,17500


# Splitting The Data

In [13]:
# Shuffle the rows and keep 80% for training and 20% for testing; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Training features: (rows, feature columns).
X_train.shape

(2831, 14)

In [15]:
# Training targets: row count must match X_train.
y_train.shape

(2831, 1)

In [16]:
# Held-out features: (rows, feature columns).
X_test.shape

(708, 14)

In [17]:
# Held-out targets: row count must match X_test.
y_test.shape

(708, 1)

# Training The Model

In [18]:
# Linear regression estimator with scikit-learn's default settings
# (no arguments are overridden).
Regressor = LinearRegression()

In [19]:
# Fit the model on the training split only; the test split stays unseen.
Regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [20]:
# Fitted coefficients, one per feature, in the column order of X.
# The result is 2-D because y was passed as a one-column DataFrame.
Regressor.coef_

array([[-340.78500086, 1113.75188182,    5.09389286,  -15.56353083,
          24.5099582 ,   -6.94641137,  -94.28773553,   14.53922629,
          42.24673489, -134.17225109,  111.55155092,   46.20628834,
        -190.81439183,   22.52139667]])

In [21]:
# Fitted intercept: the model's prediction when every feature equals zero.
Regressor.intercept_

array([-520.20417668])

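- Each coefficient is the change in predicted rent for a one-unit increase in that independent variable, holding the others fixed; a larger magnitude means the variable <b>carries more weight</b> per unit.
- For example, a coefficient of -1.345 will impact the rent more than a coefficient of 0.238, with the former impacting prices negatively and the latter positively. This comparison is only fair when the variables are measured on comparable scales: here a unit of "Area (sqft)" is one square foot while a unit of "Gym" is the whole 0-to-1 amenity flag, so standardize the features before ranking them by coefficient size.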

# Getting Predictions For Testing Set

In [22]:
# Predict rents for the held-out test rows.
y_predict = Regressor.predict(X=X_test)

In [23]:
# First five predicted rents for the test set.
y_predict[:5]

array([[3666.20914896],
       [6494.14287776],
       [3666.81738712],
       [4629.74866634],
       [4027.10441184]])

# Evaluating Model Accuracy

In [24]:
# score() returns the coefficient of determination (R^2), here on the data
# the model was fitted to.
print("Train score:", Regressor.score(X_train, y_train), sep="\n")

Train score:
0.7828648384708848


In [25]:
# R^2 on the held-out rows; compare with the train score to spot overfitting.
print("Test score:", Regressor.score(X_test, y_test), sep="\n")

Test score:
0.7616959930542507


# Making a Prediction

In [26]:
# Sonny's apartment: one value per feature, in the same order as the columns
# of X (the frame the model was trained on).
# Sonny doesn't have an elevator so the 11th item in the list is a 0
sonny_features = [
    1,    # No. Beedrooms
    1,    # No. Bathrooms
    620,  # Area (sqft)
    16,   # Minutes to Subway
    1,    # Floor
    98,   # Building Age (Years)
    1,    # No Borook
    0,    # Roof
    1,    # Washer/Dryer
    0,    # Doorman
    0,    # Elevator
    1,    # Dishwasher
    1,    # Patio
    0,    # Gym
]

# Wrap the values in a one-row DataFrame that carries the training column
# labels. The model was fitted on a DataFrame, and predicting from a bare
# nested list makes scikit-learn warn that X "does not have valid feature
# names"; labelled input also lets scikit-learn check the feature names
# against those seen during fit.
Sonny_apartment = pd.DataFrame([sonny_features], columns=X.columns)

In [27]:
# Run the fitted model on Sonny's single-row input.
prediction = Regressor.predict(X=Sonny_apartment)

In [28]:
# Predicted rent for Sonny's apartment (a 1x1 array: one row, one target).
prediction

array([[2309.07232733]])