In [185]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `data_cleaned.csv` data from the `Resources` folder into a Pandas DataFrame.

In [186]:
# Load the cleaned dataset from the Resources folder; the first CSV column becomes the index
data_path = Path("Resources/data_cleaned.csv")
df_analysis = pd.read_csv(data_path, index_col=0)

# Review the DataFrame
df_analysis

Unnamed: 0,pin,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Rooms,Bedrooms,Basement,...,num_foreclosure_in_half_mile_past_5_years,num_foreclosure_per_1000_pin_past_5_years,nearest_cta_route_name,nearest_cta_stop_dist_ft,lake_michigan_dist_ft,nearest_metra_stop_dist_ft,nearest_water_dist_ft,nearest_neighbor_1_dist_ft,nearest_neighbor_2_dist_ft,nearest_neighbor_3_dist_ft
12,25153030260000,202,210,4158,70,1.0,0,4,2,4,...,203,64.55,Red Line,8641.316080,26084.996093,2717.148357,6391.361862,32.903876,32.903965,57.680425
26,20273010160000,211,111,5313,70,2.0,2,7,4,1,...,167,77.17,Red Line,2378.981155,15798.238046,7256.411321,3508.897334,29.830339,32.745589,56.233720
27,20341050210000,211,111,4500,70,2.0,2,7,4,1,...,206,71.58,Red Line,2483.343963,15881.861019,5154.668579,5247.755218,29.998205,30.015005,59.785008
63,20223010160000,203,30,3300,70,1.0,0,5,3,1,...,84,44.42,Red Line,1289.011353,13598.517588,8798.198308,6507.411449,32.998020,33.004924,64.502211
85,25011050180000,202,130,3643,70,1.0,0,5,3,1,...,79,25.86,Red Line,11968.230035,11828.693908,5331.604154,10291.059868,30.670624,60.659963,90.640905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212271,17301160040000,211,141,2400,77,1.0,2,8,4,1,...,29,13.46,Pink Line,2309.108308,19530.014594,3567.946967,2388.975064,24.001717,24.004066,47.991597
212301,17063060040000,211,52,2600,77,2.0,2,10,4,1,...,9,2.49,Blue Line,3185.879688,15656.812830,4851.155570,4911.124322,24.970062,26.459766,50.243183
212389,14311240200000,205,170,3600,77,2.0,0,9,4,1,...,11,2.88,Blue Line,1605.104188,15247.702912,5011.158050,3307.171219,30.017950,33.000013,57.338882
212394,16162141890000,295,80,1017,77,2.0,0,4,2,2,...,149,55.10,Blue Line,1952.905851,35789.669486,11523.308495,4453.705514,18.327534,24.763143,36.645742


In [187]:
# Columns that are not needed for the analysis.
# Append any additional column names to this list to drop them as well.
columns_to_drop = [
    'Town and Neighborhood',
    'pin',
    'Property Address',
    'Sale Year',
    'township_name',
    'property_city',
    'property_state',
]
df_analysis = df_analysis.drop(columns=columns_to_drop)

df_analysis

Unnamed: 0,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Rooms,Bedrooms,Basement,Basement Finish,...,num_foreclosure_in_half_mile_past_5_years,num_foreclosure_per_1000_pin_past_5_years,nearest_cta_route_name,nearest_cta_stop_dist_ft,lake_michigan_dist_ft,nearest_metra_stop_dist_ft,nearest_water_dist_ft,nearest_neighbor_1_dist_ft,nearest_neighbor_2_dist_ft,nearest_neighbor_3_dist_ft
12,202,210,4158,70,1.0,0,4,2,4,3.0,...,203,64.55,Red Line,8641.316080,26084.996093,2717.148357,6391.361862,32.903876,32.903965,57.680425
26,211,111,5313,70,2.0,2,7,4,1,3.0,...,167,77.17,Red Line,2378.981155,15798.238046,7256.411321,3508.897334,29.830339,32.745589,56.233720
27,211,111,4500,70,2.0,2,7,4,1,3.0,...,206,71.58,Red Line,2483.343963,15881.861019,5154.668579,5247.755218,29.998205,30.015005,59.785008
63,203,30,3300,70,1.0,0,5,3,1,1.0,...,84,44.42,Red Line,1289.011353,13598.517588,8798.198308,6507.411449,32.998020,33.004924,64.502211
85,202,130,3643,70,1.0,0,5,3,1,3.0,...,79,25.86,Red Line,11968.230035,11828.693908,5331.604154,10291.059868,30.670624,60.659963,90.640905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212271,211,141,2400,77,1.0,2,8,4,1,2.0,...,29,13.46,Pink Line,2309.108308,19530.014594,3567.946967,2388.975064,24.001717,24.004066,47.991597
212301,211,52,2600,77,2.0,2,10,4,1,3.0,...,9,2.49,Blue Line,3185.879688,15656.812830,4851.155570,4911.124322,24.970062,26.459766,50.243183
212389,205,170,3600,77,2.0,0,9,4,1,1.0,...,11,2.88,Blue Line,1605.104188,15247.702912,5011.158050,3307.171219,30.017950,33.000013,57.338882
212394,295,80,1017,77,2.0,0,4,2,2,3.0,...,149,55.10,Blue Line,1952.905851,35789.669486,11523.308495,4453.705514,18.327534,24.763143,36.645742


In [188]:
# Report how many missing cells the frame contains, then discard every row that has at least one
nan_count = df_analysis.isna().sum().sum()
print('Number of NaN values:', nan_count)
df_analysis = df_analysis.dropna(axis=0, how='any')
df_analysis

Number of NaN values: 86


Unnamed: 0,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Rooms,Bedrooms,Basement,Basement Finish,...,num_foreclosure_in_half_mile_past_5_years,num_foreclosure_per_1000_pin_past_5_years,nearest_cta_route_name,nearest_cta_stop_dist_ft,lake_michigan_dist_ft,nearest_metra_stop_dist_ft,nearest_water_dist_ft,nearest_neighbor_1_dist_ft,nearest_neighbor_2_dist_ft,nearest_neighbor_3_dist_ft
12,202,210,4158,70,1.0,0,4,2,4,3.0,...,203,64.55,Red Line,8641.316080,26084.996093,2717.148357,6391.361862,32.903876,32.903965,57.680425
26,211,111,5313,70,2.0,2,7,4,1,3.0,...,167,77.17,Red Line,2378.981155,15798.238046,7256.411321,3508.897334,29.830339,32.745589,56.233720
27,211,111,4500,70,2.0,2,7,4,1,3.0,...,206,71.58,Red Line,2483.343963,15881.861019,5154.668579,5247.755218,29.998205,30.015005,59.785008
63,203,30,3300,70,1.0,0,5,3,1,1.0,...,84,44.42,Red Line,1289.011353,13598.517588,8798.198308,6507.411449,32.998020,33.004924,64.502211
85,202,130,3643,70,1.0,0,5,3,1,3.0,...,79,25.86,Red Line,11968.230035,11828.693908,5331.604154,10291.059868,30.670624,60.659963,90.640905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212271,211,141,2400,77,1.0,2,8,4,1,2.0,...,29,13.46,Pink Line,2309.108308,19530.014594,3567.946967,2388.975064,24.001717,24.004066,47.991597
212301,211,52,2600,77,2.0,2,10,4,1,3.0,...,9,2.49,Blue Line,3185.879688,15656.812830,4851.155570,4911.124322,24.970062,26.459766,50.243183
212389,205,170,3600,77,2.0,0,9,4,1,1.0,...,11,2.88,Blue Line,1605.104188,15247.702912,5011.158050,3307.171219,30.017950,33.000013,57.338882
212394,295,80,1017,77,2.0,0,4,2,2,3.0,...,149,55.10,Blue Line,1952.905851,35789.669486,11523.308495,4453.705514,18.327534,24.763143,36.645742


In [189]:
# Count how many rows fall into each property class (most frequent first)
df_analysis.loc[:, 'Property Class'].value_counts()

Property Class
211    3390
203    3192
202    1413
205     887
295     500
278     463
206     294
204     244
212     239
210     213
234     198
207     190
208      39
209      19
Name: count, dtype: int64

In [190]:
# Keep only sales priced strictly between the two bounds and drop the
# unwanted property classes, all in a single pass.
#
# A boolean row mask is used instead of repeated `drop(<matching rows>.index)`
# calls: the mask selects rows by position, so it stays correct even if index
# labels repeat, and it avoids re-scanning the frame once per condition.
MIN_SALE_PRICE = 100000    # sales at or below this price are treated as outliers
MAX_SALE_PRICE = 1000000   # sales at or above this price are treated as outliers
EXCLUDED_PROPERTY_CLASSES = [211, 212, 210, 295, 234]

sale_price = df_analysis['Sale Price']
is_price_outlier = (sale_price <= MIN_SALE_PRICE) | (sale_price >= MAX_SALE_PRICE)
is_excluded_class = df_analysis['Property Class'].isin(EXCLUDED_PROPERTY_CLASSES)

# .copy() yields an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_analysis = df_analysis[~(is_price_outlier | is_excluded_class)].copy()

df_analysis

Unnamed: 0,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Rooms,Bedrooms,Basement,Basement Finish,...,num_foreclosure_in_half_mile_past_5_years,num_foreclosure_per_1000_pin_past_5_years,nearest_cta_route_name,nearest_cta_stop_dist_ft,lake_michigan_dist_ft,nearest_metra_stop_dist_ft,nearest_water_dist_ft,nearest_neighbor_1_dist_ft,nearest_neighbor_2_dist_ft,nearest_neighbor_3_dist_ft
85,202,130,3643,70,1.0,0,5,3,1,3.0,...,79,25.86,Red Line,11968.230035,11828.693908,5331.604154,10291.059868,30.670624,60.659963,90.640905
88,203,280,5000,70,1.0,0,4,2,1,1.0,...,75,38.96,Red Line,30628.719167,21334.582001,19260.991468,3874.155277,39.975687,39.993200,79.968141
101,205,83,5625,70,2.0,0,8,4,1,3.0,...,276,103.37,Green Line,12439.735556,3295.575031,2343.226246,3295.575031,39.042397,47.495242,84.988992
104,202,180,5103,70,1.0,0,4,2,1,3.0,...,75,32.29,Red Line,15191.034305,12175.922724,7527.763165,5530.625227,39.767487,41.087447,79.564795
165,203,280,3225,70,5.0,0,6,3,1,3.0,...,35,18.76,Red Line,34449.573940,23712.695668,21598.269739,2187.124433,25.024367,25.025674,50.059607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211255,203,40,3125,77,5.0,0,7,3,2,3.0,...,252,70.27,Green Line,6570.610024,26181.506779,6874.136989,4775.103009,25.105343,25.114614,50.220270
211359,203,151,3125,77,1.0,0,5,3,1,3.0,...,57,19.31,Blue Line,1227.619437,19162.310981,6689.081018,4482.496424,25.012356,25.013238,49.998406
211518,203,60,3125,77,5.0,0,7,4,1,3.0,...,77,21.83,Blue Line,1822.114000,15573.537881,5321.266154,3896.896968,24.995481,25.021251,43.774661
211999,278,30,3125,77,2.0,0,7,4,1,1.0,...,17,5.53,Blue Line,3721.886645,18537.459764,6967.030306,902.672069,24.986993,25.297583,49.994583


In [191]:
# Combine the land and building estimates into a single total per row
df_analysis['Estimated Total'] = df_analysis['Estimate (Land)'].add(df_analysis['Estimate (Building)'])
df_analysis['Estimated Total']

85        114210
88        129220
101       208080
104        85090
165       116550
           ...  
211255    123750
211359    487850
211518    599150
211999     79680
212208    188180
Name: Estimated Total, Length: 3214, dtype: int64

In [192]:
# Remove the 'Apartments' column from the analysis frame
df_analysis = df_analysis.drop('Apartments', axis=1)

In [193]:
# List of all potential string variables for getting dummies
# ['Property Class', 'Neighborhood Code', 'Town Code', 'Type of Residence', 'Basement Finish', 
# 'Central Heating', 'Other Heating', 'Attic Type', 'Garage 1 Size', 'Garage 1 Attachment', 'Garage 1 Area', 
# 'Porch', 'Use', 'property_zip', 'ward_num', 'school_elementary_district_name', 'school_secondary_district_name', 'nearest_cta_route_name']


In [194]:
# Columns to one-hot encode
categorical_columns = [
    'Property Class', 'Neighborhood Code', 'Town Code', 'Type of Residence', 'Basement Finish',
    'Central Heating', 'Other Heating', 'Attic Type', 'Garage 1 Size', 'Garage 1 Attachment', 'Garage 1 Area',
    'Porch', 'Use', 'property_zip', 'ward_num', 'school_elementary_district_name',
    'school_secondary_district_name', 'nearest_cta_route_name',
]

# drop_first=True omits the first level of every encoded column
cat_dummies = pd.get_dummies(df_analysis, columns=categorical_columns, drop_first=True)
cat_dummies.head()

Unnamed: 0,Land Square Feet,Rooms,Bedrooms,Basement,Central Air,Fireplaces,Half Baths,Building Square Feet,Estimate (Land),Estimate (Building),...,school_secondary_district_name_CPS SECONDARY - VOISE HS,school_secondary_district_name_CPS SECONDARY - WASHINGTON HS,school_secondary_district_name_CPS SECONDARY - WELLS HS,nearest_cta_route_name_Brown Line,nearest_cta_route_name_Green Line,nearest_cta_route_name_Orange Line,nearest_cta_route_name_Pink Line,nearest_cta_route_name_Purple Line,nearest_cta_route_name_Red Line,nearest_cta_route_name_Yellow Line
85,3643,5,3,1,False,0,0,982,29140,85070,...,False,False,False,False,False,False,False,False,True,False
88,5000,4,2,1,True,0,0,1044,30000,99220,...,False,True,False,False,False,False,False,False,True,False
101,5625,8,4,1,False,0,1,1998,70310,137770,...,False,False,False,False,True,False,False,False,False,False
104,5103,4,2,1,False,0,0,797,25510,59580,...,False,False,False,False,False,False,False,False,True,False
165,3225,6,3,1,False,0,1,1260,19350,97200,...,False,True,False,False,False,False,False,False,True,False


### Step 2: Create the labels set (`y`) from the `Sale Price` column, and then create the features (`X`) DataFrame from the remaining columns.

In [195]:
# Split the encoded frame into features (X) and labels (y)

# Features: every column except the sale price
X = cat_dummies.drop(columns=["Sale Price"])

# Labels: the sale price that the model will predict
y = cat_dummies["Sale Price"]

In [196]:
# Preview the first five labels
y.head(n=5)

85     172000
88     120000
101    329000
104    107200
165    115000
Name: Sale Price, dtype: int64

In [197]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,Land Square Feet,Rooms,Bedrooms,Basement,Central Air,Fireplaces,Half Baths,Building Square Feet,Estimate (Land),Estimate (Building),...,school_secondary_district_name_CPS SECONDARY - VOISE HS,school_secondary_district_name_CPS SECONDARY - WASHINGTON HS,school_secondary_district_name_CPS SECONDARY - WELLS HS,nearest_cta_route_name_Brown Line,nearest_cta_route_name_Green Line,nearest_cta_route_name_Orange Line,nearest_cta_route_name_Pink Line,nearest_cta_route_name_Purple Line,nearest_cta_route_name_Red Line,nearest_cta_route_name_Yellow Line
85,3643,5,3,1,False,0,0,982,29140,85070,...,False,False,False,False,False,False,False,False,True,False
88,5000,4,2,1,True,0,0,1044,30000,99220,...,False,True,False,False,False,False,False,False,True,False
101,5625,8,4,1,False,0,1,1998,70310,137770,...,False,False,False,False,True,False,False,False,False,False
104,5103,4,2,1,False,0,0,797,25510,59580,...,False,False,False,False,False,False,False,False,True,False
165,3225,6,3,1,False,0,1,1260,19350,97200,...,False,True,False,False,False,False,False,False,True,False


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [198]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Hold out a test set using the default split proportions;
# random_state=1 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(2410, 634)

In [199]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


In [200]:
# Instantiate the StandardScaler (default centering and scaling, spelled out explicitly)
scaler = StandardScaler(with_mean=True, with_std=True)


In [201]:
# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X=X_train)

In [202]:
# Apply the fitted scaler to the training features, then to the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [203]:
# Create a random forest regressor: 500 trees, fixed seed for reproducible results
rf_model = RandomForestRegressor(
    n_estimators=500,
    random_state=78,
)

In [204]:
# Train the random forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [205]:
# Predict a value for every row of the scaled testing features
predictions = rf_model.predict(X=X_test_scaled)

---

## Create a Random Forest Regression Model with the Scaled Data

### Step 1: Fit a random forest regression model by using the scaled training data (`X_train_scaled` and `y_train`).

### Step 2: Save the predictions on the testing data labels by using the scaled testing feature data (`X_test_scaled`) and the fitted model.

In [206]:
# Put the predictions next to the actual test labels on a fresh 0..n-1 index
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test.to_numpy()})
results.head(10)

Unnamed: 0,Prediction,Actual
0,282449.996,272400
1,272765.874,375900
2,173076.906,105000
3,353301.72,540000
4,158834.474,165000
5,282161.896,250000
6,296925.456,182500
7,337399.766,250000
8,177397.116,171000
9,323175.426,213000


In [207]:
from sklearn import metrics

# Error metrics on the held-out test set. The mean squared error is computed
# once and reused for the root mean squared error instead of being recomputed.
mean_abs_error = metrics.mean_absolute_error(y_test, predictions)
mean_sq_error = metrics.mean_squared_error(y_test, predictions)
root_mean_sq_error = np.sqrt(mean_sq_error)

print('Mean Absolute Error:', mean_abs_error)
print('Mean Squared Error:', mean_sq_error)
print('Root Mean Squared Error:', root_mean_sq_error)

Mean Absolute Error: 60415.25187562189
Mean Squared Error: 7113266660.44435
Root Mean Squared Error: 84340.18413807472
