# Capstone Two: Modeling

In [1]:
#os.getcwd()
#os.chdir ('Springboard_Debisree/predicting-cab-booking-cancellations/')

import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime


# Silence all Python warnings so cell outputs stay clean.
# NOTE(review): this blanket filter also hides pandas / scikit-learn deprecation and
# chained-assignment warnings; consider narrowing it while the notebook is under development.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw listings. The raw frame is left untouched below so this cell is
# safe to re-run and `realtor_data` always reflects the file contents.
realtor_data = pd.read_csv('realtor-data.csv')

# Drop rows with a missing value in the city, price or zip_code columns.
# dropna() returns a new frame; the previous alias + inplace=True combination
# silently removed the same rows from `realtor_data` as well.
realtor_data2_drop = realtor_data.dropna(subset=['zip_code', 'price', 'city'])

# Per-column summary of the remaining gaps: absolute count and percentage of rows.
missing = pd.concat(
    [realtor_data2_drop.isnull().sum(), 100 * realtor_data2_drop.isnull().mean()],
    axis=1,
)
missing.columns = ['count', '%']
missing.sort_values(by='%')

Unnamed: 0,count,%
status,0,0.0
city,0,0.0
state,0,0.0
zip_code,0,0.0
price,0,0.0
bath,193804,13.838932
bed,216158,15.43516
acre_lot,357319,25.515022
house_size,449764,32.116227
prev_sold_date,685717,48.964886


In [3]:
# Fill missing values in every non-object (numeric) column with that column's mean.
# Object columns are skipped, so their gaps remain.
# NOTE(review): the mean is computed over all rows, before the train/test split
# performed later in the notebook, so held-out rows influence the fill value.
for column in realtor_data2_drop.columns[realtor_data2_drop.isnull().sum() > 0]:
    if realtor_data2_drop[column].dtype != 'object':  # numeric columns only
        mean_val = realtor_data2_drop[column].mean()
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the in-place form on a column selection is deprecated
        # and has no effect on the frame under pandas Copy-on-Write.
        realtor_data2_drop[column] = realtor_data2_drop[column].fillna(mean_val)

In [4]:
# Fill missing values in numeric columns with the column mean.
# This repeats the imputation from the previous cell; once numeric columns have
# no gaps left, the loop only visits object columns and changes nothing.
for column in realtor_data2_drop.columns[realtor_data2_drop.isnull().sum() > 0]:
    if realtor_data2_drop[column].dtype != 'object':  # numeric columns only
        mean_val = realtor_data2_drop[column].mean()
        # Assign back rather than using fillna(inplace=True) on a column selection,
        # which is deprecated and a no-op under pandas Copy-on-Write.
        realtor_data2_drop[column] = realtor_data2_drop[column].fillna(mean_val)

In [5]:
# One-hot encode the status column into indicator columns (one per status value).
dummy = pd.get_dummies(realtor_data2_drop['status'])

# Attach the indicator columns to the listings by row index.
# (A pd.concat(..., axis=1) that built the same frame was removed: its result was
# overwritten by this merge on the very next line, so it was wasted work.)
df = realtor_data2_drop.merge(dummy, left_index=True, right_index=True)
df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price,for_sale,ready_to_build
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0,1,0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0,1,0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0,1,0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0,1,0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,2178.735694,,65000.0,1,0


In [6]:
# Remove the text/date columns (and the original status column, now encoded as
# indicator columns) so only numeric predictors and the price target remain.
df = df.drop(labels=['city', 'state', 'prev_sold_date', 'status'], axis='columns')
df.head()

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,price,for_sale,ready_to_build
0,3.0,2.0,0.12,601.0,920.0,105000.0,1,0
1,4.0,2.0,0.08,601.0,1527.0,80000.0,1,0
2,2.0,1.0,0.15,795.0,748.0,67000.0,1,0
3,4.0,2.0,0.1,731.0,1800.0,145000.0,1,0
4,6.0,2.0,0.05,680.0,2178.735694,65000.0,1,0


In [7]:
# Expected row counts for a 70% / 30% train/test partition of df.
0.7 * len(df), 0.3 * len(df)

(980298.2, 420127.8)

In [8]:
# Hold out 30% of the rows for testing; every column except price is a predictor.
# The fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['price']),
    df['price'],
    test_size=0.3,
    random_state=47,
)

In [9]:
# Sanity check: (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

((980298, 7), (420128, 7))

In [10]:
# Sanity check: target vector lengths should match the feature row counts above.
y_train.shape, y_test.shape

((980298,), (420128,))

In [11]:
# Confirm every training feature is numeric before scaling / fitting.
X_train.dtypes

bed               float64
bath              float64
acre_lot          float64
zip_code          float64
house_size        float64
for_sale            uint8
ready_to_build      uint8
dtype: object

In [12]:
# Confirm the test features carry the same numeric dtypes as the training features.
X_test.dtypes

bed               float64
bath              float64
acre_lot          float64
zip_code          float64
house_size        float64
for_sale            uint8
ready_to_build      uint8
dtype: object

In [13]:
# Learn the scaling statistics from the training rows only, then apply that same
# fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression Model

In [14]:
# Fit ordinary least squares on the scaled training data, then predict on both
# partitions so train and test error can be compared.
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
y_train_pred, y_test_pred = lm.predict(X_train_scaled), lm.predict(X_test_scaled)

In [15]:
# R^2 of the linear model on (train, test).
r2_score(y_true=y_train, y_pred=y_train_pred), r2_score(y_true=y_test, y_pred=y_test_pred)

(0.06848774387479661, 0.12400501680557019)

In [16]:
# Mean absolute error of the linear model on (train, test), in price units.
mean_absolute_error(y_true=y_train, y_pred=y_train_pred), mean_absolute_error(y_true=y_test, y_pred=y_test_pred)

(613564.1023944328, 611785.4183060728)

In [17]:
# Mean squared error of the linear model on (train, test).
mean_squared_error(y_true=y_train, y_pred=y_train_pred), mean_squared_error(y_true=y_test, y_pred=y_test_pred)

(7485952237196.852, 4601873192794.038)

In [18]:
# Same linear model as above, wrapped in one estimator: median imputation, then
# standard scaling, then ordinary least squares, all fit by a single pipe.fit call.
# NOTE(review): numeric gaps were already mean-filled earlier in the notebook, so the
# median imputer presumably has nothing left to fill here — confirm.
pipe = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(), 
    LinearRegression()
)

In [19]:
# make_pipeline returns a Pipeline object.
type(pipe)

sklearn.pipeline.Pipeline

In [20]:
# The pipeline exposes the same fit / predict methods as a single estimator.
hasattr(pipe, 'fit'), hasattr(pipe, 'predict')

(True, True)

In [21]:
# Fit every pipeline step on the unscaled training data; scaling happens inside the pipeline.
pipe.fit(X_train,y_train)

In [22]:
# Pipeline predictions for the training and test partitions.
y_tr_pred, y_te_pred = pipe.predict(X_train), pipe.predict(X_test)

In [23]:
# R^2 of the pipeline on (train, test).
r2_score(y_true=y_train, y_pred=y_tr_pred), r2_score(y_true=y_test, y_pred=y_te_pred)

(0.06848774387479661, 0.12400501680557019)

In [24]:
# Mean absolute error of the pipeline on (train, test), in price units.
mean_absolute_error(y_true=y_train, y_pred=y_tr_pred), mean_absolute_error(y_true=y_test, y_pred=y_te_pred)

(613564.1023944328, 611785.4183060728)

In [25]:
# Define a new pipeline that adds univariate feature selection (SelectKBest with
# f_regression scores) between scaling and the linear regression.
# NOTE(review): X_train has 7 columns (see the X_train.shape output earlier), so k=7
# keeps every feature and this pipeline behaves like `pipe`; the scores reported
# below are identical to the earlier ones. Use k < 7, or grid-search k, to actually
# compare feature subsets.
pipe7 = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    SelectKBest(f_regression, k=7),
    LinearRegression()
)

In [26]:
# Fit the feature-selection pipeline on the unscaled training data.
pipe7.fit(X_train, y_train)

In [27]:
# Predictions from the feature-selection pipeline; these replace the earlier
# y_tr_pred / y_te_pred values.
y_tr_pred, y_te_pred = pipe7.predict(X_train), pipe7.predict(X_test)

In [28]:
# R^2 of the feature-selection pipeline on (train, test).
r2_score(y_true=y_train, y_pred=y_tr_pred), r2_score(y_true=y_test, y_pred=y_te_pred)

(0.06848774387479661, 0.12400501680557019)

In [29]:
# Mean absolute error of the feature-selection pipeline on (train, test).
mean_absolute_error(y_true=y_train, y_pred=y_tr_pred), mean_absolute_error(y_true=y_test, y_pred=y_te_pred)

(613564.1023944328, 611785.4183060728)

# Random Forest Model

In [37]:
# Reload the raw listings from disk, replacing the frame loaded at the top of the notebook.
# NOTE(review): this cell has no display expression, yet the saved output beneath it
# shows a table with for_sale / ready_to_build columns, so that output is stale.
# The execution counts in this section (37, 39, 35) are also out of order — re-verify
# with Restart Kernel -> Run All.
realtor_data = pd.read_csv('realtor-data.csv')

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,price,for_sale,ready_to_build
0,3.0,2.0,0.12,601.0,920.0,105000.0,1,0
1,4.0,2.0,0.08,601.0,1527.0,80000.0,1,0
2,2.0,1.0,0.15,795.0,748.0,67000.0,1,0
3,4.0,2.0,0.1,731.0,1800.0,145000.0,1,0
4,6.0,2.0,0.05,680.0,2178.735694,65000.0,1,0


In [39]:
# Numeric-only view of the reloaded listings (text/date/status columns removed).
# NOTE(review): no visible cell reads df2 afterwards — the random forest cell below
# selects its features from df — so this frame looks unused; confirm before keeping.
df2 = realtor_data.drop(labels=['city', 'state', 'prev_sold_date', 'status'], axis='columns')
df2.head()

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,price
0,3.0,2.0,0.12,601.0,920.0,105000.0
1,4.0,2.0,0.08,601.0,1527.0,80000.0
2,2.0,1.0,0.15,795.0,748.0,67000.0
3,4.0,2.0,0.1,731.0,1800.0,145000.0
4,6.0,2.0,0.05,680.0,2178.735694,65000.0


In [35]:
# Select features and target variable
X = df[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'for_sale', 'ready_to_build']]
y = df['price']

# Split data into training and testing sets BEFORE scaling, so that no statistics
# from the held-out rows leak into preprocessing.
# NOTE(review): this is an 80/20 split with random_state=42, whereas the linear
# models above were scored on a 70/30 split with random_state=47; the two sets of
# scores are therefore measured on different test rows. This cell also replaces
# the X_train / X_test / y_train / y_test / scaler objects created earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numeric features: learn mean / std from the training rows only and
# apply that same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full scaled matrix, kept under its original name for any later cell that reads it;
# it now uses the training-only statistics.
X_scaled = scaler.transform(X)

# Build Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # Example parameters, adjust as needed

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 771883827830.5082
R-squared: 0.8684630185113822
