In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_error

In [2]:
# Read the housing listings CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="American_Housing_Data_20231209.csv")
df.head(n=5)

Unnamed: 0,Zip Code,Price,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,10013,3999000.0,2,3,1967,74 GRAND ST APT 3,New York,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
1,10013,3999000.0,2,3,1967,74 GRAND ST APT 3,New York,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
2,10014,1650000.0,1,1,718,140 CHARLES ST APT 4D,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601
3,10014,760000.0,3,2,1538,38 JONES ST,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601
4,10014,1100000.0,1,1,600,81 BEDFORD ST APT 3F,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(39981, 14)

In [4]:
# Count the missing values in each column.
df.isna().sum()

Zip Code                   0
Price                      0
Beds                       0
Baths                      0
Living Space               0
Address                    0
City                       0
State                      0
Zip Code Population        0
Zip Code Density           0
County                     0
Median Household Income    2
Latitude                   0
Longitude                  0
dtype: int64

In [5]:
# Impute missing values in the int64/float64 columns with the column mean.
#
# The filled Series is assigned back to the frame explicitly. Calling
# `df[x].fillna(..., inplace=True)` operates on an intermediate object
# and, per the pandas warning, will never update `df` from pandas 3.0 on.
numeric_data = df.select_dtypes(include=["int64", "float64"])
for x in numeric_data:
    df[x] = df[x].fillna(df[x].mean())

# Confirm that no numeric column still contains nulls.
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mean(),inplace=True)


Zip Code                   0
Price                      0
Beds                       0
Baths                      0
Living Space               0
Address                    0
City                       0
State                      0
Zip Code Population        0
Zip Code Density           0
County                     0
Median Household Income    0
Latitude                   0
Longitude                  0
dtype: int64

In [6]:
# Impute missing values in the object/category columns with the most
# frequent value (the first mode) of each column.
#
# The selection is stored under its own name instead of re-using
# `numeric_data`, which would silently replace the numeric snapshot taken
# in the previous cell with non-numeric columns.
categorical_data = df.select_dtypes(include=["object", "category"])
for x in categorical_data:
    modes = df[x].mode()
    # A column with no non-null values has an empty mode; leave it as is
    # rather than failing on the missing first element.
    if modes.empty:
        continue
    # Assign back explicitly: `df[x].fillna(..., inplace=True)` acts on an
    # intermediate object and will never update `df` from pandas 3.0 on.
    df[x] = df[x].fillna(modes.iloc[0])

# Confirm that no column still contains nulls.
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0],inplace=True)


Zip Code                   0
Price                      0
Beds                       0
Baths                      0
Living Space               0
Address                    0
City                       0
State                      0
Zip Code Population        0
Zip Code Density           0
County                     0
Median Household Income    0
Latitude                   0
Longitude                  0
dtype: int64

In [7]:
# Preview the first rows of the frame after the imputation cells above.
df.head()

Unnamed: 0,Zip Code,Price,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,10013,3999000.0,2,3,1967,74 GRAND ST APT 3,New York,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
1,10013,3999000.0,2,3,1967,74 GRAND ST APT 3,New York,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
2,10014,1650000.0,1,1,718,140 CHARLES ST APT 4D,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601
3,10014,760000.0,3,2,1538,38 JONES ST,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601
4,10014,1100000.0,1,1,600,81 BEDFORD ST APT 3F,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601


In [8]:
# Replace every object/category column with integer codes.
#
# A separate LabelEncoder is fitted per column and kept in `encoders`
# (keyed by column name). Re-fitting one shared encoder for each column
# keeps only the last column's classes, so the codes of the other columns
# could not be mapped back to their original labels.
encoders = {}
category = df.select_dtypes(include=["object", "category"])

# `encoder` stays defined even when there is nothing to encode; after the
# loop it refers to the encoder fitted on the last encoded column.
encoder = LabelEncoder()
for x in category:
    encoder = LabelEncoder()
    df[x] = encoder.fit_transform(df[x])
    encoders[x] = encoder

df.head()

Unnamed: 0,Zip Code,Price,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,10013,3999000.0,2,3,1967,33468,150,18,29563,20967.9,57,370046.0,40.72001,-74.00472
1,10013,3999000.0,2,3,1967,33468,150,18,29563,20967.9,57,370046.0,40.72001,-74.00472
2,10014,1650000.0,1,1,718,5888,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601
3,10014,760000.0,3,2,1538,20579,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601
4,10014,1100000.0,1,1,600,35131,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601


In [9]:
# Separate the regression target (Price) from the predictor columns.
y = df["Price"]
X = df.drop(columns=["Price"])


In [10]:
# Preview the feature matrix (every column of df except Price).
X.head()

Unnamed: 0,Zip Code,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,10013,2,3,1967,33468,150,18,29563,20967.9,57,370046.0,40.72001,-74.00472
1,10013,2,3,1967,33468,150,18,29563,20967.9,57,370046.0,40.72001,-74.00472
2,10014,1,1,718,5888,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601
3,10014,3,2,1538,20579,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601
4,10014,1,1,600,35131,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601


In [11]:
# Preview the target series (the Price column).
y.head()

0    3999000.0
1    3999000.0
2    1650000.0
3     760000.0
4    1100000.0
Name: Price, dtype: float64

In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [18]:
# Linear-regression baseline: standardise the features, then fit ordinary
# least squares. Keeping the scaler inside the pipeline means it is fitted
# on the training rows passed to `fit` and then reused as-is by `predict`.
# `Pipeline` is imported with the other dependencies in the notebook's
# import cell.
Linearmodel = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("Regression", LinearRegression()),
    ]
)
Linearmodel.fit(xtrain, ytrain)

In [19]:
# Predict prices for the held-out test rows with the fitted pipeline.
# NOTE(review): the recorded output contains a negative predicted price;
# nothing in this model constrains predictions to be positive.
pred1=Linearmodel.predict(xtest)
pred1

array([ 865835.80234204,  921108.07393493,  650078.39709909, ...,
        303056.11653198,  374775.69761161, -132989.77414582])

In [20]:
# Print each evaluation metric for the linear model on the test split,
# one "<label> <value>" line per metric, in a fixed order.
for label, metric in (
    ('MAE', mean_absolute_error),
    ("Mape", mean_absolute_percentage_error),
    ("mse", mean_squared_error),
    ("R2 SCORE", r2_score),
):
    print(label, metric(ytest, pred1))

MAE 290307.7721603077
Mape 0.8306735036782602
mse 460405499743.7724
R2 SCORE 0.4489295513315734


In [21]:
# Put actual and predicted prices side by side; the resulting frame takes
# its row index from `ytest`.
outcome = {}
outcome["Actual price"] = ytest
outcome["predicted Price"] = pred1
pd.DataFrame(outcome)

Unnamed: 0,Actual price,predicted Price
33280,959000.0,865835.802342
21569,650000.0,921108.073935
33018,700000.0,650078.397099
1305,329900.0,512874.073584
30217,498750.0,694622.548816
...,...,...
23066,265000.0,19144.654505
15574,165000.0,89637.191743
18753,775000.0,303056.116532
39015,525000.0,374775.697612
