In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
df = pd.read_csv("USA Housing Dataset copy.csv")

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4140 entries, 0 to 4139
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4140 non-null   object 
 1   price          4140 non-null   float64
 2   bedrooms       4140 non-null   float64
 3   bathrooms      4140 non-null   float64
 4   sqft_living    4140 non-null   int64  
 5   sqft_lot       4140 non-null   int64  
 6   floors         4140 non-null   float64
 7   waterfront     4140 non-null   int64  
 8   view           4140 non-null   int64  
 9   condition      4140 non-null   int64  
 10  sqft_above     4140 non-null   int64  
 11  sqft_basement  4140 non-null   int64  
 12  yr_built       4140 non-null   int64  
 13  yr_renovated   4140 non-null   int64  
 14  street         4140 non-null   object 
 15  city           4140 non-null   object 
 16  statezip       4140 non-null   object 
 17  country        4140 non-null   object 
dtypes: float

In [38]:
df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0,4140.0
mean,553062.9,3.400483,2.163043,2143.638889,14697.64,1.51413,0.007488,0.246618,3.452415,1831.351449,312.28744,1970.81401,808.368357
std,583686.5,0.903939,0.784733,957.481621,35876.84,0.534941,0.086219,0.790619,0.678533,861.382947,464.349222,29.807941,979.380535
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,320000.0,3.0,1.75,1470.0,5000.0,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,460000.0,3.0,2.25,1980.0,7676.0,1.5,0.0,0.0,3.0,1600.0,0.0,1976.0,0.0
75%,659125.0,4.0,2.5,2620.0,11000.0,2.0,0.0,0.0,4.0,2310.0,602.5,1997.0,1999.0
max,26590000.0,8.0,6.75,10040.0,1074218.0,3.5,1.0,4.0,5.0,8020.0,4820.0,2014.0,2014.0


<h2>DIVIDING BY MAXIMUM</h2>In this we will take the elements of a particular column and then divide it by the max value in that column. This way, our data will range between 0 to 1. 

In [39]:
sqft_living = df["sqft_living"].astype(float)
sqft_living.describe()

count     4140.000000
mean      2143.638889
std        957.481621
min        370.000000
25%       1470.000000
50%       1980.000000
75%       2620.000000
max      10040.000000
Name: sqft_living, dtype: float64

In [40]:
minimum = min(sqft_living)
maximum = max(sqft_living)
print(f"Minimum value is {minimum} and maximum value is {maximum}")

Minimum value is 370.0 and maximum value is 10040.0


In [41]:
#Range of our data:
lower = minimum/maximum
upper = maximum/maximum
print(f"Our data ranges from {lower} to {upper}")

Our data ranges from 0.036852589641434265 to 1.0


In [43]:
sqft_living = sqft_living/sqft_living.max()

In [44]:
sqft_living.describe()

count    4140.000000
mean        0.213510
std         0.095367
min         0.036853
25%         0.146414
50%         0.197211
75%         0.260956
max         1.000000
Name: sqft_living, dtype: float64

<h2>MEAN NORMALIZATION</h2> We use µ in this technique. x1 = x1-µ/max-min

In [45]:
sqft_lot = df['sqft_lot'].astype(float)

In [46]:
sqft_lot.describe()

count    4.140000e+03
mean     1.469764e+04
std      3.587684e+04
min      6.380000e+02
25%      5.000000e+03
50%      7.676000e+03
75%      1.100000e+04
max      1.074218e+06
Name: sqft_lot, dtype: float64

In [51]:
mu = sqft_lot.mean()
mu

np.float64(14697.638164251208)

In [53]:
for i in range(len(sqft_lot)):
    sqft_lot[i] = (sqft_lot[i] - mu)/(sqft_lot.max() - sqft_lot.min())

In [54]:
sqft_lot.describe()

count    4140.000000
mean       -0.006375
std         0.101583
min        -1.246146
25%        -0.019845
50%        -0.012334
75%        -0.005973
max         0.986318
Name: sqft_lot, dtype: float64

<h2>Z-SCORE NORMALIZATION</h2> Over here we use standard deviation. x1 = x1-µ1/σ

In [55]:
sqft_above = df['sqft_above'].astype(float)

In [57]:
sqft_above.describe()

count    4140.000000
mean     1831.351449
std       861.382947
min       370.000000
25%      1190.000000
50%      1600.000000
75%      2310.000000
max      8020.000000
Name: sqft_above, dtype: float64

In [60]:
sigma = sqft_above.std()
mu = sqft_above.mean()
print(f"standard deviation:{sigma}, mean:{mu}")

standard deviation:861.3829469017647, mean:1831.3514492753623


In [61]:
for i in range(len(sqft_above)):
    sqft_above[i] = (sqft_above[i]-mu)/sigma

In [62]:
sqft_above.describe()

count    4.140000e+03
mean     7.894919e-17
std      1.000000e+00
min     -1.696518e+00
25%     -7.445602e-01
50%     -2.685814e-01
75%      5.556745e-01
max      7.184550e+00
Name: sqft_above, dtype: float64