## Loading dataset

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score , GridSearchCV,ShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
import warnings 
# NOTE(review): this silences every warning category for the rest of the session,
# which can hide deprecation or data-handling warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("./housing.csv")

## Preprocessing

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# df2 =df.drop(['longitude','latitude','median_income','median_income','population','households'],axis='columns')
# df2.head()

In [7]:
# Work on an independent copy so that the column assignments performed on df2
# further down do not also modify the raw frame held in `df`.
df2 = df.copy()

In [8]:
df2.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [9]:
# Fill the missing total_bedrooms entries (207 according to the null count above)
# with the column median.
# NOTE(review): the imputer is fit on the whole dataset before any cross-validation
# split — confirm that this is acceptable for the evaluation below.
imputer = SimpleImputer(strategy='median')
df2['total_bedrooms'] = imputer.fit_transform(df2[['total_bedrooms']])

In [10]:
df2.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [11]:
df2[df2['ocean_proximity'].str.contains('<')].count()

longitude             9136
latitude              9136
housing_median_age    9136
total_rooms           9136
total_bedrooms        9136
population            9136
households            9136
median_income         9136
median_house_value    9136
ocean_proximity       9136
dtype: int64

In [12]:
# Remove the '<' character from the category labels (9136 rows contain it per the
# count above); afterwards those rows show up as '1H OCEAN' in the sampled output.
df2['ocean_proximity'] = df2['ocean_proximity'].str.replace('<','')

In [13]:
df2[df2['ocean_proximity'].str.contains('<')].count()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [14]:
df2.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
7533,-118.22,33.91,31.0,571.0,153.0,841.0,158.0,2.6154,89200.0,1H OCEAN
8908,-118.47,34.03,31.0,2642.0,681.0,1303.0,625.0,3.5987,340500.0,1H OCEAN
18538,-122.01,36.98,27.0,2820.0,730.0,1511.0,745.0,2.589,242400.0,NEAR OCEAN
17205,-119.72,34.43,30.0,2491.0,656.0,1091.0,576.0,2.5139,279500.0,1H OCEAN
3379,-118.27,34.25,37.0,2489.0,454.0,1215.0,431.0,5.0234,257600.0,1H OCEAN


In [15]:
dummies = pd.get_dummies(df2['ocean_proximity'])
dummies.head()

Unnamed: 0,1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,False,False,False,True,False
1,False,False,False,True,False
2,False,False,False,True,False
3,False,False,False,True,False
4,False,False,False,True,False


In [16]:
merged = pd.concat([df2,dummies],axis='columns')
merged

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,False,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,False,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,False,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,False,True,False,False,False


In [17]:
# Drop the raw string column now that it has been one-hot encoded, together with
# one of the dummy columns ('NEAR OCEAN').
# Presumably the dummy is dropped to avoid perfectly collinear indicators — confirm.
final = merged.drop(['ocean_proximity','NEAR OCEAN'],axis='columns')
final.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,1H OCEAN,INLAND,ISLAND,NEAR BAY
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True


In [18]:
final.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,1H OCEAN,INLAND,ISLAND,NEAR BAY
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True


## Removing Outliers

In [19]:
upper_limit = final.total_rooms.mean() + 3 * final.total_rooms.std()
upper_limit

9180.60883614371

In [20]:
lower_limit = final.total_rooms.mean() - 3 * final.total_rooms.std()
lower_limit

-3909.0826733530125

In [21]:
final[final['total_rooms'] > upper_limit ].count()

longitude             373
latitude              373
housing_median_age    373
total_rooms           373
total_bedrooms        373
population            373
households            373
median_income         373
median_house_value    373
1H OCEAN              373
INLAND                373
ISLAND                373
NEAR BAY              373
dtype: int64

In [22]:
final[final['total_rooms'] < lower_limit].count()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
1H OCEAN              0
INLAND                0
ISLAND                0
NEAR BAY              0
dtype: int64

In [23]:
final.shape

(20640, 13)

In [24]:
def remove_outliers(df, column_name, n_std=3):
    """Return the rows of ``df`` whose ``column_name`` value is not an outlier.

    A value counts as an outlier when it lies more than ``n_std`` sample
    standard deviations above or below the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Name of the numeric column used to decide which rows to keep.
    n_std : int or float, default 3
        Half-width of the accepted band, in standard deviations. The default
        reproduces the original fixed 3-sigma rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the band, with their original index labels.
    """
    values = df[column_name]
    # Compute the statistics once and reuse them for both limits.
    center = values.mean()
    spread = values.std()

    upper_limit = center + n_std * spread
    lower_limit = center - n_std * spread

    # Strict comparisons: values exactly on a limit are kept. NaN compares False
    # on both sides, so rows with a missing value are kept as well.
    outliers = (values > upper_limit) | (values < lower_limit)
    df_cleaned = df[~outliers]

    return df_cleaned

In [25]:
df5 = remove_outliers(final,'total_rooms')

In [26]:
df5.shape

(20267, 13)

In [27]:
df6 = remove_outliers(df5,'population')
df6.shape

(19949, 13)

In [28]:
df7 = remove_outliers(df6,'households')
df7.shape

(19655, 13)

In [29]:
# NOTE(review): this filters df5 rather than df7, so the population and households
# outlier removal held in df6/df7 is not carried into df8 or anything derived from it
# (the shape below still equals df5's). Confirm whether df7 was intended here.
df8 = remove_outliers(df5,'median_house_value')
df8.shape

(20267, 13)

In [30]:
df8.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20267 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20267 non-null  float64
 1   latitude            20267 non-null  float64
 2   housing_median_age  20267 non-null  float64
 3   total_rooms         20267 non-null  float64
 4   total_bedrooms      20267 non-null  float64
 5   population          20267 non-null  float64
 6   households          20267 non-null  float64
 7   median_income       20267 non-null  float64
 8   median_house_value  20267 non-null  float64
 9   1H OCEAN            20267 non-null  bool   
 10  INLAND              20267 non-null  bool   
 11  ISLAND              20267 non-null  bool   
 12  NEAR BAY            20267 non-null  bool   
dtypes: bool(4), float64(9)
memory usage: 1.6 MB


In [31]:
df9 = remove_outliers(df8,'total_bedrooms')

In [32]:
df9.shape

(19899, 13)

In [33]:
df9.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,1H OCEAN,INLAND,ISLAND,NEAR BAY
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True


In [34]:
df10 = remove_outliers(df9,'median_income')
df10.shape

(19565, 13)

In [35]:
# NOTE(review): this saves df9, which does not include the median_income filtering
# applied to produce df10 (the frame used for modelling) — confirm which one should be written.
# NOTE(review): absolute, machine-specific path; not portable to other environments.
df9.to_csv('D:/California Housing price dataset/modified2.csv')

In [36]:
x = df10.drop(['median_house_value'],axis='columns')
y = df10.median_house_value

In [37]:
x.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,1H OCEAN,INLAND,ISLAND,NEAR BAY
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,False,False,False,True
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,False,False,False,True
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,False,False,False,True
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,False,False,False,True
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,False,False,False,True


In [38]:
x.shape

(19565, 12)

## Standardization

In [39]:
scaler = StandardScaler()

In [40]:
x_scale = scaler.fit_transform(x)

## Dimensionality Reduction

In [41]:
# A float n_components between 0 and 1 asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 99.6%).
# NOTE(review): PCA is fit on the full scaled dataset before cross-validation —
# confirm this is acceptable, or move scaling/PCA into a Pipeline evaluated per split.
pca = PCA(0.996)
x_pca = pca.fit_transform(x_scale)

In [42]:
x_pca.shape

(19565, 11)

In [43]:
from sklearn.linear_model import Ridge

In [44]:
def find_best_model_with_gridsearchcv(l,m):
    """Grid-search each candidate regressor and tabulate its best result.

    Every candidate is tuned with ``GridSearchCV`` over its own parameter grid,
    using the same 5-split ``ShuffleSplit`` (20% held out, ``random_state=0``).

    Parameters
    ----------
    l : array-like
        Feature matrix, passed to ``GridSearchCV.fit`` as ``X``.
    m : array-like
        Target values, passed to ``GridSearchCV.fit`` as ``y``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the columns ``model``, ``best_score`` and
        ``best_params``, taken from the fitted search object.
    """
    # (label, estimator, parameter grid) for every model under comparison.
    candidates = [
        (
            'linear_regression',
            LinearRegression(),
            {
                'fit_intercept': [True, False],
                'copy_X': [True, False],
            },
        ),
        (
            'Knn',
            KNeighborsRegressor(),
            {
                'n_neighbors': [1, 3, 5, 8, 11, 13, 15],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto'],
            },
        ),
    ]

    # One shared splitter so every candidate is scored on identical splits.
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    rows = []
    for label, estimator, grid in candidates:
        search = GridSearchCV(estimator, grid, cv=splitter, return_train_score=False)
        search.fit(l, m)
        rows.append({
            'model': label,
            'best_score': search.best_score_,
            'best_params': search.best_params_,
        })

    return pd.DataFrame(rows, columns=['model', 'best_score', 'best_params'])

In [45]:
find_best_model_with_gridsearchcv(x_pca,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.631474,"{'copy_X': True, 'fit_intercept': True}"
1,Knn,0.712655,"{'algorithm': 'auto', 'n_neighbors': 13, 'weig..."
