### Importing Dependencies

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

In [4]:
from sklearn.datasets import fetch_california_housing

In [6]:
# Load the California Housing dataset; the cells below read its
# .DESCR, .data, .feature_names and .target attributes.
data = fetch_california_housing()

In [8]:
# Print the description text bundled with the dataset (attributes, target, source).
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [30]:
# Feature matrix (independent variables): one named column per predictive attribute
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [20]:
# Sanity check: (rows, columns) of the feature frame — the dataset description
# lists 20640 instances and 8 predictive attributes.
df.shape

(20640, 8)

In [31]:
# Dependent variable: per the dataset description, the median house value of
# each district, expressed in units of $100,000. Added as the last column.
df['Target'] = data.target

In [32]:
# Re-display the first rows to confirm the Target column is now present.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


### Exploratory Data Analysis (EDA)

In [33]:
!pip install sweetviz

Defaulting to user installation because normal site-packages is not writeable
Collecting sweetviz
  Obtaining dependency information for sweetviz from https://files.pythonhosted.org/packages/89/50/8d3f7ca820267a38b42c64778db0f186d85cec6e3f0c5210eab9c16e783f/sweetviz-2.3.1-py3-none-any.whl.metadata
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Collecting importlib-resources>=1.2.0 (from sweetviz)
  Obtaining dependency information for importlib-resources>=1.2.0 from https://files.pythonhosted.org/packages/93/e8/facde510585869b5ec694e8e0363ffe4eba067cb357a8398a55f6a1f8023/importlib_resources-6.1.1-py3-none-any.whl.metadata
  Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
   ---------------------------------------- 0.0/15.1 MB ? eta -:--:--
   ---------------------------------------- 0.2/15.1 MB 5.0 MB/s eta 0:00:04
   - -------------------------------------- 0.5/15.1 MB 7.1 MB/s eta 0:00:03
 


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
import sweetviz as sv

In [35]:
# Build a sweetviz profile of every column in df (features plus Target) ...
report = sv.analyze(df)
# ... and save it as report.html (relative path, as given).
report.show_html('report.html')

                                             |                                             | [  0%]   00:00 ->…

Report report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### Data Preprocessing

In [36]:
!pip install geopy

Defaulting to user installation because normal site-packages is not writeable
Collecting geopy
  Obtaining dependency information for geopy from https://files.pythonhosted.org/packages/e5/15/cf2a69ade4b194aa524ac75112d5caac37414b20a3a03e6865dfe0bd1539/geopy-2.4.1-py3-none-any.whl.metadata
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
     ---------------------------------------- 40.3/40.3 kB 1.9 MB/s eta 0:00:00
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   ---------------------------------------- 125.4/125.4 kB 7.7 MB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [48]:
from geopy.geocoders import Nominatim

In [65]:
# Nominatim client used for every reverse-geocoding call in this notebook
geolocator = Nominatim(user_agent='joshua2009474@gmail.com')

# Smoke test on the coordinates of the first row of df.
# .raw is a dictionary; its 'address' entry holds the address components.
geolocator.reverse("37.88 , -122.23").raw['address']

{'leisure': 'Ecological Study Area',
 'road': 'Vollmer Peak Trail',
 'county': 'Contra Costa County',
 'state': 'California',
 'ISO3166-2-lvl4': 'US-CA',
 'postcode': '94563',
 'country': 'United States',
 'country_code': 'us'}

In [69]:
def location(cord, geocoder=None, store=None):
    """Reverse-geocode one coordinate pair and record its county and road.

    Parameters
    ----------
    cord : sequence
        Indexable pair: ``cord[0]`` is the latitude and ``cord[1]`` the
        longitude. Each is converted with ``str`` and sent as ``"lat,long"``.
    geocoder : object, optional
        Any object exposing ``reverse(query)``. Defaults to the notebook-level
        ``geolocator``.
    store : dict, optional
        Mapping holding lists under the keys ``'County'`` and ``'Road'``.
        Defaults to the notebook-level ``loc_update``.

    Returns
    -------
    None
        The result is appended to ``store``. Exactly one value is appended to
        each list per call (``None`` when the county/road is unknown or when
        no match is found), so both lists stay aligned with the input rows.

    Notes
    -----
    Exceptions raised by ``geocoder.reverse`` are not caught and propagate to
    the caller.
    """
    # Fall back to the notebook globals so existing calls `location(cord)` still work.
    if geocoder is None:
        geocoder = geolocator
    if store is None:
        store = loc_update

    query = f"{cord[0]!s},{cord[1]!s}"
    match = geocoder.reverse(query)

    # reverse() may return None when nothing is found for the coordinates;
    # treat that as an empty address instead of failing on `.raw`.
    address = match.raw.get('address', {}) if match is not None else {}

    # dict.get already yields None for a missing key.
    store['County'].append(address.get('county'))
    store['Road'].append(address.get('road'))

In [71]:
# Disabled driver for location(): every line is commented out, so running this
# cell does nothing.
# NOTE(review): presumably disabled because it would send one reverse-geocoding
# request per row of df (20640 rows) — confirm before re-enabling.
# If re-enabled it would:
#   - create loc_update, the dict that location() appends to;
#   - pass columns 6:-1 of each row (Latitude, Longitude while Target is the
#     last column) to location();
#   - rewrite loc_update.pickle after every row and print i every 100 rows.
# NOTE(review): the file handle from open(...) is never closed; use a `with`
# block if this loop is revived.

# import pickle

# loc_update = {'County':[],
#               'Road':[]}

# for i, cord in enumerate(df.iloc[:,6:-1].values):
#     location(cord)
#     pickle.dump(loc_update, open('loc_update.pickle', 'wb'))

#     if i%100 == 0:
#         print(i)