In [1]:
# Required packages
import pandas as pd
import numpy as np
import geopandas as gpd

# Imputing Data
from sklearn.impute import SimpleImputer

# Display
import progressbar

# Calgary Property Assessments


The assessed values of residential, non-residential and farm land properties in Calgary. The properties in this dataset consist of Calgary lands that have a registered parcel at Alberta’s Land Titles Office. Properties that are on The City of Calgary’s annual property assessment rolls, but excluded from this dataset, are titled parking stalls, titled storage units, machinery & equipment property, and linear property (as defined in the Municipal Government Act). This dataset can be downloaded from [here](https://data.calgary.ca/dataset/Property-Assessments/6zp6-pxei).

For detailed property assessment information, visit https://assessmentsearch.calgary.ca

In [2]:
# Load the raw property-assessment records (path is relative to the working directory)
Data = pd.read_csv('Calgary/Property_Assessments.csv')

#### Table of contents

* [Preprocessing](#Preprocessing)
    * [Assessed Value Column](#Assessed-Value-Column)
    * [Community Code and Community Name Columns](#Community-Code-and-Community-Name-Columns)
    * [Latitude, Longitude and Location Columns](#Latitude,-Longitude-and-Location-Columns)
    * [Changing the Dataframe Column Names to Title Case](#Changing-the-Dataframe-Column-Names-to-Title-Case)
    * [Adding More Details](#Adding-More-Details)
        * [Loading a geojson file](#Loading-a-geojson-file)
        * [Processing the geojson file](#Processing-the-geojson-file)
* [Processed Data](#Processed-Data)

# Preprocessing

The list of columns:

In [3]:
# Snapshot of the raw column names; Search_Columns uses this list as its default
Columns_list = list(Data.columns)
print(Columns_list)

['ROLL_YEAR', 'ROLL_NUMBER', 'ADDRESS', 'ASSESSED_VALUE', 'ASSESSMENT_CLASS', 'ASSESSMENT_CLASS_DESCRIPTION', 'RE_ASSESSED_VALUE', 'NR_ASSESSED_VALUE', 'FL_ASSESSED_VALUE', 'COMM_CODE', 'COMM_NAME', 'LATITUDE', 'LONGITUDE', 'location', 'UNIQUE_KEY', 'YEAR_OF_CONSTRUCTION']


In [4]:
def Data_info(Inp, Only_NaN = False):
    '''
    Summarise the dtype and the number of missing values of every column.

    Parameters
    ----------
    Inp : pandas.DataFrame
        The frame to inspect.
    Only_NaN : bool, default False
        If True, keep only the columns that contain at least one NaN.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp``, ordered by column name, with the columns
        'Data Type', 'Number of NaN Values' and 'Percentage' (share of rows that
        are NaN, in percent, rounded to two decimals).
    '''
    # Both Series share the same index (the column names), so they can be combined
    # directly. This avoids sorting dtype objects, which is not well defined when
    # numpy and extension dtypes (e.g. category) are mixed.
    Out = pd.DataFrame({'Data Type': Inp.dtypes,
                        'Number of NaN Values': Inp.isnull().sum()})
    try:
        Out = Out.sort_index()
    except TypeError:
        # Column labels of mixed, non-comparable types: keep the frame's column order.
        pass
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out

In [5]:
# Show only the columns that contain at least one NaN
Data_info(Data, True)

Unnamed: 0,Data Type,Number of NaN Values,Percentage
ADDRESS,object,186,0.0
ASSESSED_VALUE,float64,1860,0.03
COMM_CODE,object,2,0.0
COMM_NAME,object,71,0.0
FL_ASSESSED_VALUE,float64,7302699,99.86
LATITUDE,float64,31464,0.43
LONGITUDE,float64,31464,0.43
NR_ASSESSED_VALUE,float64,6969645,95.31
RE_ASSESSED_VALUE,float64,344734,4.71
YEAR_OF_CONSTRUCTION,float64,6792235,92.88


As can be seen, there is a considerable number of **NaN** values that we need to deal with before starting this study.

## Assessed Value Column

In [6]:
def Search_Columns(Inp, Columns = Columns_list):
    ''' Return the entries of Columns whose name contains the substring Inp '''
    return [name for name in Columns if Inp in name]

First off, let's create a DataFrame from the rows whose **Assessed Value** is **NaN**.

In [7]:
# Every column whose name contains 'VALUE'
mylist = Search_Columns('VALUE')
# Rows with a missing ASSESSED_VALUE, restricted to those value columns
Null = Data.loc[Data['ASSESSED_VALUE'].isna(), mylist]
Null.head()

Unnamed: 0,ASSESSED_VALUE,RE_ASSESSED_VALUE,NR_ASSESSED_VALUE,FL_ASSESSED_VALUE
578475,,,,
578476,,,,
651608,,,,
752796,,,,
752797,,,,


We would like to check whether there is any row in which at least one of the last three columns has a non-NaN value.

In [8]:
# Keep the rows where at least one of the remaining value columns is filled in.
# (any, not all: a single non-NaN entry is enough to make the row recoverable.)
Null[Null.iloc[:, 1:].notnull().any(axis=1)]

Unnamed: 0,ASSESSED_VALUE,RE_ASSESSED_VALUE,NR_ASSESSED_VALUE,FL_ASSESSED_VALUE


There is no such row, so we drop all the rows whose assessed value is missing.

In [9]:
# Drop every row whose ASSESSED_VALUE is NaN. With a single subset column, how='all'
# selects the same rows as the default how='any'. The index is not reset here, so
# index labels and integer positions no longer coincide afterwards.
Data.dropna(subset=['ASSESSED_VALUE'], how='all', inplace=True)

## Community Code and Community Name Columns

First off, let's look at the rows whose **COMM_NAME** is missing:

In [10]:
# Every column whose name contains 'COMM'
mylist = Search_Columns('COMM')
# Integer positions (not index labels) of the rows with a missing community name
mylist1 = np.flatnonzero(Data['COMM_NAME'].isna().values).tolist()
Data.iloc[mylist1][mylist]

Unnamed: 0,COMM_CODE,COMM_NAME
1759602,12E,
1774800,12E,
1775521,12E,
1776273,12E,
1779952,12E,
...,...,...
6038643,06B,
6359859,14U,
6365728,14U,
6368191,06B,


We can use the **COMM_CODE** column to fill in the missing **COMM_NAME** values. First, we build a lookup table with one row per community code.

In [11]:
# Lookup table with one (COMM_CODE, COMM_NAME) row per community code.
# Only rows where both fields are present are used, so a code is never paired with a
# NaN name just because its first occurrence happens to lack one. Selecting the two
# columns up front also avoids copying the whole frame.
Known_Communities = Data['COMM_CODE'].notna() & Data['COMM_NAME'].notna()
Temp = (Data.loc[Known_Communities, mylist]
            .drop_duplicates('COMM_CODE')
            .reset_index(drop=True))
Temp.head()

Unnamed: 0,COMM_CODE,COMM_NAME
0,DIS,DISCOVERY RIDGE
1,BRI,BRIDLEWOOD
2,BOW,BOWNESS
3,CPF,COPPERFIELD
4,CRA,CRANSTON


Now, we can use this lookup table to complete the **COMM_NAME** column.

In [12]:
# Map each community code to its name using the one-row-per-code lookup table.
Code_To_Name = Temp.set_index('COMM_CODE')['COMM_NAME']
# Resolve the target column by name rather than by a hard-coded position.
Name_Position = Data.columns.get_loc('COMM_NAME')
# mylist1 holds integer row positions, so the rows are addressed with iloc. Codes that
# are missing or absent from the lookup map to NaN, which leaves those names unfilled.
Missing_Codes = Data['COMM_CODE'].iloc[mylist1]
Data.iloc[mylist1, Name_Position] = Missing_Codes.map(Code_To_Name).values



Now, the previously missing names are filled in:

In [13]:
# Re-display the rows that were missing COMM_NAME (mylist1 holds their integer positions)
Data.iloc[mylist1,:][mylist].head()

Unnamed: 0,COMM_CODE,COMM_NAME
1759602,12E,12E
1774800,12E,12E
1775521,12E,12E
1776273,12E,12E
1779952,12E,12E


However, a few rows are missing both the community code and the community name:

In [14]:
# Every column whose name contains 'COMM'
mylist = Search_Columns('COMM')
# Integer positions of the rows whose community name is still missing
mylist1 = np.where(Data['COMM_NAME'].isna())[0].tolist()
Data.iloc[mylist1,:][mylist]

Unnamed: 0,COMM_CODE,COMM_NAME
4553192,,
4972776,,


We label these rows as `Other`.

In [15]:
# Resolve the two community columns by name instead of hard-coded positions, so the
# assignment stays correct if the column order of the input file changes.
Comm_Positions = [Data.columns.get_loc(col) for col in ('COMM_CODE', 'COMM_NAME')]
# mylist1 holds integer row positions, hence iloc
Data.iloc[mylist1, Comm_Positions] = 'Other'
Data.iloc[mylist1,:][mylist]

Unnamed: 0,COMM_CODE,COMM_NAME
4553192,Other,Other
4972776,Other,Other


Moreover, we convert the **COMM_NAME** entries to title case.

In [16]:
# Vectorised title-casing; unlike apply(lambda x: x.title()), the .str accessor
# passes missing values through instead of raising on them.
Data['COMM_NAME'] = Data['COMM_NAME'].str.title()

## Latitude, Longitude and Location Columns 

For these columns, note that some rows have no location information:

In [17]:
# Flag the rows that have no location
No_Location = Data['location'].isna()
# Integer positions of those rows
mylist = np.flatnonzero(No_Location.values).tolist()
# Identifying columns plus the coordinate columns for those rows
Null = Data.loc[No_Location, ['ROLL_NUMBER','ADDRESS','LATITUDE','LONGITUDE','location']]
print('The number of missing values: %i' % len(Null))
Null.head()

The number of missing values: 31457


Unnamed: 0,ROLL_NUMBER,ADDRESS,LATITUDE,LONGITUDE,location
65,54009493,3030 3 AV NE,,,
461,202745584,202 CENTRE ST SE,,,
1746,202651097,1217 CENTRE ST NW,,,
1812,202735593,363 SAGE HILL CI NW,,,
16662,16109613,1705 RANCHLANDS WY NW,,,


We can impute these missing values with the most frequent value within each community. Therefore,

In [18]:
# Fill missing coordinates with the most frequent value observed in the same community.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
mylist = Data.COMM_CODE.unique().tolist()
# Resolve the coordinate columns by name instead of hard-coded positions
Coordinate_Positions = [Data.columns.get_loc(col) for col in ('LATITUDE', 'LONGITUDE')]
# Progressbar
Counter = 0
Progress_Bar = progressbar.ProgressBar(maxval=len(mylist),
                                       widgets=[progressbar.Bar('#', '|', '|'), progressbar.Percentage()])
Progress_Bar.start()

for i in mylist:
    # updating the progress bar
    Counter+=1
    Progress_Bar.update(Counter)
    # integer positions of the rows that belong to this community
    indx = np.where(Data['COMM_CODE']==i)[0].tolist()
    for Position in Coordinate_Positions:
        Coordinate = Data.iloc[indx, Position]
        N_Missing = Coordinate.isnull().sum()
        # Nothing to do when no value is missing; nothing to learn from when all are.
        if 0 < N_Missing < len(Coordinate):
            Filled = imp.fit_transform(Coordinate.values.reshape(-1, 1))
            # fit_transform returns an (n, 1) array; flatten it for the single column
            Data.iloc[indx, Position] = Filled.ravel()
# End of the searching process
Progress_Bar.finish()
del mylist, imp, Null

|#########################################################################|100%


## Changing the Dataframe Column Names to Title Case

In [19]:
# Title-case every column name (e.g. 'COMM_CODE' -> 'Comm_Code')
Data.columns = Data.columns.map(str.title)

## Adding More Details

### Loading a geojson file

In [20]:
Community_Boundaries = gpd.read_file('Calgary/Community_Boundaries.geojson')
# Changing the headers to title case, then restoring the lower-case 'geometry' spelling
Community_Boundaries.columns = [
    header.title().replace('Geometry','geometry')
    for header in Community_Boundaries.columns.tolist()
]

In [21]:
# Preview the first four community records
Community_Boundaries.head(4)

Unnamed: 0,Comm_Structure,Name,Sector,Class_Code,Srg,Class,Comm_Code,geometry
0,INNER CITY,SUNALTA,CENTRE,1,BUILT-OUT,Residential,SNA,"POLYGON ((-114.10061 51.04799, -114.10056 51.0..."
1,BUILDING OUT,WEST SPRINGS,WEST,1,DEVELOPING,Residential,WSP,"POLYGON ((-114.18781 51.06151, -114.18781 51.0..."
2,OTHER,12A,SOUTHEAST,4,,Residual Sub Area,12A,"POLYGON ((-113.86945 50.97957, -113.86945 50.9..."
3,1950s,WINDSOR PARK,CENTRE,1,BUILT-OUT,Residential,WND,"POLYGON ((-114.08152 51.00872, -114.08137 51.0..."


### Processing the geojson file

In [22]:
# Keep only the descriptive attributes of each community
Temp = Community_Boundaries.drop(columns=['Name','geometry'])
# Title-case every attribute except the join key, which is selected by name rather
# than by assuming it is the last column.
# NOTE(review): str.title() also capitalises a letter that follows a digit
# (e.g. '2000s' becomes '2000S') — confirm this is acceptable.
for Column in Temp.columns.drop('Comm_Code'):
    Temp[Column] = Temp[Column].str.title()



In [23]:
# Attach the community attributes to every property record.
mylist = Data.Comm_Code.unique().tolist()
Temp = Temp[Temp.Comm_Code.isin(mylist)]
# NOTE(review): how='inner' drops property rows whose Comm_Code has no match in the
# boundaries table (presumably including the rows labelled 'Other') — confirm that
# this loss is intended.
# The merge result is already a new frame, so it is bound directly without a second copy.
Data = pd.merge(Data, Temp, how ='inner', on ='Comm_Code')
del Temp

# Processed Data

In [24]:
# Preview the processed frame
Data.head()

Unnamed: 0,Roll_Year,Roll_Number,Address,Assessed_Value,Assessment_Class,Assessment_Class_Description,Re_Assessed_Value,Nr_Assessed_Value,Fl_Assessed_Value,Comm_Code,...,Latitude,Longitude,Location,Unique_Key,Year_Of_Construction,Comm_Structure,Sector,Class_Code,Srg,Class
0,2019,200483568,326 DISCOVERY RIDGE WY SW,695000.0,RE,Residential,695000.0,,,DIS,...,51.020122,-114.22117,"(51.0201223055886, -114.221169735847)",2019200483568,,2000S,West,1,Built-Out,Residential
1,2019,200799187,524 10 DISCOVERY RIDGE CL SW,307500.0,RE,Residential,307500.0,,,DIS,...,51.01284,-114.21412,"(51.0128402382281, -114.214119532073)",2019200799187,,2000S,West,1,Built-Out,Residential
2,2019,200932663,707V 20 DISCOVERY RIDGE CL SW,22000.0,RE,Residential,22000.0,,,DIS,...,51.012063,-114.212059,"(51.0120633983531, -114.212059076585)",2019200932663,,2000S,West,1,Built-Out,Residential
3,2019,200483857,283 DISCOVERY RIDGE WY SW,825500.0,RE,Residential,825500.0,,,DIS,...,51.019459,-114.219202,"(51.0194594801033, -114.219201892146)",2019200483857,,2000S,West,1,Built-Out,Residential
4,2019,201017654,883V 30 DISCOVERY RIDGE CL SW,22000.0,RE,Residential,22000.0,,,DIS,...,51.012957,-114.213223,"(51.0129565521349, -114.213223227739)",2019201017654,,2000S,West,1,Built-Out,Residential


In [26]:
# Columns of the processed frame that still contain NaN values
Data_info(Data, True)

Unnamed: 0,Data Type,Number of NaN Values,Percentage
Address,object,186,0.0
Comm_Structure,object,115945,1.59
Fl_Assessed_Value,float64,7285658,99.89
Location,object,31367,0.43
Nr_Assessed_Value,float64,6953543,95.34
Re_Assessed_Value,float64,338018,4.63
Year_Of_Construction,float64,6772800,92.86


### Saving

In [27]:
# Write the processed frame to CSV without the row index
Data.to_csv('Calgary/Property_Assessments_Clean.csv',index=False)

***