## Real Estate Dataset - Data Cleaning and Preprocessing 

This notebook focuses on data preparation, cleaning, and preprocessing of the Real Estate dataset.

In [2]:
import pandas as pd 
import numpy as np 
import os 

## Define and Create Paths

In [3]:
# Get working directory (assumes the notebook is run from a sub-folder of the project root)
current_dir = os.getcwd()

# Go one directory up to the root directory
project_root_dir = os.path.dirname(current_dir)

# Data folders: raw inputs and processed outputs
data_dir = os.path.join(project_root_dir, 'data')
raw_dir = os.path.join(data_dir, 'raw')
processed_dir = os.path.join(data_dir, 'processed')

# Define path to results folder
results_dir = os.path.join(project_root_dir, 'results')

# Define path to docs folder
docs_dir = os.path.join(project_root_dir, 'docs')

# Create every project directory if it does not exist yet.
# docs_dir is included here: it was defined above but previously never created.
for directory in (data_dir, raw_dir, processed_dir, results_dir, docs_dir):
    os.makedirs(directory, exist_ok=True)

 ## Read in the data

In [9]:
# Load the raw sales CSV (header row included): '?' entries are read as
# missing values and whitespace following each delimiter is skipped.
real_estate_filename = os.path.join(raw_dir, 'Real_Estate_Sales_2016-2019_GL.csv')
real_estate = pd.read_csv(
    real_estate_filename,
    na_values='?',
    skipinitialspace=True,
    low_memory=False,
)
# Preview the first ten rows
real_estate.head(10)

Unnamed: 0,Serial Number,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type
0,160091,2016,12/23/2016,Avon,2 EDGEWOOD,143390,224000.0,0.640134,Condo,Condo
1,160172,2016,4/13/2017,Bethel,66 H NASHVILLE ROAD,80500,130000.0,0.619231,Condo,Condo
2,160258,2016,6/29/2017,Bethel,1 EAGLE ROCK HILL,117180,200000.0,0.5859,Condo,Condo
3,16233,2016,6/9/2017,Darien,54 KENSETT LANE,887600,1495000.0,0.593712,Condo,Condo
4,1600249,2016,5/12/2017,Guilford,66-10 HIGH ST,409250,665000.0,0.615414,Condo,Condo
5,160103,2016,11/30/2016,Branford,137 PEDDLARS DR,106700,155000.0,0.688387,Condo,Condo
6,167930,2016,9/27/2017,Bridgeport,95 LANCE CIR,105820,148000.0,0.715,Condo,Condo
7,16533,2016,8/3/2017,Newington,239 STERLING DR,238150,331100.0,0.719269,Condo,Condo
8,16005,2016,10/11/2016,Durham,83 STAGECOACH RD,114590,160000.0,0.716187,Condo,Condo
9,160239,2016,6/9/2017,New Canaan,29 MAPLE ST #3,388640,436000.0,0.891376,Condo,Condo


In [6]:
# Dimensions of the loaded frame as (rows, columns)
real_estate.shape

(183278, 10)

In [7]:
# Per-column dtype and non-null count, plus overall memory usage
real_estate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183278 entries, 0 to 183277
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       183278 non-null  object
 1   1       183278 non-null  object
 2   2       183278 non-null  object
 3   3       183278 non-null  object
 4   4       183275 non-null  object
 5   5       183278 non-null  object
 6   6       183278 non-null  object
 7   7       183278 non-null  object
 8   8       183278 non-null  object
 9   9       183278 non-null  object
dtypes: object(10)
memory usage: 14.0+ MB


In [10]:
# Column labels as parsed from the CSV header
real_estate.columns

Index(['Serial Number', 'List Year', 'Date Recorded', 'Town', 'Address',
       'Assessed Value', 'Sale Amount', 'Sales Ratio', 'Property Type',
       'Residential Type'],
      dtype='object')

## Data Cleaning


## Understanding the Dataset
This dataset contains records of residential property sales, including details such as location, assessed value, sale amount, and property type. The data can be used to analyze property values, sale prices, and sales ratios across different towns and residential property types.
- Serial Number: Represents the unique number for each recorded property
- List Year: Represents the year under which the sale is listed; the dataset covers 2016-2019 (four list years: 2016, 2017, 2018 and 2019)
- Date Recorded: Represents the actual date on which the transaction took place
- Town: Represents a specific town in the USA
- Address: Uniquely identifies the property location
- Assessed Value: The value estimated for tax or official purposes
- Sale Amount: The actual price the property was sold for
- Sales Ratio: The ratio of assessed value to sale amount
- Property Type: Indicates the property is residential
- Residential Type: The type of residential property (e.g. single-family, condo)

In [17]:
# Scratch check of a single column, kept for reference (disabled):
# np.unique(real_estate.Town.to_list())

In [16]:
# Show the distinct values of every column as a quick sanity check.
# Series.unique() keeps each column's own dtype and reports missing values as
# NaN (going through a Python list and np.unique turns NaN in a text column
# into the string 'nan'). Sorting first (NaN is placed last) keeps the listing
# ordered, because unique() preserves order of appearance.
for col in real_estate.columns:
    uniques = real_estate[col].sort_values().unique()
    print(f"Unique values in '{col}': {uniques}\n")


Unique values in 'Serial Number': [       161        162        163 ...  190700026  192000007 1710011174]

Unique values in 'List Year': [2016 2017 2018 2019]

Unique values in 'Date Recorded': ['1/1/2018' '1/1/2019' '1/10/2017' ... '9/9/2018' '9/9/2019' '9/9/2020']

Unique values in 'Town': ['Andover' 'Ansonia' 'Ashford' 'Avon' 'Barkhamsted' 'Beacon Falls'
 'Berlin' 'Bethany' 'Bethel' 'Bethlehem' 'Bloomfield' 'Bolton' 'Bozrah'
 'Branford' 'Bridgeport' 'Bridgewater' 'Bristol' 'Brookfield' 'Brooklyn'
 'Burlington' 'Canaan' 'Canterbury' 'Canton' 'Chaplin' 'Cheshire'
 'Chester' 'Clinton' 'Colchester' 'Colebrook' 'Columbia' 'Cornwall'
 'Coventry' 'Cromwell' 'Danbury' 'Darien' 'Deep River' 'Derby' 'Durham'
 'East Granby' 'East Haddam' 'East Hampton' 'East Hartford' 'East Haven'
 'East Lyme' 'East Windsor' 'Eastford' 'Easton' 'Ellington' 'Enfield'
 'Essex' 'Fairfield' 'Farmington' 'Franklin' 'Glastonbury' 'Goshen'
 'Granby' 'Greenwich' 'Griswold' 'Groton' 'Guilford' 'Haddam' 'Hamden'
 'Hampt

### 2. Dealing with Missing Values

In [18]:
# Count missing values per column
real_estate.isnull().sum()

Serial Number       0
List Year           0
Date Recorded       0
Town                0
Address             3
Assessed Value      0
Sale Amount         0
Sales Ratio         0
Property Type       0
Residential Type    0
dtype: int64

In [20]:
# Give rows with a missing address an explicit 'unknown' placeholder
# instead of dropping them; all other columns are left untouched.
real_estate = real_estate.fillna({'Address': 'unknown'})

In [21]:
# Re-count missing values per column to verify the fill
real_estate.isnull().sum()

Serial Number       0
List Year           0
Date Recorded       0
Town                0
Address             0
Assessed Value      0
Sale Amount         0
Sales Ratio         0
Property Type       0
Residential Type    0
dtype: int64

### 3. Dealing with Duplicates

In [22]:
# Number of rows that are exact duplicates (across all columns) of an earlier row
real_estate.duplicated().sum()

0

In [33]:
# The CSV column is called 'Residential Type'; rename it here, before first
# use, so the notebook runs top-to-bottom on a fresh kernel. rename() ignores
# labels that are not present, so re-running this cell is harmless.
real_estate = real_estate.rename(columns={'Residential Type': 'Residential Category'})

# Distinct residential categories before recoding
real_estate['Residential Category'].unique()

array(['Condo', 'Single Family', 'Two Family', 'Three Family',
       'Four Family'], dtype=object)

In [34]:
# Recode the five residential categories to new labels.
# NOTE(review): 'appartments' is misspelled, but the value is written to the
# exported data, so it is deliberately left unchanged here.
residential_labels = {
    'Condo': 'appartments',
    'Single Family': 'single-residence',
    'Two Family': 'duplex-residence',
    'Three Family': 'triplex-residence',
    'Four Family': 'quadplex-residence',
}
real_estate['Residential Category'] = real_estate['Residential Category'].replace(residential_labels)

In [35]:
# Distinct values currently present in the 'Residential Category' column
real_estate['Residential Category'].unique()


array(['appartments', 'single-residence', 'duplex-residence',
       'triplex-residence', 'quadplex-residence'], dtype=object)

In [26]:
# Rename 'Residential Type' to 'Residential Category'. With rename()'s default
# errors='ignore', this leaves the frame unchanged if the column was already renamed.
real_estate = real_estate.rename(columns={'Residential Type': 'Residential Category'})


In [36]:
# Final look at the category labels before the frame is exported
real_estate['Residential Category'].unique()

array(['appartments', 'single-residence', 'duplex-residence',
       'triplex-residence', 'quadplex-residence'], dtype=object)

In [37]:
# Write the cleaned frame to the processed-data folder; the row index is
# not written to the file.
final_file = os.path.join(processed_dir, 'real_estate_cleaned.csv')
real_estate.to_csv(path_or_buf=final_file, index=False)