# Data Exploration 

In [1]:
# Notebook configuration: file locations, relative to the notebook's directory.
# DATA_PATH is the raw CSV loaded into df_raw; EXPORT_PATH is where the
# explored frame is written by the final cell.
DATA_PATH = "../data/raw/V3.csv"
EXPORT_PATH = "../data/processed/V3Explored.csv"

In [2]:
import pandas as pd 

# Read Data

In [3]:
# Load the raw CSV from disk, then show (rows, columns) as a quick sanity check.
df_raw = pd.read_csv(filepath_or_buffer=DATA_PATH)
df_raw.shape

(10000, 12)

In [4]:
# Explore on an independent (deep) copy so df_raw stays exactly as loaded.
df = df_raw.copy(deep=True)

In [5]:
# Peek at three random rows. The seed is fixed so the displayed rows are the
# same after every Restart Kernel -> Run All instead of changing per run.
df.sample(n=3, random_state=42)

Unnamed: 0,Date,Year,Locality,Estimated Value,Sale Price,Property,Residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,Face
7046,2019-06-16,2019,Fairfield,775670.0,1250000.0,?,Detached House,3,2,1024.0,1.025899,West
7139,2019-07-17,2019,Bridgeport,182500.0,329000.0,Single Family,Detached House,3,2,980.0,1.025899,South
2609,2012-12-03,2012,Norwalk,310100.0,359000.0,Single Family,Detached House,3,2,,1.021958,West


In [6]:
# Show one random record vertically (as a Series) so every field is readable.
# The seed is fixed so the displayed record is stable across re-runs.
df.sample(n=1, random_state=7).iloc[0]

Date                     2022-03-06
Year                           2022
Locality                  Waterbury
Estimated Value                 NaN
Sale Price                 160000.0
Property              Single Family
Residential          Detached House
num_rooms                         3
num_bathrooms                     2
carpet_area                   934.0
property_tax_rate          1.003979
Face                           East
Name: 9324, dtype: object

In [7]:
# Column dtypes and non-null counts: the quickest way to spot which columns
# contain missing values and which are still stored as generic objects.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               10000 non-null  object 
 1   Year               10000 non-null  int64  
 2   Locality           8745 non-null   object 
 3   Estimated Value    8771 non-null   float64
 4   Sale Price         10000 non-null  float64
 5   Property           10000 non-null  object 
 6   Residential        10000 non-null  object 
 7   num_rooms          10000 non-null  int64  
 8   num_bathrooms      10000 non-null  int64  
 9   carpet_area        8718 non-null   float64
 10  property_tax_rate  10000 non-null  float64
 11  Face               10000 non-null  object 
dtypes: float64(4), int64(3), object(5)
memory usage: 937.6+ KB


In [8]:
# Normalise every column label to lower case (spaces inside labels are kept).
df.columns = df.columns.map(str.lower)

In [9]:
# Display the column labels to confirm the lower-casing took effect.
df.columns

Index(['date', 'year', 'locality', 'estimated value', 'sale price', 'property',
       'residential', 'num_rooms', 'num_bathrooms', 'carpet_area',
       'property_tax_rate', 'face'],
      dtype='object')

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,year,estimated value,sale price,num_rooms,num_bathrooms,carpet_area,property_tax_rate
count,10000.0,8771.0,10000.0,10000.0,10000.0,8718.0,10000.0
mean,2015.8801,444584.2,622474.8,3.3296,2.334,1111.658981,1.143517
std,4.088031,751356.5,1029709.0,0.885123,1.209293,305.058455,0.166762
min,2009.0,0.0,2000.0,3.0,1.0,900.0,1.003979
25%,2012.0,130525.0,160000.0,3.0,1.0,960.0,1.023495
50%,2016.0,243180.0,340000.0,3.0,2.0,1021.0,1.025899
75%,2019.0,441830.0,630000.0,3.0,3.0,1082.0,1.348259
max,2022.0,21119910.0,25750000.0,8.0,8.0,2989.0,1.422308


In [11]:
# Summary of the object (string) columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include="object")

Unnamed: 0,date,locality,property,residential,face
count,10000,8745,10000,10000,10000
unique,3089,7,5,4,4
top,2021-07-02,Bridgeport,Single Family,Detached House,North
freq,22,1687,6797,8296,2535


In [12]:
# Combined summary of every column; statistics that do not apply to a
# column's dtype are shown as NaN.
df.describe(include="all")

Unnamed: 0,date,year,locality,estimated value,sale price,property,residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,face
count,10000,10000.0,8745,8771.0,10000.0,10000,10000,10000.0,10000.0,8718.0,10000.0,10000
unique,3089,,7,,,5,4,,,,,4
top,2021-07-02,,Bridgeport,,,Single Family,Detached House,,,,,North
freq,22,,1687,,,6797,8296,,,,,2535
mean,,2015.8801,,444584.2,622474.8,,,3.3296,2.334,1111.658981,1.143517,
std,,4.088031,,751356.5,1029709.0,,,0.885123,1.209293,305.058455,0.166762,
min,,2009.0,,0.0,2000.0,,,3.0,1.0,900.0,1.003979,
25%,,2012.0,,130525.0,160000.0,,,3.0,1.0,960.0,1.023495,
50%,,2016.0,,243180.0,340000.0,,,3.0,2.0,1021.0,1.025899,
75%,,2019.0,,441830.0,630000.0,,,3.0,3.0,1082.0,1.348259,


In [13]:
# Categorical columns whose distinct values are worth eyeballing for
# placeholders (e.g. "?") or missing entries.
questionable_cat_cols = ["locality", "property", "residential", "face"]

for column in questionable_cat_cols:
    distinct_values = df[column].unique().tolist()
    # One print call emits the same four lines per column: the column name,
    # its distinct values, a separator, and a blank line.
    print(column, distinct_values, "-------------------------", "", sep="\n")
    

locality
['Waterbury', nan, 'Norwalk', 'Bridgeport', 'Greenwich', 'Fairfield', 'West Hartford', 'Stamford']
-------------------------

property
['Single Family', '?', 'Two Family', 'Three Family', 'Four Family']
-------------------------

residential
['Detached House', 'Duplex', 'Triplex', 'Fourplex']
-------------------------

face
['South', 'North', 'East', 'West']
-------------------------



In [14]:
# Columns to check for missing values (the date column plus the numeric ones).
questionable_num_cols = [
    'date',
    'year',
    'estimated value',
    'sale price',
    'num_rooms',
    'num_bathrooms',
    'carpet_area',
    'property_tax_rate',
]

# Boolean flag per column, in list order: True where at least one value is NaN.
has_missing = df[questionable_num_cols].isna().any()
for col in has_missing[has_missing].index:
    print(col, ": have missing")

estimated value : have missing
carpet_area : have missing


# Export Data 

In [15]:
# EXPORT_PATH ends in ".csv", so write real CSV text rather than a binary
# pickle stored behind a misleading extension. index=False omits the default
# RangeIndex, which carries no information.
# NOTE(review): any downstream reader that loaded this path with
# pd.read_pickle must switch to pd.read_csv.
df.to_csv(EXPORT_PATH, index=False)