# Data Preprocessing

## Setup

In [49]:
import pandas as pd
import numpy as np

# Input CSVs and the output CSV written by the final cell, relative to the notebook.
# NOTE: `path_preproccesed` is misspelled, but the export cell refers to it by this
# exact name, so the spelling is kept.
path_properties, path_customers = '../data/properties.csv', '../data/customers.csv'
path_preproccesed = '../data/preprocessed.csv'

---

## Properties

In [2]:
# Read the raw property listings (comma is read_csv's default separator)
# and preview the first five rows.
raw_properties_df = pd.read_csv(path_properties)
raw_properties_df.head(5)

Unnamed: 0,index,property_id,building,date_sale,type,property#,area,price,status,customerid
0,0,1030,1,11/1/2005,Apartment,30,743.09,"$246,172.68",Sold,C0028
1,1,1029,1,10/1/2005,Apartment,29,756.21,"$246,331.90",Sold,C0027
2,2,2002,2,7/1/2007,Apartment,2,587.28,"$209,280.91",Sold,C0112
3,3,2031,2,12/1/2007,Apartment,31,1604.75,"$452,667.01",Sold,C0160
4,4,1049,1,11/1/2004,Apartment,49,1375.45,"$467,083.31",Sold,C0014


In [3]:
# Column dtypes as inferred by read_csv (no dtype overrides were passed).
raw_properties_df.dtypes

index            int64
property_id      int64
building         int64
date_sale       object
type            object
property#        int64
area           float64
price           object
status          object
customerid      object
dtype: object

In [4]:
# Number of missing values in each column of the raw properties table.
raw_properties_df.isna().sum()

index           0
property_id     0
building        0
date_sale       0
type            0
property#       0
area            0
price           0
status          0
customerid     72
dtype: int64

In [5]:
# Column labels of the raw properties table.
raw_properties_df.columns

Index(['index', 'property_id', 'building', 'date_sale', 'type', 'property#',
       'area', 'price', 'status', 'customerid'],
      dtype='object')

In [6]:
# Work on an independent (deep) copy so the raw frame stays untouched.
work_properties_df = raw_properties_df.copy(deep=True)

### Categorical to numerical

#### `status`

In [7]:
# Trim surrounding whitespace from the status labels before encoding them.
# The vectorised `.str.strip()` leaves missing values as NaN, whereas a
# per-element `x.strip()` lambda raises AttributeError on a NaN.
work_properties_df['status'] = work_properties_df['status'].str.strip()

In [8]:
# Inspect the stripped status labels before mapping them to numbers.
work_properties_df['status']

0      Sold
1      Sold
2      Sold
3      Sold
4      Sold
       ... 
262       -
263       -
264       -
265       -
266       -
Name: status, Length: 267, dtype: object

In [9]:
# Encode status as a binary flag: 'Sold' -> 1, '-' -> 0.
# Any label outside this table becomes NaN (Series.map semantics).
status_codes = {'Sold': 1, '-': 0}
work_properties_df['status'] = work_properties_df['status'].map(status_codes)

In [10]:
# Inspect the status column after encoding.
work_properties_df['status']

0      1
1      1
2      1
3      1
4      1
      ..
262    0
263    0
264    0
265    0
266    0
Name: status, Length: 267, dtype: int64

In [11]:
# Number of rows per encoded status value.
work_properties_df['status'].value_counts()

status
1    195
0     72
Name: count, dtype: int64

#### `type`

In [12]:
# Distinct labels present in `type`, used to build the mapping in the next cell.
set(work_properties_df['type'])

{'Apartment', 'Office'}

In [13]:
# Encode the property type as a binary flag: 'Apartment' -> 1, 'Office' -> 0.
type_codes = {'Apartment': 1, 'Office': 0}
work_properties_df['type'] = work_properties_df['type'].map(type_codes)

In [14]:
# Preview the first five rows after encoding `status` and `type`.
work_properties_df.head(5)

Unnamed: 0,index,property_id,building,date_sale,type,property#,area,price,status,customerid
0,0,1030,1,11/1/2005,1,30,743.09,"$246,172.68",1,C0028
1,1,1029,1,10/1/2005,1,29,756.21,"$246,331.90",1,C0027
2,2,2002,2,7/1/2007,1,2,587.28,"$209,280.91",1,C0112
3,3,2031,2,12/1/2007,1,31,1604.75,"$452,667.01",1,C0160
4,4,1049,1,11/1/2004,1,49,1375.45,"$467,083.31",1,C0014


In [15]:
# Check dtypes after encoding `status` and `type`.
work_properties_df.dtypes

index            int64
property_id      int64
building         int64
date_sale       object
type             int64
property#        int64
area           float64
price           object
status           int64
customerid      object
dtype: object

#### `price`

In [16]:
# Convert currency strings such as "$246,172.68" to floats.
# `regex=False` is required because '$' is a regex metacharacter. The `.str`
# methods pass missing values through as NaN, whereas a per-element
# `x.replace(...)` lambda raises AttributeError on a NaN.
work_properties_df['price'] = (
    work_properties_df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [17]:
# Inspect the price column after conversion to float.
work_properties_df['price']

0      246172.68
1      246331.90
2      209280.91
3      452667.01
4      467083.31
         ...    
262    322610.74
263    279191.26
264    287996.53
265    365868.78
266    199216.40
Name: price, Length: 267, dtype: float64

#### Final modifications

In [18]:
# Parse sale dates written as month/day/year (e.g. "11/1/2005" -> 2005-11-01).
# The format is stated explicitly so parsing does not depend on format inference;
# values that do not match it are coerced to NaT rather than raising.
work_properties_df['date_sale'] = pd.to_datetime(
    work_properties_df['date_sale'],
    format='%m/%d/%Y',
    errors='coerce',
)

In [19]:
def _strip_if_text(value):
    """Return `value` with surrounding whitespace removed when it is a str; otherwise unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value


# Clean the customer ids; non-string entries (the missing ids) pass through untouched.
work_properties_df['customerid'] = work_properties_df['customerid'].map(_strip_if_text)

In [20]:
# Remove the redundant `index` column carried over from the CSV.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, makes the
# cell safe to re-run: a second execution no longer raises KeyError.
work_properties_df = work_properties_df.drop(columns='index', errors='ignore')

#### Result

In [21]:
# Preview the first five rows of the cleaned properties table.
work_properties_df.head(5)

Unnamed: 0,property_id,building,date_sale,type,property#,area,price,status,customerid
0,1030,1,2005-11-01,1,30,743.09,246172.68,1,C0028
1,1029,1,2005-10-01,1,29,756.21,246331.9,1,C0027
2,2002,2,2007-07-01,1,2,587.28,209280.91,1,C0112
3,2031,2,2007-12-01,1,31,1604.75,452667.01,1,C0160
4,1049,1,2004-11-01,1,49,1375.45,467083.31,1,C0014


In [22]:
# Dtypes of the cleaned properties table.
work_properties_df.dtypes

property_id             int64
building                int64
date_sale      datetime64[ns]
type                    int64
property#               int64
area                  float64
price                 float64
status                  int64
customerid             object
dtype: object

In [23]:
# Snapshot the cleaned properties under their final name (independent deep copy).
preprocessed_properties_df = work_properties_df.copy(deep=True)

## Customers

We can repeat the process for the `customers` dataset.

In [24]:
# Read the raw customer records (comma is read_csv's default separator)
# and preview the first five rows.
raw_customers_df = pd.read_csv(path_customers)
raw_customers_df.head(5)

Unnamed: 0,index,customerid,entity,name,surname,birth_date,sex,country,state,purpose,deal_satisfaction,mortgage,source
0,0,C0110,Individual,Kareem,Liu,5/11/1968,F,USA,California,Home,4,Yes,Website
1,1,C0010,Individual,Trystan,Oconnor,11/26/1962,M,USA,California,Home,1,No,Website
2,2,C0132,Individual,Kale,Gay,4/7/1959,M,USA,California,Home,4,Yes,Agency
3,3,C0137,Individual,Russell,Gross,11/25/1959,M,USA,California,Home,5,No,Website
4,4,C0174,Company,Marleez,Co,,,USA,California,Investment,5,No,Website


In [25]:
# Column dtypes as inferred by read_csv (no dtype overrides were passed).
raw_customers_df.dtypes

index                 int64
customerid           object
entity               object
name                 object
surname              object
birth_date           object
sex                  object
country              object
state                object
purpose              object
deal_satisfaction     int64
mortgage             object
source               object
dtype: object

In [26]:
# Number of missing values in each column of the raw customers table.
raw_customers_df.isna().sum()

index                0
customerid           0
entity               0
name                 0
surname              0
birth_date           7
sex                  7
country              0
state                8
purpose              0
deal_satisfaction    0
mortgage             0
source               0
dtype: int64

In [27]:
# Column labels of the raw customers table.
raw_customers_df.columns

Index(['index', 'customerid', 'entity', 'name', 'surname', 'birth_date', 'sex',
       'country', 'state', 'purpose', 'deal_satisfaction', 'mortgage',
       'source'],
      dtype='object')

In [28]:
# Work on an independent (deep) copy so the raw frame stays untouched.
work_customers_df = raw_customers_df.copy(deep=True)

### Categorical to numerical

#### `entity`

In [29]:
# Distinct labels present in `entity`, used to build the mapping in the next cell.
set(work_customers_df['entity'])

{'Company', 'Individual'}

In [30]:
# Encode the customer kind as a binary flag: 'Company' -> 1, 'Individual' -> 0.
entity_codes = {'Company': 1, 'Individual': 0}
work_customers_df['entity'] = work_customers_df['entity'].map(entity_codes)

#### `sex`

In [31]:
# Distinct values present in `sex`; the column also contains missing entries.
set(work_customers_df['sex'])

{'F', 'M', nan}

In [32]:
# Encode sex as a binary flag: 'F' -> 1, 'M' -> 0.
# Missing entries have no key in the table and therefore stay NaN.
sex_codes = {'F': 1, 'M': 0}
work_customers_df['sex'] = work_customers_df['sex'].map(sex_codes)

#### `purpose`

In [33]:
# Distinct labels present in `purpose`, used to build the mapping in the next cell.
set(work_customers_df['purpose'])

{'Home', 'Investment'}

In [34]:
# Encode the purchase purpose as a binary flag: 'Investment' -> 1, 'Home' -> 0.
purpose_codes = {'Investment': 1, 'Home': 0}
work_customers_df['purpose'] = work_customers_df['purpose'].map(purpose_codes)

#### `mortgage`

In [35]:
# Distinct labels present in `mortgage`, used to build the mapping in the next cell.
set(work_customers_df['mortgage'])

{'No', 'Yes'}

In [36]:
# Encode the mortgage answer as a binary flag: 'Yes' -> 1, 'No' -> 0.
mortgage_codes = {'Yes': 1, 'No': 0}
work_customers_df['mortgage'] = work_customers_df['mortgage'].map(mortgage_codes)

#### Nominal variables

We aren't going to use `drop_first` because we want to retain all categories for later visualization.

In [37]:
# One-hot encode `source` into one indicator column per category
# (every category is kept; no `drop_first`).
dummies_customers_df = pd.get_dummies(work_customers_df, columns=['source'])

In [38]:
# Check dtypes after one-hot encoding `source`.
dummies_customers_df.dtypes

index                  int64
customerid            object
entity                 int64
name                  object
surname               object
birth_date            object
sex                  float64
country               object
state                 object
purpose                int64
deal_satisfaction      int64
mortgage               int64
source_Agency           bool
source_Client           bool
source_Website          bool
dtype: object

#### Final modifications

In [39]:
# Parse birth dates written as month/day/year (e.g. "11/26/1962" -> 1962-11-26).
# The format is stated explicitly so parsing does not depend on format inference;
# missing or non-matching values become NaT rather than raising.
dummies_customers_df['birth_date'] = pd.to_datetime(
    dummies_customers_df['birth_date'],
    format='%m/%d/%Y',
    errors='coerce',
)

In [40]:
# Remove the redundant `index` column carried over from the CSV.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, makes the
# cell safe to re-run: a second execution no longer raises KeyError.
dummies_customers_df = dummies_customers_df.drop(columns='index', errors='ignore')

#### Result

In [41]:
# Preview the first five rows of the cleaned customers table.
dummies_customers_df.head(5)

Unnamed: 0,customerid,entity,name,surname,birth_date,sex,country,state,purpose,deal_satisfaction,mortgage,source_Agency,source_Client,source_Website
0,C0110,0,Kareem,Liu,1968-05-11,1.0,USA,California,0,4,1,False,False,True
1,C0010,0,Trystan,Oconnor,1962-11-26,0.0,USA,California,0,1,0,False,False,True
2,C0132,0,Kale,Gay,1959-04-07,0.0,USA,California,0,4,1,True,False,False
3,C0137,0,Russell,Gross,1959-11-25,0.0,USA,California,0,5,0,False,False,True
4,C0174,1,Marleez,Co,NaT,,USA,California,1,5,0,False,False,True


In [42]:
# Dtypes of the cleaned customers table.
dummies_customers_df.dtypes

customerid                   object
entity                        int64
name                         object
surname                      object
birth_date           datetime64[ns]
sex                         float64
country                      object
state                        object
purpose                       int64
deal_satisfaction             int64
mortgage                      int64
source_Agency                  bool
source_Client                  bool
source_Website                 bool
dtype: object

In [43]:
# Snapshot the cleaned customers under their final name (independent deep copy).
preprocessed_customers_df = dummies_customers_df.copy(deep=True)

## Merge

Merge the datasets on the `customerid` column. The join must be a `LEFT` join from the *PROPERTIES* table to the *CUSTOMERS* table, so that every property is kept even when it has no linked customer.

In [44]:
# Count the property rows whose join key (`customerid`) is missing.
preprocessed_properties_df['customerid'].isna().sum() # Not every property is linked to a customer

np.int64(72)

In [45]:
# Count the customer rows whose join key (`customerid`) is missing.
preprocessed_customers_df['customerid'].isna().sum()

np.int64(0)

In [46]:
# Left join: keep every property row and attach the matching customer's columns
# where a customer exists; properties without a match get NaN in those columns.
new_df = pd.merge(
    preprocessed_properties_df,
    preprocessed_customers_df,
    on='customerid',
    how='left',
)

# Guard the join cardinality: a `customerid` repeated in the customers table would
# silently duplicate property rows, so the result must have one row per property.
assert len(new_df) == len(preprocessed_properties_df), (
    'left merge changed the number of property rows; check for duplicate customerid values'
)

In [47]:
# (rows, columns) of the merged table.
new_df.shape

(267, 22)

In [48]:
# Preview the first five rows of the merged table.
new_df.head(5)

Unnamed: 0,property_id,building,date_sale,type,property#,area,price,status,customerid,entity,...,birth_date,sex,country,state,purpose,deal_satisfaction,mortgage,source_Agency,source_Client,source_Website
0,1030,1,2005-11-01,1,30,743.09,246172.68,1,C0028,0.0,...,1986-06-21,1.0,USA,California,0.0,5.0,0.0,False,False,True
1,1029,1,2005-10-01,1,29,756.21,246331.9,1,C0027,0.0,...,1983-02-24,1.0,USA,California,0.0,5.0,0.0,False,False,True
2,2002,2,2007-07-01,1,2,587.28,209280.91,1,C0112,0.0,...,1985-12-27,0.0,USA,California,0.0,1.0,1.0,False,True,False
3,2031,2,2007-12-01,1,31,1604.75,452667.01,1,C0160,0.0,...,1985-12-27,0.0,USA,California,1.0,3.0,1.0,False,False,True
4,1049,1,2004-11-01,1,49,1375.45,467083.31,1,C0014,0.0,...,1979-05-15,1.0,USA,California,0.0,4.0,0.0,True,False,False


In [50]:
# Write the merged table to the output path, omitting the positional row index.
new_df.to_csv(path_or_buf=path_preproccesed, index=False)