# Part 1 Data Preprocessing

In [4]:
# import packages
import numpy as np
import pandas as pd



## Read in data and check duplicated rows

In [19]:
df = pd.read_csv('df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,MLS No,Status,Address,Unit,DOM,City,LP,SP,BT,...,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip
0,0,ML81786575,SLD,1988 Prince George Drive,,7,SAN JOSE,"$299,000","$365,000",CO,...,1,Y,1.0,1983.0,325.0,,4/24/2020,37.0,Alum Rock,95116
1,1,ML81790737,SLD,81 Castlebridge Drive,,35,SAN JOSE,"$299,888","$380,000",CO,...,1,Y,1.0,1983.0,330.0,,6/30/2020,37.0,Alum Rock,95116
2,2,40937495,SLD,1158 Lick Ave,,6,SAN JOSE,"$299,950","$570,500",DE,...,1,Y,2.0,1901.0,,,3/2/2021,120.0,Santa Clara County,95110
3,3,ML81750235,SLD,247 N Capitol Avenue,227.0,47,SAN JOSE,"$330,000","$330,000",CO,...,1,,0.0,1977.0,310.0,,7/31/2019,42.0,Alum Rock,95127
4,4,ML81819190,SLD,78 Castlebridge Drive,,3,SAN JOSE,"$333,000","$360,000",CO,...,1,Y,1.0,1983.0,336.0,,11/20/2020,37.0,Alum Rock,95116


In [7]:
print('Number of Duplicated rows:',sum(df.duplicated())) #check if there is duplicated line

Number of Duplicated rows: 0


## Drop the columns that we don't need

In [8]:
#drop features that we don't need
df = df.drop(columns = ['MLS No', 'Status', 'Address', 'Unit','City'])

## Change sell price and list price to numeric

In [9]:
#change the type of LP and SP to numeric
def remove(text):
    return text.replace('$','')
def removedot(text):
    return text.replace(',','')
df['LP'] = pd.to_numeric(df['LP'].apply(remove).apply(removedot))
df['SP'] = pd.to_numeric(df['SP'].apply(remove).apply(removedot))

## Change closing date to datetime and train test split by closing date

In [16]:
df['Closing Date'] = pd.to_datetime(df['Closing Date'])

## train test split by closing date

In [13]:
#output train test dataset

df.sort_values(by = 'Closing Date', ignore_index=True, inplace = True)
#df = df.drop(columns= 'Unnamed: 0')
train = df.iloc[0:7000]
test = df.iloc[7000:]
train.to_csv('train.csv')
test.to_csv('test.csv')

AttributeError: 'NoneType' object has no attribute 'iloc'

In [17]:
df.sort_values(by = 'Closing Date', ignore_index=True, inplace = True)

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,MLS No,Status,Address,Unit,DOM,City,LP,SP,BT,...,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip
0,4873,ML81731293,SLD,904 Monarch Circle,,17,SAN JOSE,"$769,000","$759,000",CO,...,2,Y,2.0,1999.0,341.0,,2019-01-02,19.0,Santa Teresa,95138
1,4616,ML81730692,SLD,1116 Waterton Lane,,0,SAN JOSE,"$750,000","$750,000",TH,...,2,Y,2.0,1979.0,350.0,,2019-01-02,39.0,Berryessa,95131
2,4610,ML81733246,SLD,7446 Tulare Hill Drive,,7,SAN JOSE,"$750,000","$785,000",TH,...,2,Y,2.0,1982.0,445.0,,2019-01-02,36.0,Santa Teresa,95139
3,3019,ML81731495,SLD,14165 Eton,,15,SAN JOSE,"$674,888","$723,000",DE,...,3,,0.0,1953.0,,,2019-01-02,65.0,Alum Rock,95127
4,2682,ML81727286,SLD,666 Teatree Court,,51,SAN JOSE,"$649,000","$636,000",CO,...,2,,0.0,1988.0,460.0,,2019-01-02,30.0,Campbell,95128


In [6]:
test

Unnamed: 0,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip
7000,12,889500,905000,CO,1464,2,2,Y,2.0,2013.0,210,M,2020-12-22,7.0,Berryessa,95132
7001,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148
7002,11,799998,860000,DE,1353,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Alum Rock,95127
7003,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463,M,2020-12-23,29.0,South San Jose,95122
7004,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,28,619000,620000,CO,850,2,2,Y,1.0,1992.0,349,M,2021-10-08,29.0,Santa Teresa,95138
9996,18,888000,912000,DE,1272,3,2,Y,2.0,1915.0,,,2021-10-08,106.0,Central San Jose,95116
9997,33,739950,735000,CO,1006,2,2,,0.0,1991.0,400,M,2021-10-08,30.0,Campbell,95128
9998,5,667053,675000,CO,1233,2,2,Y,1.0,1985.0,535,M,2021-10-08,36.0,Blossom Valley,95123


# Missing Data

In [15]:
#Train Data
#Garage - categorize null as a category
train['Gar'] = train['Gar'].fillna('N/A')
#Frequency - convert missing as a category
train['Freq'] = train['Freq'].fillna('N/A')
#Garage Space - drop rows with missing value
#Year Built - drop rows with missing value
train = train.dropna(subset=['GarSp','YrBlt'])
#Age - fill with the time difference in year between YrBlt and Closing Date
train['Age'] = train['Age'].fillna(pd.DatetimeIndex(train['Closing Date']).year - train['YrBlt'])
#summarize train data set
print(train.info())
#preview data set
train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6990 entries, 0 to 6999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DOM            6990 non-null   int64         
 1   LP             6990 non-null   int64         
 2   SP             6990 non-null   int64         
 3   BT             6990 non-null   object        
 4   SqFt           6990 non-null   int64         
 5   BR             6990 non-null   int64         
 6   Bth            6990 non-null   int64         
 7   Gar            6990 non-null   object        
 8   GarSp          6990 non-null   float64       
 9   YrBlt          6990 non-null   float64       
 10  HOA Fee        3876 non-null   object        
 11  Freq           6990 non-null   object        
 12  Closing Date   6990 non-null   datetime64[ns]
 13  Age            6990 non-null   float64       
 14  Area           6990 non-null   object        
 15  Zip            6990 n

Unnamed: 0,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date,week_in_month
0,17,769000,759000,CO,1257,2,2,Y,2.0,1999.0,341.0,,2019-01-02,19.0,Santa Teresa,95138,1,1
1,0,750000,750000,TH,1444,2,2,Y,2.0,1979.0,350.0,,2019-01-02,39.0,Berryessa,95131,1,1
2,7,750000,785000,TH,1655,3,2,Y,2.0,1982.0,445.0,,2019-01-02,36.0,Santa Teresa,95139,1,1
3,15,674888,723000,DE,1408,3,3,,0.0,1953.0,,,2019-01-02,65.0,Alum Rock,95127,1,1
4,51,649000,636000,CO,922,2,2,,0.0,1988.0,460.0,,2019-01-02,30.0,Campbell,95128,1,1


In [16]:
#Test Data
#Garage - categorize null as a category
test['Gar'] = test['Gar'].fillna('N/A')
#Frequency - convert missing as a category
test['Freq'] = test['Freq'].fillna('N/A')
#Garage Space - drop rows with missing value
#Year Built - drop rows with missing value
test = test.dropna(subset=['GarSp','YrBlt'])
#Age - fill na with the time difference in year between YrBlt and Closing Date
test['Age'] = test['Age'].fillna(pd.DatetimeIndex(test['Closing Date']).year - test['YrBlt'])
#summarize test data set
print(test.info())
#preview data set
test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2996 entries, 7000 to 9999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DOM            2996 non-null   int64         
 1   LP             2996 non-null   int64         
 2   SP             2996 non-null   int64         
 3   BT             2996 non-null   object        
 4   SqFt           2996 non-null   int64         
 5   BR             2996 non-null   int64         
 6   Bth            2996 non-null   int64         
 7   Gar            2996 non-null   object        
 8   GarSp          2996 non-null   float64       
 9   YrBlt          2996 non-null   float64       
 10  HOA Fee        2131 non-null   object        
 11  Freq           2996 non-null   object        
 12  Closing Date   2996 non-null   datetime64[ns]
 13  Age            2996 non-null   float64       
 14  Area           2996 non-null   object        
 15  Zip            299

Unnamed: 0,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date,week_in_month
7000,12,889500,905000,CO,1464,2,2,Y,2.0,2013.0,210.0,M,2020-12-22,7.0,Berryessa,95132,12,4
7001,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148,12,4
7002,11,799998,860000,DE,1353,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Alum Rock,95127,12,4
7003,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463.0,M,2020-12-23,29.0,South San Jose,95122,12,4
7004,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123,12,4


# Plot sell price with closing date

In [1]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

df['logSP'] = np.log(df['SP'])
fig = make_subplots(rows=1, cols=2,subplot_titles=("Selling price","Log Selling Price"))

fig.add_trace(
    go.Scatter(x=df['Closing Date'], y=df['SP'],mode = 'markers', name = 'Selling Price'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=df['Closing Date'], y=df['logSP'],mode = 'markers', name = 'Log Selling Price'),
    row=1, col=2
)

fig.update_layout(height=400, width=1000)
fig.show()

ModuleNotFoundError: No module named 'plotly'

# Extract month and week of month as feature

In [10]:
import warnings
warnings.filterwarnings('ignore')

train['month_of_date'] = train['Closing Date'].dt.month
test['month_of_date'] = test['Closing Date'].dt.month


In [11]:
test

Unnamed: 0,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date
7000,12,889500,905000,CO,1464,2,2,Y,2.0,2013.0,210,M,2020-12-22,7.0,Berryessa,95132,12
7001,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148,12
7002,11,799998,860000,DE,1353,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Alum Rock,95127,12
7003,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463,M,2020-12-23,29.0,South San Jose,95122,12
7004,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,28,619000,620000,CO,850,2,2,Y,1.0,1992.0,349,M,2021-10-08,29.0,Santa Teresa,95138,10
9996,18,888000,912000,DE,1272,3,2,Y,2.0,1915.0,,,2021-10-08,106.0,Central San Jose,95116,10
9997,33,739950,735000,CO,1006,2,2,,0.0,1991.0,400,M,2021-10-08,30.0,Campbell,95128,10
9998,5,667053,675000,CO,1233,2,2,Y,1.0,1985.0,535,M,2021-10-08,36.0,Blossom Valley,95123,10


In [12]:
import math

def add_week_of_month(df):
    df['week_in_month'] = pd.to_numeric(df['Closing Date'].dt.day/7)
    df['week_in_month'] = df['week_in_month'].apply(lambda x: math.ceil(x))
    return df

In [13]:
add_week_of_month(train)

Unnamed: 0,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date,week_in_month
0,17,769000,759000,CO,1257,2,2,Y,2.0,1999.0,341,,2019-01-02,19.0,Santa Teresa,95138,1,1
1,0,750000,750000,TH,1444,2,2,Y,2.0,1979.0,350,,2019-01-02,39.0,Berryessa,95131,1,1
2,7,750000,785000,TH,1655,3,2,Y,2.0,1982.0,445,,2019-01-02,36.0,Santa Teresa,95139,1,1
3,15,674888,723000,DE,1408,3,3,,0.0,1953.0,,,2019-01-02,65.0,Alum Rock,95127,1,1
4,51,649000,636000,CO,922,2,2,,0.0,1988.0,460,,2019-01-02,30.0,Campbell,95128,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,7,985000,1108000,DE,1260,3,2,Y,2.0,1955.0,,,2020-12-22,65.0,Santa Clara County,95124,12,4
6996,4,849000,822000,CO,1622,2,2,Y,2.0,1998.0,726,M,2020-12-22,22.0,Central San Jose,95112,12,4
6997,1,715000,766000,CO,1168,2,2,Y,2.0,1996.0,260,M,2020-12-22,24.0,South San Jose,95136,12,4
6998,9,887999,894000,DE,1568,4,2,Y,2.0,1998.0,50,M,2020-12-22,22.0,South San Jose,95111,12,4


In [14]:
add_week_of_month(test)

Unnamed: 0,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date,week_in_month
7000,12,889500,905000,CO,1464,2,2,Y,2.0,2013.0,210,M,2020-12-22,7.0,Berryessa,95132,12,4
7001,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148,12,4
7002,11,799998,860000,DE,1353,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Alum Rock,95127,12,4
7003,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463,M,2020-12-23,29.0,South San Jose,95122,12,4
7004,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,28,619000,620000,CO,850,2,2,Y,1.0,1992.0,349,M,2021-10-08,29.0,Santa Teresa,95138,10,2
9996,18,888000,912000,DE,1272,3,2,Y,2.0,1915.0,,,2021-10-08,106.0,Central San Jose,95116,10,2
9997,33,739950,735000,CO,1006,2,2,,0.0,1991.0,400,M,2021-10-08,30.0,Campbell,95128,10,2
9998,5,667053,675000,CO,1233,2,2,Y,1.0,1985.0,535,M,2021-10-08,36.0,Blossom Valley,95123,10,2


In [None]:
# New Section