# Read in the data and check for duplicated rows

In [None]:
import numpy as np
import pandas as pd

# Load the listings table from the CSV export.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs as-is on the original author's machine.
df = pd.read_csv('/Users/yuhancheng/Downloads/df.csv')
# duplicated() flags each row that repeats an earlier one, so the number of
# True flags is the number of duplicated rows.
print('Number of Duplicated rows:', df.duplicated().sum())

# Drop the columns that we don't need

In [4]:
# Remove the columns that the analysis does not need.
df = df.drop(['MLS No', 'Status', 'Address', 'City'], axis='columns')

# Convert selling price and list price to numeric

In [6]:
#change the type of LP and SP to numeric
def remove(text):
    """Strip every dollar sign from a price string.

    Parameters
    ----------
    text : object
        A single cell value. Strings have all '$' characters removed; any
        other value is returned untouched.

    Returns
    -------
    object
        The string without '$', or the original value when it is not a string.
    """
    # A non-string cell (NaN for a missing price, or a number when this cell is
    # re-run on an already converted column) has no '$' to strip and no
    # .replace() method, so hand it back unchanged instead of raising.
    if not isinstance(text, str):
        return text
    return text.replace('$', '')
def removedot(text):
    """Strip every comma from a price string.

    Despite the name, the character removed is ',' (the thousands separator),
    not '.'.

    Parameters
    ----------
    text : object
        A single cell value. Strings have all ',' characters removed; any
        other value is returned untouched.

    Returns
    -------
    object
        The string without ',', or the original value when it is not a string.
    """
    # A non-string cell (NaN for a missing price, or a number when this cell is
    # re-run on an already converted column) has no .replace() method, so hand
    # it back unchanged instead of raising.
    if not isinstance(text, str):
        return text
    return text.replace(',', '')
# Strip the currency formatting from both price columns (list price first,
# then selling price) and parse the cleaned strings as numbers.
for price_col in ('LP', 'SP'):
    stripped = df[price_col].apply(remove).apply(removedot)
    df[price_col] = pd.to_numeric(stripped)

# Convert closing date to datetime and split into train/test sets by closing date

In [9]:
#output train test dataset
# Parse the closing dates so the rows can be ordered chronologically; the
# sort also renumbers the index 0..n-1.
df['Closing Date'] = pd.to_datetime(df['Closing Date'])
df = df.sort_values(by = 'Closing Date', ignore_index=True)
#df = df.drop(columns= 'Unnamed: 0')
# Chronological split: the first 7000 rows (earliest closings) are the training
# set and the remaining rows are the test set. .copy() makes each split an
# independent frame, so the feature columns added to train/test further down
# are written to their own data rather than to a slice of df (which is what
# raises SettingWithCopyWarning).
train = df.iloc[0:7000].copy()
test = df.iloc[7000:].copy()
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if readers of these files do not expect it.
train.to_csv('train.csv')
test.to_csv('test.csv')

In [10]:
# Preview the training split; the notebook renders the bare expression as a table.
train

Unnamed: 0,Unit,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip
0,,17,769000,759000,CO,1257,2,2,Y,2.0,1999.0,341,,2019-01-02,19.0,Santa Teresa,95138
1,,51,649000,636000,CO,922,2,2,,0.0,1988.0,460,,2019-01-02,30.0,Campbell,95128
2,,7,750000,785000,TH,1655,3,2,Y,2.0,1982.0,445,,2019-01-02,36.0,Santa Teresa,95139
3,,15,674888,723000,DE,1408,3,3,,0.0,1953.0,,,2019-01-02,65.0,Alum Rock,95127
4,,7,549000,600000,TH,1234,2,1,Y,2.0,1981.0,284,,2019-01-02,37.0,Evergreen,95121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,416,4,849000,822000,CO,1622,2,2,Y,2.0,1998.0,726,M,2020-12-22,22.0,Central San Jose,95112
6996,,1,715000,766000,CO,1168,2,2,Y,2.0,1996.0,260,M,2020-12-22,24.0,South San Jose,95136
6997,,9,887999,894000,DE,1568,4,2,Y,2.0,1998.0,50,M,2020-12-22,22.0,South San Jose,95111
6998,8,6,889500,905000,CO,1464,2,2,,0.0,2013.0,210,M,2020-12-22,8.0,Berryessa,95132


In [11]:
# Preview the test split; the notebook renders the bare expression as a table.
test

Unnamed: 0,Unit,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip
7000,,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148
7001,2,14,514000,505000,CO,903,2,1,Y,1.0,1970.0,379,M,2020-12-22,50.0,Cambrian,95118
7002,,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463,M,2020-12-23,29.0,South San Jose,95122
7003,,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123
7004,,14,900000,915000,DE,1439,3,2,,0.0,1920.0,,,2020-12-23,100.0,Central San Jose,95112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,56,460000,450000,CO,1240,2,2,,0.0,1968.0,822,M,2021-10-08,53.0,Evergreen,95135
9996,,7,549000,615000,CO,1008,2,1,Y,2.0,1987.0,320,M,2021-10-08,34.0,South San Jose,95111
9997,,0,650000,685000,CO,1452,3,2,Y,2.0,1986.0,380,,2021-10-08,35.0,Blossom Valley,95136
9998,,5,667053,675000,CO,1233,2,2,Y,1.0,1985.0,535,M,2021-10-08,36.0,Blossom Valley,95123


# Plot selling price against closing date

In [15]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Natural log of the selling price, stored on df as 'logSP' for the second panel.
df['logSP'] = np.log(df['SP'])

# One row with two side-by-side panels, titled in left-to-right order.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Selling price", "Log Selling Price"),
)

# Each entry is (df column on the y-axis, legend name); the position in the
# list decides which panel the scatter goes to (1 = left, 2 = right).
panel_specs = [('SP', 'Selling Price'), ('logSP', 'Log Selling Price')]
for panel_col, (y_column, legend_name) in enumerate(panel_specs, start=1):
    closing_scatter = go.Scatter(
        x=df['Closing Date'],
        y=df[y_column],
        mode='markers',
        name=legend_name,
    )
    fig.add_trace(closing_scatter, row=1, col=panel_col)

fig.update_layout(height=400, width=1000)
fig.show()

# Extract month and week of month as features

In [22]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only those raised by the column assignments below.
warnings.filterwarnings('ignore')

# Add the calendar month of the closing date to both splits (train first).
for split_frame in (train, test):
    split_frame['month_of_date'] = split_frame['Closing Date'].dt.month


In [23]:
# Preview the test split again to check the new month_of_date column.
test

Unnamed: 0,Unit,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date
7000,,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148,12
7001,2,14,514000,505000,CO,903,2,1,Y,1.0,1970.0,379,M,2020-12-22,50.0,Cambrian,95118,12
7002,,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463,M,2020-12-23,29.0,South San Jose,95122,12
7003,,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123,12
7004,,14,900000,915000,DE,1439,3,2,,0.0,1920.0,,,2020-12-23,100.0,Central San Jose,95112,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,56,460000,450000,CO,1240,2,2,,0.0,1968.0,822,M,2021-10-08,53.0,Evergreen,95135,10
9996,,7,549000,615000,CO,1008,2,1,Y,2.0,1987.0,320,M,2021-10-08,34.0,South San Jose,95111,10
9997,,0,650000,685000,CO,1452,3,2,Y,2.0,1986.0,380,,2021-10-08,35.0,Blossom Valley,95136,10
9998,,5,667053,675000,CO,1233,2,2,Y,1.0,1985.0,535,M,2021-10-08,36.0,Blossom Valley,95123,10


In [28]:
import math

def add_week_of_month(df):
    """Add a 'week_in_month' column derived from 'Closing Date'.

    The week is the day of the month counted in blocks of seven: days 1-7 are
    week 1, days 8-14 are week 2, and so on up to week 5 for days 29-31.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'Closing Date' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, now carrying the
        'week_in_month' column. Rows whose closing date is missing get a
        missing week instead of raising.
    """
    day_of_month = df['Closing Date'].dt.day
    # (day - 1) // 7 + 1 equals ceil(day / 7) for every day >= 1. It works on
    # the whole column at once and lets a missing date propagate as NaN,
    # where math.ceil would raise on it.
    df['week_in_month'] = (day_of_month - 1) // 7 + 1
    return df

In [29]:
# Add week_in_month to train; the function returns the frame it was given, which the notebook displays.
add_week_of_month(train)

Unnamed: 0,Unit,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date,week_in_month
0,,17,769000,759000,CO,1257,2,2,Y,2.0,1999.0,341,,2019-01-02,19.0,Santa Teresa,95138,1,1
1,,51,649000,636000,CO,922,2,2,,0.0,1988.0,460,,2019-01-02,30.0,Campbell,95128,1,1
2,,7,750000,785000,TH,1655,3,2,Y,2.0,1982.0,445,,2019-01-02,36.0,Santa Teresa,95139,1,1
3,,15,674888,723000,DE,1408,3,3,,0.0,1953.0,,,2019-01-02,65.0,Alum Rock,95127,1,1
4,,7,549000,600000,TH,1234,2,1,Y,2.0,1981.0,284,,2019-01-02,37.0,Evergreen,95121,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,416,4,849000,822000,CO,1622,2,2,Y,2.0,1998.0,726,M,2020-12-22,22.0,Central San Jose,95112,12,4
6996,,1,715000,766000,CO,1168,2,2,Y,2.0,1996.0,260,M,2020-12-22,24.0,South San Jose,95136,12,4
6997,,9,887999,894000,DE,1568,4,2,Y,2.0,1998.0,50,M,2020-12-22,22.0,South San Jose,95111,12,4
6998,8,6,889500,905000,CO,1464,2,2,,0.0,2013.0,210,M,2020-12-22,8.0,Berryessa,95132,12,4


In [30]:
# Add week_in_month to test; the function returns the frame it was given, which the notebook displays.
add_week_of_month(test)

Unnamed: 0,Unit,DOM,LP,SP,BT,SqFt,BR,Bth,Gar,GarSp,YrBlt,HOA Fee,Freq,Closing Date,Age,Area,Zip,month_of_date,week_in_month
7000,,6,799999,935000,DE,1398,4,2,Y,2.0,1971.0,,,2020-12-22,49.0,Evergreen,95148,12,4
7001,2,14,514000,505000,CO,903,2,1,Y,1.0,1970.0,379,M,2020-12-22,50.0,Cambrian,95118,12,4
7002,,25,549000,550000,CO,1094,2,2,,0.0,1991.0,463,M,2020-12-23,29.0,South San Jose,95122,12,4
7003,,10,949000,985000,DE,1065,3,2,Y,2.0,1971.0,,,2020-12-23,49.0,Blossom Valley,95123,12,4
7004,,14,900000,915000,DE,1439,3,2,,0.0,1920.0,,,2020-12-23,100.0,Central San Jose,95112,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,56,460000,450000,CO,1240,2,2,,0.0,1968.0,822,M,2021-10-08,53.0,Evergreen,95135,10,2
9996,,7,549000,615000,CO,1008,2,1,Y,2.0,1987.0,320,M,2021-10-08,34.0,South San Jose,95111,10,2
9997,,0,650000,685000,CO,1452,3,2,Y,2.0,1986.0,380,,2021-10-08,35.0,Blossom Valley,95136,10,2
9998,,5,667053,675000,CO,1233,2,2,Y,1.0,1985.0,535,M,2021-10-08,36.0,Blossom Valley,95123,10,2
