<a href="https://colab.research.google.com/github/JRKagumba/Capstone_2/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0 Import Libraries and Data

In [None]:
# import libraries 
import random
import time

import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import seaborn as sns

import torch
import torch.nn as nn

from sklearn.preprocessing import MinMaxScaler

import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# This relates to plotting datetime values with matplotlib:
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
# Mount Google Drive so the listings CSV stored there is readable from Colab.
from google.colab import drive
drive.mount('/content/gdrive')

# Read the raw listings, then keep only the first row seen for every listing id.
raw_listings = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/RealEstate/data_with_locations_and_ids.csv")
print(f"Length before droping duplicate ids = {len(raw_listings)}")
realestate_df = raw_listings.drop_duplicates(subset=['id'], keep='first')
print(f"Length after droping duplicate ids = {len(realestate_df)}")

MessageError: ignored

In [None]:
# Reproducible 95% sample (without replacement) of the de-duplicated listings.
sample_df = realestate_df.sample(frac=0.95, replace=False, random_state=99)

# Prices become numeric (downcast to a smaller float dtype where possible) and
# posting times become datetime64[ns] values.
price_numeric = pd.to_numeric(sample_df["Price"], downcast="float")
posted_at = sample_df['Time_Posted'].values.astype('datetime64[ns]')
sample_df["Price"] = price_numeric
sample_df['Time_Posted'] = posted_at

sample_df.head(3)

In [None]:
print(f"Length before removing outliers = {len(sample_df)}")


def _strip_price_tails(listings_of_one_type):
    """Keep rows whose Price lies strictly between that group's 5% and 95% quantiles."""
    upper_cut = listings_of_one_type["Price"].quantile(0.95)
    lower_cut = listings_of_one_type["Price"].quantile(0.05)
    inside = (listings_of_one_type["Price"] > lower_cut) & (listings_of_one_type["Price"] < upper_cut)
    return listings_of_one_type[inside]


# Trim each listing type separately (quantiles are computed per type), then
# stack the trimmed groups back together under a fresh 0..n-1 index.
trimmed_groups = [
    _strip_price_tails(sample_df[sample_df.Listing_Type == listing_kind])
    for listing_kind in sample_df['Listing_Type'].unique()
]
sample_df = pd.concat(trimmed_groups, axis=0, ignore_index=True)

print(f"Length after removing outliers = {len(sample_df)}")

In [None]:
# Split the cleaned sample into one DataFrame per listing type so that each
# type can be explored on its own.
listing_types = sample_df['Listing_Type'].unique()

# A dict comprehension keeps its loop variable local. A module-level
# `for type in ...` loop would rebind the builtin `type` for every cell that
# runs afterwards in the same kernel.
listing_type_dict = {
    listing_type: sample_df[sample_df['Listing_Type'] == listing_type]
    for listing_type in listing_types
}

print(listing_type_dict.keys())

# 1 EDA

In [None]:
# One subplot row per listing type, each titled with the listing type's name.
fig = make_subplots(rows=len(listing_types), subplot_titles=list(listing_types))

# (statistic, line colour) for the two traces drawn in every subplot.
trace_styles = (('median', '#abd7eb'), ('mean', '#F47174'))

for row_number, listing in enumerate(listing_types, start=1):

    # Weekly median and mean price for this listing type; rows with NaN are dropped.
    weekly = (
        listing_type_dict[listing][['Time_Posted', 'Price']]
        .groupby(pd.Grouper(key="Time_Posted", freq="1W"))
        .agg({'Price': ['median', 'mean']})
        .dropna()
    )
    # Flatten the two-level column labels into Price_median / Price_mean.
    weekly.columns = weekly.columns.map('_'.join)
    weekly = weekly.reset_index()

    traces = [
        go.Scatter(x=weekly['Time_Posted'], y=weekly[f'Price_{statistic}'],
                   mode='lines',
                   name=statistic,
                   line=dict(color=colour))
        for statistic, colour in trace_styles
    ]

    fig.add_traces(traces, rows=row_number, cols=1)

fig.update_layout(title_text="Listing Price and Time", height=2000)
fig.show()

In [None]:
# Show the distinct listing types and the 20 most frequent sub-regions.
distinct_listing_types = sample_df['Listing_Type'].unique()
most_frequent_sub_regions = sample_df['Sub_Region'].value_counts().head(20).index

print('Listing Types', distinct_listing_types, '', 'Sub Regions', most_frequent_sub_regions, sep='\n')

In [None]:
# Row mask selecting 'Apartments-Condos' listings whose sub-region is 'Toronto'.
is_apartment_condo = sample_df['Listing_Type'] == 'Apartments-Condos'
is_toronto = sample_df['Sub_Region'] == 'Toronto'
_specify = is_apartment_condo & is_toronto

In [None]:
# Weekly median and mean price for the selected rows; rows with NaN are dropped.
df = (
    sample_df.loc[_specify, ['Time_Posted', 'Price']]
    .groupby(pd.Grouper(key="Time_Posted", freq="1W"))
    .agg({'Price': ['median', 'mean']})
    .dropna()
)

# Flatten the two-level column labels into Price_median / Price_mean.
df.columns = df.columns.map('_'.join)

# Single-column views of each statistic.
median_df = df[['Price_median']]
mean_df = df[['Price_mean']]

df.head()

In [None]:
# Line chart of the weekly median and mean price.
df.plot.line(figsize=(24, 4))

# 2 Setting Up for Modeling

In [None]:
# The modelled series is the weekly median price; the final `test_size`
# observations are held out and everything before them is used for training.
test_size = 4
y = df['Price_median'].values
train_set, test_set = y[:-test_size], y[-test_size:]

In [None]:
# Scale prices into [-1, 1]. The scaler is fitted on the training split only,
# so the held-out observations do not influence the scaling.
# (MinMaxScaler is imported with the other libraries in the first code cell.)
scaler = MinMaxScaler(feature_range=(-1,1))

# scikit-learn expects a 2-D (n_samples, n_features) array, hence the reshape.
train_column = train_set.reshape(-1,1)
scaler.fit(train_column)
train_norm = scaler.transform(train_column)
train_norm

In [None]:
# Turn the scaled training column into a flat 1-D float tensor.
train_norm = torch.FloatTensor(train_norm).reshape(-1)
train_norm

In [None]:
# Number of consecutive observations that make up one input window.
window_size=4

# Build (window, label) pairs by sliding a fixed-size window over a sequence
def input_data(_sequence,_windowsize):
    """Pair every window of `_windowsize` consecutive items with the item after it.

    Args:
        _sequence: Any sliceable sequence that supports ``len``.
        _windowsize: Number of items in each window.

    Returns:
        A list of ``(window, label)`` tuples. ``window`` is the slice of
        `_windowsize` items starting at each position and ``label`` is the
        length-1 slice that immediately follows it. The list is empty when the
        sequence is not longer than the window.
    """
    window_count = len(_sequence) - _windowsize
    return [
        (_sequence[start:start + _windowsize],
         _sequence[start + _windowsize:start + _windowsize + 1])
        for start in range(window_count)
    ]

# Build the (window, label) training pairs from the scaled training tensor.
train_data = input_data(_sequence=train_norm, _windowsize=window_size)

# Expected count: len(original data) - len(test size) - len(window size)
len(train_data)

# 3 Define Model

In [None]:
class LSTMnetwork(nn.Module):
    """LSTM layer followed by a linear layer that returns one prediction per sequence.

    Attributes:
        hidden_size: Size of the LSTM hidden state.
        lstm: The ``nn.LSTM`` layer.
        linear: The ``nn.Linear`` layer applied to the LSTM outputs.
        hidden: ``(h, c)`` tuple passed to the LSTM on the next forward call.
            ``forward`` overwrites it with the state the LSTM returns, so the
            state carries over between calls unless the caller replaces it.
    """

    def __init__(self,input_size=1,hidden_size=100,output_size=1):
        """Create the layers and a zero-filled initial (h0, c0) state."""
        super().__init__()
        self.hidden_size = hidden_size

        # Recurrent layer, then the fully-connected layer that maps every
        # hidden vector to `output_size` values.
        self.lstm = nn.LSTM(input_size,hidden_size)
        self.linear = nn.Linear(hidden_size,output_size)

        # Initial h0 and c0: zeros of shape (1, 1, hidden_size).
        h0 = torch.zeros(1,1,self.hidden_size)
        c0 = torch.zeros(1,1,self.hidden_size)
        self.hidden = (h0, c0)

    def forward(self,seq):
        """Run `seq` through the LSTM and return the prediction for its last step.

        `seq` is viewed as (len(seq), 1, -1) before it is fed to the LSTM.
        """
        steps = len(seq)
        lstm_in = seq.view(steps,1,-1)
        lstm_out, self.hidden = self.lstm(lstm_in, self.hidden)
        per_step_pred = self.linear(lstm_out.view(steps,-1))
        # Only the prediction made after the final step is returned.
        return per_step_pred[-1]

In [None]:
# Seed PyTorch before building the model so its initial weights are reproducible.
torch.manual_seed(101)
model = LSTMnetwork()

# Adam optimiser over all model parameters, scored with mean squared error.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

model

In [None]:
epochs = 20000

# `time` is imported with the other libraries in the first code cell.
start_time = time.time()

for epoch in range(epochs):

    # One optimisation step per (window, label) pair of the training data.
    for seq, y_train in train_data:

        # Clear the accumulated gradients and start every window from a
        # zero hidden/cell state.
        optimizer.zero_grad()
        model.hidden = (torch.zeros(1,1,model.hidden_size),
                        torch.zeros(1,1,model.hidden_size))

        y_pred = model(seq)

        loss = criterion(y_pred, y_train)
        loss.backward()
        optimizer.step()

    # Every 1000 epochs, report the loss of the last window seen in that epoch.
    if epoch%1000 == 1:
        print(f'Epoch: {epoch+1:2} Loss: {loss.item():10.8f}')

print(f'\nDuration: {time.time() - start_time:.0f} seconds')

# 4 Compare with Test Set

In [None]:
# Number of steps to forecast.
future = 12

# Seed the rolling forecast with the last window of scaled training values.
preds = train_norm[-window_size:].tolist()

# Evaluation mode; gradients are not needed while forecasting.
model.eval()

with torch.no_grad():
    for i in range(future):
        # Each step is predicted from the most recent `window_size` values
        # (earlier predictions included), starting from a zero hidden/cell state.
        seq = torch.FloatTensor(preds[-window_size:])
        model.hidden = (torch.zeros(1,1,model.hidden_size),
                        torch.zeros(1,1,model.hidden_size))
        next_value = model(seq).item()
        preds.append(next_value)

# The forecast steps only, without the seed window.
preds[window_size:]


In [None]:
# Map the forecast steps (seed window excluded) back to the original price scale.
forecast_scaled = np.array(preds[window_size:]).reshape(-1, 1)
true_predictions = scaler.inverse_transform(forecast_scaled)
true_predictions

In [None]:
# The last 12 observed weekly median prices.
df['Price_median'].iloc[-12:]

In [None]:
# Spacing between the two most recent rows of the weekly index.
latest_stamp = df.index[-1]
previous_stamp = df.index[-2]
time_change = latest_stamp - previous_stamp
time_change

In [None]:
# First and last timestamps among the last 12 observed weeks (kept for reference).
min_range=df['Price_mean'][-12:].index.min()
max_range=df['Price_mean'][-12:].index.max()

# The forecast was seeded with the final training window, so its first value
# belongs to the first held-out observation, df.index[-test_size], and every
# later value lies one `time_change` further on. Building the axis from the
# last 12 observed weeks instead draws the forecast (12 - test_size) weeks too
# early. pd.date_range also keeps the dates on the same weekday as df.index,
# whereas numpy's datetime64[W] unit snaps every value to a Thursday.
x = pd.date_range(start=df.index[-test_size], periods=future, freq=time_change)
x

In [None]:
# Observed weekly median price with the forecast drawn over it.
test_fig, test_ax = plt.subplots(figsize=(24, 4))
test_ax.set_title('Listing Price')
test_ax.set_ylabel('Price')
test_ax.grid(True)
test_ax.autoscale(axis='x', tight=True)
test_ax.plot(df['Price_median'])
test_ax.plot(x, true_predictions)
plt.show()

In [None]:
# Number of dates on the forecast axis (should match the number of predictions).
np.shape(x)

# 5 Forecast into unknown future

In [None]:
epochs = 20000

# set model back to training mode
model.train()

# Feature-scale the entire dataset. This refits `scaler` on the full series,
# so later inverse transforms use the full-series scaling.
y_norm = scaler.fit_transform(y.reshape(-1, 1))
y_norm = torch.FloatTensor(y_norm).view(-1)
all_data = input_data(y_norm,window_size)

# `time` is imported with the other libraries in the first code cell.
start_time = time.time()

for epoch in range(epochs):

    # Train on the full set of (window, label) pairs.
    for seq, y_train in all_data:

        # Clear the accumulated gradients and start every window from a
        # zero hidden/cell state.
        optimizer.zero_grad()
        model.hidden = (torch.zeros(1,1,model.hidden_size),
                        torch.zeros(1,1,model.hidden_size))

        y_pred = model(seq)

        loss = criterion(y_pred, y_train)
        loss.backward()
        optimizer.step()

    # Every 500 epochs, report the loss of the last window seen in that epoch.
    if epoch%500 == 1:
        print(f'Epoch: {epoch+1:2} Loss: {loss.item():10.8f}')

print(f'\nDuration: {time.time() - start_time:.0f} seconds')

In [None]:
# NOTE(review): window_size is rebound to 12 here, while all_data (used to train
# the model) was built with the window_size in effect before this cell. Confirm
# that a 12-step seed window is intended. Re-running earlier cells after this
# one will also pick up the value 12.
window_size = 12
future = 12
L = len(y)

# Seed the rolling forecast with the last `window_size` scaled observations.
preds = y_norm[-window_size:].tolist()

model.eval()
with torch.no_grad():
    for i in range(future):
        # Predict from the most recent `window_size` values (earlier
        # predictions included), resetting the hidden/cell state every step.
        seq = torch.FloatTensor(preds[-window_size:])
        model.hidden = (torch.zeros(1,1,model.hidden_size),
                        torch.zeros(1,1,model.hidden_size))
        next_value = model(seq).item()
        preds.append(next_value)

# Inverse-normalize the whole list (seed window followed by the forecast steps).
preds_column = np.array(preds).reshape(-1, 1)
true_predictions = scaler.inverse_transform(preds_column)

In [None]:
# Rows = seed window + forecast steps, one column.
np.shape(true_predictions)

In [None]:
# First and last timestamps among the final `window_size` rows of the weekly frame.
recent_index = df['Price_mean'][-window_size:].index
min_range, max_range = recent_index.min(), recent_index.max()

# Spacing between the two most recent rows of the weekly index.
time_change = df.index[-1] - df.index[-2]
time_change

In [None]:
# Bounds of the forecast axis: it starts one step after the last observation,
# and the stop lies (window_size + 1) steps after it.
steps_to_stop = window_size + 1
future_min_range = max_range + time_change
future_max_range = max_range + (time_change * steps_to_stop)

In [None]:
# One timestamp per forecast step, beginning at future_min_range and spaced by
# `time_change`. The length follows `future`, the number of forecast steps.
# pd.date_range keeps the dates on the same weekday as df.index; going through
# numpy's datetime64[W] unit would snap every value to a Thursday and shift the
# forecast off the observed weekly grid.
x = pd.date_range(start=future_min_range, periods=future, freq=time_change)
x.shape

In [None]:
# PLOT THE RESULT
# Observed weekly median price, followed by the forecast on the future dates
# in `x`. The first `window_size` rows of true_predictions are the seed window
# and are left out of the forecast line.
forecast_fig, forecast_ax = plt.subplots(figsize=(24, 4))
forecast_ax.set_title('Listing Price')
forecast_ax.set_ylabel('Price')
forecast_ax.grid(True)
forecast_ax.autoscale(axis='x', tight=True)
forecast_ax.plot(df['Price_median'])
forecast_ax.plot(x, true_predictions[window_size:])
plt.show()