Matej Buršík

In [1]:
# Import
import pyarrow
import pandas as pd

In [None]:
# Read the raw price-paid records and print a short summary of the columns
raw_data_path = 'data/price_paid_records.parquet'
df = pd.read_parquet(raw_data_path, engine='pyarrow')
df.info()

# Cleaning
Contains all the data cleaning steps performed in the individual EDA notebooks

### initial_cleaning.ipynb

In [None]:
# Build the new column names from the old ones: lower-case them, turn spaces
# and slashes into underscores, then apply the two explicit renames below
explicit_renames = {'town_city': 'city', 'old_new': 'is_new'}
snake_cased = (
    col.lower().replace(' ', '_').replace('/', '_') for col in df.columns
)
new_columns = [explicit_renames.get(name, name) for name in snake_cased]
df.columns = new_columns

In [None]:
# Dropping unnecessary columns in a single call; the result is reassigned
# instead of mutating the frame with inplace=True
df = df.drop(columns=['transaction_unique_identifier', 'record_status_-_monthly_file_only'])

In [None]:
# Convert column dtypes: parse date_of_transfer as datetimes and map the
# 'Y'/'N' values of is_new to True/False (any other value becomes NaN)
yes_no_to_bool = {'Y': True, 'N': False}
df['date_of_transfer'] = pd.to_datetime(df['date_of_transfer'])
df['is_new'] = df['is_new'].map(yes_no_to_bool)

### outliers.ipynb

In [None]:
# Keep every row whose duration is not 'U' (the 'Unknown' duration type)
df = df[df['duration'] != 'U']

### price.ipynb

In [None]:
# Drop unrealistically low prices: only rows priced at 85 or more are kept
min_realistic_price = 85
df = df[df['price'].ge(min_realistic_price)]

In [None]:
# Removing outliers based on correlation to property types: each property
# type has its own maximum price, and rows whose type is not listed are dropped
price_cap_by_type = {
    'T': 350000,
    'S': 375000,
    'D': 550000,
    'F': 400000,
    'O': 800000,
}
within_cap = pd.Series(False, index=df.index)
for type_code, price_cap in price_cap_by_type.items():
    # A row is kept as soon as it matches one (type, cap) pair
    within_cap |= (df['property_type'] == type_code) & (df['price'] <= price_cap)
df = df[within_cap]

In [None]:
# Removing outliers based on correlation to new/old property: rows with
# is_new == True are kept only up to a price of 500000, rows with
# is_new == False are kept whatever their price
flagged_new = df['is_new'] == True
flagged_old = df['is_new'] == False
new_within_cap = flagged_new & (df['price'] <= 500000)
df = df[new_within_cap | flagged_old]

# Saving data into files
Creating datasets for specialized use cases or experimentation
- final_all_data - contains all the cleaned data, no matter the date
- after_2008_crisis - contains all the cleaned data from 1st of January 2009 onwards
- after_2016_policy - contains all the cleaned data from 1st of April 2016 onwards
- london_terraced_forecasting - contains cleaned data about a specific location and property type, no matter the date, for price forecasting

In [None]:
# Renumber the rows from zero (the filters above leave gaps in the index)
# and write the fully cleaned dataset to disk
final_data_path = 'data/final_all_data.parquet'
df = df.reset_index(drop=True)
df.to_parquet(final_data_path, engine='pyarrow')

### date_of_transfer.ipynb

In [None]:
# Dataset after the 2008 crisis: transfers dated on or after 1 January 2009
cutoff_date = pd.to_datetime('2009-01-01')
df_2008 = df[df['date_of_transfer'] >= cutoff_date]
df_2008 = df_2008.reset_index(drop=True)

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
df_2008.info()

# Save data in parquet format
df_2008.to_parquet('data/after_2008_crisis.parquet', engine='pyarrow')

In [None]:
# Dataset after the April 2016 policy: transfers dated on or after 1 April 2016
cutoff_date = pd.to_datetime('2016-04-01')
df_2016 = df[df['date_of_transfer'] >= cutoff_date]
df_2016 = df_2016.reset_index(drop=True)

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
df_2016.info()

# Save data in parquet format
df_2016.to_parquet('data/after_2016_policy.parquet', engine='pyarrow')

### forecasting_data.ipynb

In [None]:
# Keep only the rows for city London with property type Terraced (T)
in_london = df['city'] == 'LONDON'
is_terraced = df['property_type'] == 'T'
london_terraced_df = df[in_london & is_terraced]

In [None]:
# Getting only the necessary columns: the transfer date and the price
forecasting_columns = ['date_of_transfer', 'price']
london_terraced_df = london_terraced_df.loc[:, forecasting_columns]

In [None]:
# Renumber the rows from zero, print a summary of the frame and write the
# forecasting dataset to disk
forecasting_path = 'data/london_terraced_forecasting.parquet'
london_terraced_df = london_terraced_df.reset_index(drop=True)
london_terraced_df.info()
london_terraced_df.to_parquet(forecasting_path, engine='pyarrow')