## Generating features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import geopandas as gpd

In [None]:
# Read the combined gazetteer CSV from the interim data folder.
gazetteer_csv = "../data/interim/gazatteer_combined.csv"
df = pd.read_csv(gazetteer_csv)

In [None]:
# Show the column names of the loaded table.
df.columns

In [None]:
# Build point geometries from the x/y coordinate columns and attach them,
# turning the table into a GeoDataFrame.
# NOTE(review): no CRS is set here — confirm which coordinate system x/y use.
point_geometry = gpd.points_from_xy(df.x_coordinate, df.y_coordinate)
df = gpd.GeoDataFrame(df, geometry=point_geometry)

We want to generate features relating to these factors:
* Property structure
* Property tenure
* Number of residents
* Turnover of residents
* Evidence of a negligent landlord
* Evidence of housing pressure

## Property Tenure

In [None]:
# Preview the raw columns that the tenure feature is derived from.
tenure_input_cols = ['date_lr', 'lodgement_date_epc', 'transaction_type_epc', 'social_housing']
df[tenure_input_cols].head(20)

In [None]:
# Parse both date columns so they can be compared chronologically.
df = df.assign(**{
    date_col: pd.to_datetime(df[date_col])
    for date_col in ('date_lr', 'lodgement_date_epc')
})

In [None]:
# Derive a tenure label from whichever of the two dated records is the
# most recent: the EPC lodgement or the `date_lr` entry.
has_lr = df.date_lr.notna()
has_epc = df.lodgement_date_epc.notna()

# The EPC record wins when it post-dates date_lr, or when it is the only
# one of the two dates present.
epc_is_latest = (df.lodgement_date_epc > df.date_lr) | (~has_lr & has_epc)
# na=False: rows with no transaction type must simply not match, rather
# than relying on pandas coercing NaN inside the boolean mask.
epc_rental_or_sale = df.transaction_type_epc.str.contains("rental|sale", na=False)
use_epc = epc_is_latest & epc_rental_or_sale

# The date_lr record wins when it post-dates the EPC, or when it is the
# only date present; those rows are labelled as a sale.
lr_is_latest = (df.lodgement_date_epc < df.date_lr) | (has_lr & ~has_epc)

df['tenure'] = 'Unknown'
df.loc[use_epc, 'tenure'] = df.loc[use_epc, 'transaction_type_epc']
df.loc[lr_is_latest, 'tenure'] = "sale"
# Fall back to the social-housing flag only where nothing above applied.
df.loc[(df.social_housing) & (df.tenure == "Unknown"), 'tenure'] = 'Social Rent'

In [None]:
# Inspect the inputs next to the freshly derived (not yet relabelled) tenure.
tenure_check_cols = ['date_lr', 'lodgement_date_epc', 'transaction_type_epc', 'social_housing', 'tenure']
df[tenure_check_cols].head(20)

In [None]:
# Map the raw transaction-type strings onto the final tenure categories.
# The first two replacements are exact whole-value matches (Series.replace).
# The last two are regular expressions, so regex=True must be explicit:
# since pandas 2.0 `str.replace` matches literally by default, which would
# leave "sale" and "rental (social)" values unchanged.
df['tenure'] = (df.tenure
                .replace("rental", "Private Rent")
                .replace("rental (private)", "Private Rent")
                .str.replace('rental.*social.*', "Social Rent", regex=True)
                .str.replace(".*sale.*", "Owner Occupied", regex=True))

In [None]:
# Inspect the inputs next to the relabelled tenure categories.
tenure_result_cols = ['date_lr', 'lodgement_date_epc', 'transaction_type_epc', 'social_housing', 'tenure']
df[tenure_result_cols].head(20)

Other borough-held data that could help determine tenure include:
* Tenancy Deposit data
* Further social housing data
* Right to buy data

## Property Structure
* Size
* Age
* Type



In [None]:
# List the columns whose name ends in "ukb".
is_ukb_col = df.columns.str.endswith("ukb")
df.columns[is_ukb_col]

In [None]:
# Show the candidate source columns for the building type feature.
building_type_sources = ['dwelling_type_text_ukb', 'property_type_epc', 'built_form_epc', 'tertiary_desc']
df[building_type_sources]

In [None]:
# Fill building_type from the first source that has a usable value, in
# priority order: built_form_epc, dwelling_type_text_ukb, tertiary_desc.
df['building_type'] = 'Unknown'

# 1) EPC built form, ignoring missing values and the "NO DATA!" placeholder.
has_epc_form = df.built_form_epc.notna() & (df.built_form_epc != "NO DATA!")
df.loc[has_epc_form, 'building_type'] = df.loc[has_epc_form, 'built_form_epc']

# 2) UKB dwelling type, only for rows still marked Unknown.
needs_ukb = df.dwelling_type_text_ukb.notna() & (df.building_type == "Unknown")
df.loc[needs_ukb, 'building_type'] = df.loc[needs_ukb, 'dwelling_type_text_ukb']

# 3) Whatever tertiary_desc holds (possibly missing) for the remainder.
still_unknown = df.building_type == "Unknown"
df.loc[still_unknown, 'building_type'] = df.loc[still_unknown, 'tertiary_desc']

df.building_type.value_counts()

In [None]:
# Collapse the raw building descriptions into a small set of categories.
# Any value mentioning "flat" becomes "flat", any mentioning "terrace"
# becomes "terrace". regex=True must be explicit: since pandas 2.0
# `str.replace` matches literally by default, so these wildcard patterns
# would never match and every flat/terrace would fall through to 'other'.
df['building_type'] = (df.building_type
                       .str.lower()
                       .str.replace(".*flat.*", "flat", case=False, regex=True)
                       .str.replace(".*terrace.*", "terrace", case=False, regex=True))

# Anything outside the four recognised categories (including missing
# values) is grouped as 'other'.
df.loc[~df.building_type.isin(['terrace', 'flat', 'semi-detached', 'detached']), 'building_type'] = 'other'
df.building_type.value_counts()

In [None]:
# Flag a property as a flat when any of the three sources says so.
# na=False makes a missing dwelling_type_text_ukb count as "not a flat"
# explicitly. Assigning the boolean mask directly gives a proper bool
# column instead of an object column that needs a follow-up fillna.
is_flat = (
    df.property_type_epc.isin(['Flat', 'Maisonette'])
    | (df.building_type == "flat")
    | df.dwelling_type_text_ukb.str.contains("flat", case=False, na=False)
)
df['flat'] = is_flat
pd.crosstab(df.building_type, df['flat'])

In [None]:
# Total room count from the UKB columns: bedrooms + wet rooms + receptions.
# Plain addition is used, so a missing value in any part gives a missing total.
ukb_room_total = (
    df['bedroom_number_ukb']
    + df['wet_room_number_ukb']
    + df['reception_number_ukb']
)
df = df.assign(ukb_rooms=ukb_room_total)

In [None]:
# Cross-tabulate the UKB room total against the EPC habitable-room count.
pd.crosstab(index=df['ukb_rooms'], columns=df['number_habitable_rooms_epc'])

In [None]:
# Use the UKB figures as the final bedroom and room features.
df = df.assign(
    bedrooms=df['bedroom_number_ukb'],
    rooms=df['ukb_rooms'],
)

In [None]:
# Show the candidate source columns for the build-year feature.
build_age_sources = ['building__age_text_ukb', 'new_build_lr', 'date_lr', 'construction_age_band_epc']
df[build_age_sources]

In [None]:
# List the distinct UKB age-band labels to see their text format.
df['building__age_text_ukb'].unique()

In [None]:
# Pull lower/upper year bounds out of the UKB age-band text: the lower
# bound is a 4-digit year preceded by a space, the upper bound a 4-digit
# year preceded by a hyphen.
ukb_band_text = df['building__age_text_ukb']
df = df.assign(
    ukb_min=pd.to_numeric(ukb_band_text.str.extract(" ([0-9]{4})", expand=False)),
    ukb_max=pd.to_numeric(ukb_band_text.str.extract("-([0-9]{4})", expand=False)),
)

# Bands with a lower bound but no upper bound are capped at the current year.
ukb_open_ended = df.ukb_min.notna() & df.ukb_max.isna()
df.loc[ukb_open_ended, 'ukb_max'] = datetime.today().year

df[['building__age_text_ukb', 'ukb_min', 'ukb_max']].drop_duplicates()

In [None]:
# List the distinct EPC construction age-band labels to see their text format.
df['construction_age_band_epc'].unique()

In [None]:
# Pull lower/upper year bounds out of the EPC age-band text, using the same
# patterns as for the UKB bands (space-prefixed and hyphen-prefixed years).
epc_band_text = df['construction_age_band_epc']
df = df.assign(
    epc_min=pd.to_numeric(epc_band_text.str.extract(" ([0-9]{4})", expand=False)),
    epc_max=pd.to_numeric(epc_band_text.str.extract("-([0-9]{4})", expand=False)),
)

# A lone 1900 with no upper bound is recoded to the range 1800-1900.
lone_1900 = (df.epc_min == 1900) & df.epc_max.isna()
df.loc[lone_1900, ['epc_min', 'epc_max']] = [1800, 1900]

# A lone 2007 with no upper bound is capped at the current year.
lone_2007 = (df.epc_min == 2007) & df.epc_max.isna()
df.loc[lone_2007, 'epc_max'] = datetime.today().year

df[['construction_age_band_epc', 'epc_min', 'epc_max']].drop_duplicates()

In [None]:
# Use the year of date_lr as both build-year bounds, but only for rows
# flagged as a new build (new_build_lr == "Y"); every other row gets NaN.
# `Series.where` replaces the former `pd.np.NAN` assignment: the `pd.np`
# alias was removed in pandas 2.0 and raises AttributeError there.
lr_build_year = df.date_lr.dt.year.where(df.new_build_lr == "Y")
df = df.assign(lr_min=lr_build_year, lr_max=lr_build_year)

# Preview last so the cell actually displays it.
df[['date_lr', 'lr_min', 'lr_max']].drop_duplicates().head()

In [None]:
# Intersect the three sources' ranges: the tightest lower bound is the
# largest minimum, the tightest upper bound is the smallest maximum.
lower_bounds = df[['epc_min', 'lr_min', 'ukb_min']]
upper_bounds = df[['epc_max', 'lr_max', 'ukb_max']]
df['build_age_min'] = lower_bounds.max(axis=1)
df['build_age_max'] = upper_bounds.min(axis=1)

In [None]:
# Scatter the combined lower bound against the combined upper bound.
year_bounds = df[['build_age_min', 'build_age_max']]
year_bounds.plot.scatter(x='build_age_min', y='build_age_max')

In [None]:
# Show the combined build-year bounds.
year_bound_cols = ['build_age_min', 'build_age_max']
df[year_bound_cols]

In [None]:
# Where the sources contradict each other (lower bound above upper bound),
# discard the combined bounds. float("nan") replaces the former
# `pd.np.NAN`: the `pd.np` alias was removed in pandas 2.0.
contradictory = df.build_age_min > df.build_age_max
df.loc[contradictory, ['build_age_min', 'build_age_max']] = float("nan")

# Refill any missing bound from the individual sources, taking the first
# available in the order lr, ukb, epc.
for ds in ['lr', 'ukb', 'epc']:
    for tp in ['min', 'max']:
        df[f'build_age_{tp}'] = df[f'build_age_{tp}'].combine_first(df[f'{ds}_{tp}'])

# build_age is the midpoint of the two year bounds.
df['build_age'] = df[['build_age_min', 'build_age_max']].mean(axis=1)

In [None]:
# Map every point coloured by its build_age value, with the axes hidden.
fig, build_age_ax = plt.subplots(1)
build_age_ax = df.plot(column='build_age', markersize=0.25, ax=build_age_ax)
build_age_ax.set_axis_off()
plt.show()

In [None]:
# Histogram of the build_age values.
build_age_only = df[['build_age']]
build_age_only.plot.hist(y='build_age')

## Number of tenants

In [None]:
# Columns ending in "_census", excluding those that start with "merge_".
ends_with_census = df.columns.str.endswith("_census")
starts_with_merge = df.columns.str.startswith("merge_")
census_cols = df.columns[ends_with_census & ~starts_with_merge]

In [None]:
# Show the household-composition census columns with the common name
# fragments stripped, so the distinguishing part is easier to read.
composition_labels = census_cols[census_cols.str.startswith("household_composition")]
for fragment in ("household_composition_", "_census", "_measures_value"):
    composition_labels = composition_labels.str.replace(fragment, "")
composition_labels

In [None]:
# Sum the household-composition columns for "other_household" categories
# whose name contains a 3 or a 4.
# NOTE(review): "3|4" matches those digits anywhere in the column name —
# confirm it only picks up the intended bedroom-count columns.
all_cols = df.columns
is_composition = all_cols.str.startswith("household_composition")
is_other_household = all_cols.str.contains("other_household")
mentions_3_or_4 = all_cols.str.contains("3|4")
selected_cols = all_cols[is_composition & is_other_household & mentions_3_or_4]
df['other_households_3bed_plus'] = df[selected_cols].sum(axis=1)

In [None]:
# Map every point coloured by other_households_3bed_plus, with the axes hidden.
fig, households_ax = plt.subplots(1)
households_ax = df.plot(column='other_households_3bed_plus', markersize=0.25, ax=households_ax)
households_ax.set_axis_off()
plt.show()

## Neglectful Landlord

In [None]:
# Treat non-positive energy consumption figures as missing: if either the
# current or the potential value is <= 0, blank both so the ratio below is
# NaN for that row. float("nan") replaces the former `pd.np.NAN`: the
# `pd.np` alias was removed in pandas 2.0.
non_positive_energy = (
    (df.energy_consumption_potential_epc <= 0)
    | (df.energy_consumption_current_epc <= 0)
)
df.loc[non_positive_energy,
       ['energy_consumption_current_epc', 'energy_consumption_potential_epc']] = float("nan")

# Ratio of current to potential energy consumption.
df = df.assign(energy_eff_def = df.energy_consumption_current_epc / df.energy_consumption_potential_epc)

## Housing Pressure

* Crime
* Airbnb
* IMD

In [None]:
# Keep only the identifier and feature columns, tidy one column name, and
# write the result to the interim features file.
feature_cols = [
    'uprn', 'geo_address', 'postcode',
    'tenure', 'social_housing',
    'building_type', 'flat', 'bedrooms', 'rooms', 'build_age',
    'other_households_3bed_plus',
    'energy_eff_def',
    'asb_sum_crime', 'price_pp_median_abnb', 'imd_decile_imd',
    'hmo',
]
df = df[feature_cols].rename(columns={'imd_decile_imd': 'imd_decile'})
df.to_csv("../data/interim/features.csv", index=False)