In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import subprocess
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
from datetime import datetime
import matplotlib.pyplot as plt  
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import sys
from IPython.core.interactiveshell import InteractiveShell
import warnings

In [2]:
# Notebook display configuration
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Render float columns with thousands separators and two decimal places
pd.set_option("display.float_format", "{:,.2f}".format)

# Suppress all warnings for the rest of the session
warnings.filterwarnings("ignore")

In [3]:
# Location of the tab-delimited text export that is read line by line in the next cell.
# NOTE(review): the leading ".../" looks like a placeholder — point this at the real directory before running.
txt_file = ".../BERPublicsearch.txt"

In [4]:
# Method 1: parse the export by hand.
# Every line (header included) is stripped of surrounding whitespace and split
# on tabs, producing one list of string fields per line.
# NOTE(review): strip() also removes leading/trailing tab characters, so empty
# fields at either end of a line are lost before the split — confirm this is acceptable.
data = []

with open(txt_file, 'r', encoding='latin1') as file:
    data.extend(line.strip().split('\t') for line in file)

In [5]:
# `data` is a list of rows, each row being a list of string fields.
# The first row holds the column names; every remaining row is a record.
column_names = data[0]
data_input = data[1:]

In [6]:
# Build a DataFrame from the parsed rows, labelling the columns with the header row.
# Fields are still strings at this point (they came from str.split); numeric casts happen in later cells.
SEAI_data = pd.DataFrame(data=data_input, columns=column_names)

In [None]:
# Overview of the data: dimensions, first rows, column names and dtypes.
# All four expressions are displayed because ast_node_interactivity is set to "all" in the settings cell.
SEAI_data.shape
SEAI_data.head()
SEAI_data.columns
SEAI_data.dtypes

In [None]:
# Count the null entries in the construction-year column.
# NOTE(review): the fields were produced by str.split, so blank values are likely
# empty strings, which isnull() does not count — confirm before relying on this figure.
SEAI_data.Year_of_Construction.isnull().sum()

# Helper to review the proportion of missing values in each column
def missing_columns(df):
    """Return the fraction of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the share of rows for which
        ``isnull()`` is true in that column. The Series is empty when ``df``
        has no columns, and NaN for every column when ``df`` has no rows
        (0 / 0).
    """
    # isnull().sum() already works column-wise on the whole frame, so a single
    # vectorised expression replaces the former per-column loop (which repeated
    # the same computation and left the result undefined for a column-less frame).
    return df.isnull().sum() / len(df)

# Share of missing values for the two columns of interest
missing_columns(SEAI_data[['CountyName', 'Year_of_Construction']])


In [None]:
# Summary statistics for the construction-year column.
# (`include` only influences DataFrame.describe, so it is not passed for a Series.)
SEAI_data["Year_of_Construction"].describe()

In [None]:
# Number of non-null BerRating entries for each distinct Year_of_Construction value
SEAI_data.groupby('Year_of_Construction')['BerRating'].count()

In [11]:
# Function used to clean imported data.
# .assign : steps used to update variables

# Calendar year at the time this cell runs; default reference point for the dwelling age
current_year = datetime.now().year

def tweak_jb(df, reference_year=None):
    """Select the modelling columns and derive the age of each dwelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'BerRating', 'CountyName', 'SA_Code',
        'DwellingTypeDescr' and 'Year_of_Construction'.
    reference_year : int, optional
        Year the age is measured from. Defaults to the module-level
        ``current_year``. Pass a fixed year to get the same ages no matter
        when the notebook is executed.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the five selected
        columns, with ``Year_of_Construction`` cast to int, plus
        ``Age_dwelling = reference_year - Year_of_Construction``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError, TypeError
        If a ``Year_of_Construction`` value cannot be converted to int
        (for example a blank string or a missing value). The cast is kept
        strict on purpose so bad input is noticed rather than silently dropped.
    """
    # Resolve the reference year at call time so the default keeps following current_year
    year = current_year if reference_year is None else reference_year
    return (
        df
        .loc[:, ['BerRating', 'CountyName', 'SA_Code', 'DwellingTypeDescr', 'Year_of_Construction']]
        .assign(Year_of_Construction=lambda df_: df_.Year_of_Construction.astype(int),
                # Uses the already-cast integer column assigned just above
                Age_dwelling=lambda df_: year - df_.Year_of_Construction,
               )
    )

In [None]:
# Apply the cleaning step: keep the selected columns, cast the construction year and add Age_dwelling
data_p1 = tweak_jb(SEAI_data)
data_p1.head()

In [13]:
def sampling_data(input_data, sample_perc, stratification=None, seed=None):
    """Draw a random fraction of rows, optionally within each stratum.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to sample from.
    sample_perc : float
        Fraction of rows to keep (passed to pandas as ``frac``). With
        stratification the fraction is applied to every group separately.
    stratification : label or list of labels, optional
        Column(s) to group by before sampling. ``None`` samples the frame as
        a whole.
    seed : int, optional
        Seed for the random draws. ``None`` gives a fresh, non-reproducible
        sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with their original index labels and all columns.

    Notes
    -----
    pandas' groupby skips rows whose grouping key is NaN by default, so such
    rows are never part of a stratified sample.
    """
    # A private generator produces the same draws that np.random.seed(seed)
    # followed by unseeded sampling would, without overwriting the global
    # NumPy random state that other code may rely on.
    rng = np.random.RandomState(seed)
    if stratification is not None:
        # GroupBy.sample draws inside each group and keeps the grouping column(s);
        # it avoids groupby.apply, whose handling of grouping columns is deprecated
        # in recent pandas releases.
        data = input_data.groupby(stratification).sample(frac=sample_perc, random_state=rng)
    else:
        data = input_data.sample(frac=sample_perc, random_state=rng)
    return data


In [None]:
# Load the county lookup table (County_NUTS3_OTHER.csv)
country_nuts3 = pd.read_csv(".../County_NUTS3_OTHER.csv")

# Left-join the lookup onto the cleaned data, then tidy column names and dtypes
data_p2 = (
    data_p1
    .merge(country_nuts3, how='left', left_on='CountyName', right_on='Factor_Level')
    .drop(columns=['Factor', 'Factor_Level'])
    .rename(columns={'group_id': 'CountyName_new'})
    .astype({
        'BerRating': 'float',
        'CountyName': 'category',
        'SA_Code': 'category',
        'DwellingTypeDescr': 'category',
        'CountyName_new': 'category',
    })
    # Power transform of the dwelling age.
    # NOTE(review): the constants 49.53 and 0.391 are not explained in this notebook — record their source.
    .assign(Age_dwelling2=lambda df_: 49.53 * df_.Age_dwelling ** 0.391)
)

data_p2.head()

In [None]:
# Check that the float/category casts from the merge step were applied
data_p2.dtypes

In [15]:
# Add a sequential row ID (1..n) to data_p2 so sampled rows can be identified afterwards
data_p2['ID'] = range(1, len(data_p2) + 1)

# Training set: 80% of the rows drawn within each distinct BerRating value (fixed seed)
# NOTE(review): BerRating was cast to float in the merge step; if it is effectively
# continuous, most strata hold very few rows and rounding 80% of a tiny group skews
# the split — confirm this stratification is intended.
training_data = sampling_data(data_p2, 0.80, "BerRating", seed=256)

# Validation set: every row whose ID was not drawn into the training set
validation_data = data_p2[~data_p2['ID'].isin(training_data['ID'])]

# Drop the 'ID' column from the datasets if it's not needed further
# training_data.drop('ID', axis=1, inplace=True)
# validation_data.drop('ID', axis=1, inplace=True)

In [None]:
# Sanity check of the split: sizes and first rows of both datasets
training_data.shape
validation_data.shape
training_data.head()
validation_data.head()