# Multi Linear Regression Model

In [None]:
# Libraries for data loading, data manipulation and data visulisation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
sns.set()

# Libraries for data preparation and model building

import statsmodels.api as sm
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

import sys
# Insert the parent path relative to this notebook so we can import from the src folder.
sys.path.insert(0, "..")
from src.data.make_dataset import split_data
from src.data.make_dataset import reg_metrics


# Import modules for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

import boto3

# Display options: show every column of wide frames ...
pd.set_option('display.max_columns', None)
# ... but render at most 20 rows. `reset_option` only restores an option's default and
# does not accept a value, so `set_option` is the call that actually applies the limit.
pd.set_option('display.max_rows', 20)

In [None]:
# Instantiate the S3 client without embedding credentials in the notebook.
# boto3 resolves credentials on its own from the environment
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY), the shared ~/.aws/credentials file,
# or an attached IAM role.
# SECURITY: an access key pair used to be hardcoded in this cell. It must be treated as
# compromised and rotated in IAM; never commit keys to a notebook.
client = boto3.client('s3')

In [None]:
# S3 bucket name
bucket = "2207-17-fibre-competitive-intensity-model-b"

In [None]:
# Build the public HTTPS URLs of the CSV objects stored in the project's S3 bucket
S3_BASE_URL = 'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com'

uptake_file_path = f'{S3_BASE_URL}/Data+for+Modeling/ward-data-cleaned.csv'
admin_bound_path = f'{S3_BASE_URL}/Preprocessed+Data/admin_boundaries_2011.csv'
mun_uptake_rate = f'{S3_BASE_URL}/Preprocessed+Data/SA-municipality-uptake-rate.csv'

In [None]:
# Load the dataset
data = pd.read_csv(uptake_file_path)
admin_bound = pd.read_csv(admin_bound_path)
mun_rate = pd.read_csv(mun_uptake_rate)

## Exploratory Data Analysis

In [None]:
# Preview the top 5 rows of the dataset
data.head()

In [None]:
# Preview the admin level dataset
admin_bound.head()

In [None]:
# Drop the municipality, province and ward-ID columns from the ward dataset.
# NOTE(review): presumably because admin_bound supplies these columns in the merge that follows — confirm.
data = data.drop(['MUNICNAME','PROVINCE','WARD_ID'], axis=1)

In [None]:
# Combine the admin and the data dataset
data = pd.merge(admin_bound, data, left_on=['WARD_ID'], right_on=['ward_code'])

In [None]:
# Check for correlation between target variable and the features
data.corr(numeric_only=True)['uptake_rate_hh'].sort_values(ascending=False)

Quite a number of the features are strongly correlated with the target variable. This is a strong indication that a linear model may perform well on this dataset.

In [None]:
# Check for multicolinearity
plt.figure(figsize=(25,15))
sns.heatmap(data.corr(numeric_only=True),
            vmin = -1, 
            vmax = 1,
            fmt=".1f",
            cmap ="GnBu",
            annot=True)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

Multicollinearity can be observed among several of the features.

In [None]:
# Check for missing values
data.isnull().sum().sum()

There are no missing values.

In [None]:
# Check the distribution of the target variable (household uptake rate).
# The skewness and kurtosis reported in the following cell are computed on
# `uptake_rate_hh`, so the histogram has to show that same column.
sns.histplot(data['uptake_rate_hh'], kde=True)
plt.ylim(0,300)
plt.show()

In [None]:
print("skewness:", data['uptake_rate_hh'].skew())
print("Kurtosis:", data['uptake_rate_hh'].kurtosis())

The target variable has a right-skewed distribution with a skewness of 4.0 and a kurtosis of 30, which makes it leptokurtic. All of this confirms that the target variable is not normally distributed, and it would not be wise to develop a linear model with such data.

In [None]:
# Check the data types of the dataset
data.dtypes

## Data Preprocessing

In [None]:
# Make a copy of the dataset
df = data.copy()

In [None]:
# Binary location flag: 1 for wards in Western Cape or Gauteng (the provinces with a
# relatively high fibre uptake rate), 0 for wards in every other province
high_uptake_provinces = ['Western Cape', 'Gauteng']
df['Location'] = df['PROVINCE'].isin(high_uptake_provinces).astype('int64')

In [None]:
# Extract data points with uptake rates only
df = df[df['uptake_rate_hh'] != 0]

In [None]:
# Drop columns that are not required for model development
df = df.drop(['ward', 'ward_code','avg_d_kbps','avg_u_kbps','avg_lat_ms','fiber','devices','total_tiles',
                'uptake_rate/population','uptake_rate/households','total_income','population','households','MUNICNAME',
                'WARDNO', 'geography','PROVINCE','Unnamed: 0','WARD_POP','Area'], axis=1)

In [None]:
# Set the ward ID as the index
df = df.set_index('WARD_ID')

In [None]:
# Log-transform the target variables (natural log) and store them in the *_norm columns.
# Rows with uptake_rate_hh == 0 were filtered out earlier, so the household log is finite.
# NOTE(review): uptake_rate_pop is not filtered separately; this assumes it is also strictly positive — confirm.
df['uptake_rate_hh_norm'] = np.log(df['uptake_rate_hh'])
df['uptake_rate_pop_norm'] = np.log(df['uptake_rate_pop'])

In [None]:
# Compare the household uptake rate before (left) and after (right) the log transform
fig, axes = plt.subplots(1, 2, figsize =(15,3))

panels = [('uptake_rate_hh', "Uptake_rate"), ('uptake_rate_hh_norm', "Normalized_uptake_rate")]
for panel_ax, (panel_column, panel_title) in zip(axes, panels):
    sns.histplot(df[panel_column], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()

## Check for OLS Assumptions

### OLS Assumptions include:

- Linearity
- No Endogeneity
- Normality and Homoscedasticity
- No Autocorrelation
- No Multicolinearity

In [None]:
# def check_linearity(df):
#     '''
#     This function takes in a dataframe and return a scatter plot visual of the first three columns
#     of the dataframe's features against the dependent variable.
    
#     '''
#     columns = df.columns
    
#     f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize =(15,3)) #sharey -> share 'Price' as y
#     ax1.scatter(df[columns[0]], df['uptake_rate_hh'])
#     ax1.set_title(f'load_shortfall_3h and {columns[0]}')
# #     ax2.scatter(df[columns[1]], df['uptake_rate_hh'])
# #     ax2.set_title(f'load_shortfall_3h and {columns[1]}')
# #     ax3.scatter(df[columns[2]], df['uptake_rate_hh'])
# #     ax3.set_title(f'load_shortfall_3h and {columns[2]}')
    
#     return plt.show()

In [None]:
# Draw one histogram (with KDE overlay) per column of the prepared frame
features = df.columns

for column_name in features:
    column_values = df[column_name]
    sns.histplot(column_values, kde=True)
    plt.show()

As seen from the plots, some of the features need to be adjusted for outliers to avoid misleading the model.

### No Endogeneity of the Regressors Assumption

- Endogeneity occurs when there is omitted variable bias, which makes our independent variables correlated with the residuals. Relaxing this assumption can be very difficult.
>
- We could explore this assumption further if multiple linear regression turns out to be a good fit for our data.

### Normality and Homoscedasticity Assumption

- Normality is assumed for a big sample using central limit theorem
- Zero mean of the distribution of errors is accomplished with the inclusion of intercept in the regression
- Homoscedasticity means equal variance among the error terms.

In this case, we know that the target variable is not normally distributed, but the normalization done earlier is sufficient for now.

### No Autocorrelation Assumption

- The no-autocorrelation assumption states that the errors are uncorrelated with one another.
- Autocorrelation is rarely observed in data captured at a single point in time.
- It is very common in time series data.
- Since our data is not time series data, we do not expect to encounter autocorrelation problems. However, there is a need to verify this.

One way to check for autocorrelation is the Durbin-Watson test, and fortunately we can easily get this result from statsmodels, in the summary statistics of our trained model.

In [None]:
# Fit an OLS model with statsmodels so that its summary statistics can be printed

# Target: the log-transformed household uptake rate
y = df['uptake_rate_hh_norm']

# Candidate features: every column except the raw and log-transformed targets
target_columns = ['uptake_rate_pop','uptake_rate_hh','uptake_rate_hh_norm','uptake_rate_pop_norm']
x1 = df.drop(columns=target_columns)

# Subset of features used for this diagnostic fit
ols_columns = ['higher_education','percent_employed','average_income','percent_unemployed','percent_more_15_less_25']
x2 = x1[ols_columns]

# Standardise the subset, keeping the column labels and the original index
scalar_OLS = StandardScaler()
x2_scale = scalar_OLS.fit_transform(x2)
x2 = pd.DataFrame(x2_scale, columns=ols_columns, index=x2.index)

# Add the intercept column, fit, and print the summary
x = sm.add_constant(x2)
ols_model = sm.OLS(y, x)
result = ols_model.fit()
print(result.summary())

The Durbin-Watson score is approximately 2, which shows that there is no autocorrelation in the dataset. We expected this, since the dataset is not time series data.

### No Multicollinearity Assumption

- This can be verified by computing the VIF

#### Variance Inflation Factor

sklearn does not have a built-in way to check for multicollinearity.
One of the main reasons is that this issue is well covered in statistical frameworks rather than in ML ones,
so to calculate the VIF we have to rely on statsmodels.
To make this as easy as possible, we declare a variable that holds
all the features we want to check for multicollinearity.

In [None]:
# Compute the variance inflation factor (VIF) of every candidate feature.
# All candidate features are numerical, so they can be handed to statsmodels directly.
variables = x1

# variance_inflation_factor regresses each column on all the others and does NOT add an
# intercept by itself, so the design matrix must carry an explicit constant; without it
# the VIFs are uncentred and come out inflated.
vif_design = sm.add_constant(variables)

# One VIF per feature (the measure is variable specific, not model specific).
# The added constant is skipped: it is only there to centre the auxiliary regressions.
vif = pd.DataFrame()
vif["VIF"] = [
    variance_inflation_factor(vif_design.values, position)
    for position, column in enumerate(vif_design.columns)
    if column != "const"
]
# Include the feature names so the result is easier to explore
vif["Features"] = variables.columns

# Show the table (the cell previously built it without displaying it)
vif

According to the VIFs, all features are multicollinear if the team uses the rule-of-thumb threshold of 10.

## Declare the Inputs and Target Variables

In [None]:
# Declare the targets and the inputs
# The dependent variable is uptake_rate_hh_norm for parametric models
target = df['uptake_rate_hh_norm']

# The dependent variable is uptake_rate_hh for non parametric models
target_nonp = df['uptake_rate_hh']

# The inputs is everything BUT the dependent variables, so we can simply drop it
inputs = df.drop(['uptake_rate_pop','uptake_rate_hh','uptake_rate_hh_norm', 'uptake_rate_pop_norm'], axis = 1)

## Separate Test dataset from the Train dataset

In [None]:
# Separate the testing data from the training data. We will use a 20% split here
# split_value = int(len(df) * 0.2)

# # Split the input and target data into teat and train components
# input_test = inputs.iloc[-split_value:, :]
# target_test = target.iloc[-split_value:]
# target_test_nonp = target_nonp.iloc[-split_value:]

# input_train = inputs.iloc[:-split_value, :]
# target_train = target.iloc[:-split_value]
# target_train_nonp = target_nonp.iloc[:-split_value]

In [None]:
# Split the data into training and testing sets.
# Both calls pass the same inputs and the same trailing arguments (0.2, 365) — presumably the
# test fraction and the random seed of the project helper; confirm against src.data.make_dataset.
# NOTE(review): the second call overwrites x_train/x_test, so this relies on split_data returning
# the same rows for both calls so that the two targets stay aligned — confirm.
# Log-transformed target, for the parametric models
x_train, x_test, y_train, y_test = split_data(inputs, target, 0.2, 365)
# Raw target, for the non-parametric models
x_train, x_test, y_train_nonp, y_test_nonp = split_data(inputs, target_nonp, 0.2, 365)

## Scale the Dataset

In [None]:
# Standardise the features: the scaler is fitted on the training split only, and the
# same fitted transformation is then applied to both the training and the test split
scaler = StandardScaler()
scaler.fit(x_train)

# Wrap the scaled arrays back into data frames, keeping column labels and indices
x_train_scaled = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns, index = x_train.index)
x_test_scaled = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns, index = x_test.index)

## Model Development

### Multi Linear Regression Algorithm

In [None]:
# Create a linear regression object
lm = LinearRegression()

In [None]:
# Make a copy of the train and test datasets
x_train_copy = x_train_scaled.copy()
x_test_copy = x_test_scaled.copy()

In [None]:
# Create a feature selection object
sfs1 = sfs(lm, k_features=4, forward=True, verbose=2, scoring='r2')

In [None]:
# Get the features with high predictive power
sfs1 = sfs1.fit(x_train_copy,y_train)

In [None]:
# Store the features in a list
lr_features = list(sfs1.k_feature_names_)

In [None]:
# Slice training data with the features
x_train_scaled = x_train_copy[lr_features]
x_test_scaled = x_test_copy[lr_features]

In [None]:
# Fit the regression with the inputs and targets
lm.fit(x_train_scaled,y_train)

In [None]:
# Let's check the outputs of the regression
# I'll store them in y_hat as this is the 'theoretical' name of the predictions
y_hat = lm.predict(x_test_scaled)

#### Evaluate Model

In [None]:
# Compute the metrics
reg_metrics(y_test, y_hat,x_train)

In [None]:
# Get the weights of each coefficients
relevant_features = pd.DataFrame(lm.coef_, index=x_train_scaled.columns, 
                                 columns=['Coefficients']).sort_values(by='Coefficients',ascending=False)

In [None]:
relevant_features

In [None]:
# Get the intercept
lm.intercept_

### Ridge Algorithm

In [None]:
# Initialize a repeated K-fold Cross Validator
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=365)

In [None]:
# Perform Ridge regression with repeated K-fold cross validator
ridgecv = RidgeCV(alphas=np.arange(0.1, 15, 0.1), cv=cv, scoring='neg_mean_absolute_error')

In [None]:
# Create a feature selection object
sfs1 = sfs(ridgecv, k_features=4, forward=True, verbose=2, scoring='r2')

In [None]:
# Get the features with high predictive power
sfs1 = sfs1.fit(x_train_copy,y_train)

In [None]:
# Store the features in a list
ridge_features = list(sfs1.k_feature_names_)

In [None]:
# Slice training data with the features
x_train_scaled = x_train_copy[ridge_features]
x_test_scaled = x_test_copy[ridge_features]

In [None]:
# Fitting the RidgeCV regressor
ridgecv.fit(x_train_scaled, y_train)
print("Ridge tuning parameter:", (ridgecv.alpha_))

In [None]:
# Predict the y_test values
y_hat = ridgecv.predict(x_test_scaled)

#### Evaluate Algorithm

In [None]:
# Compute the metrics.
# `input_train` only exists in the commented-out manual split, so referencing it raises a
# NameError on a fresh kernel; pass `x_train`, as the other evaluation cells do.
reg_metrics(y_test, y_hat,x_train)

In [None]:
# Get the weights of each coefficients
relevant_features = pd.DataFrame(ridgecv.coef_, index=x_train_scaled.columns, 
                                 columns=['Coefficients']).sort_values(by='Coefficients',ascending=False)

In [None]:
relevant_features

### Lasso Algorithm

In [None]:
# Perform Lasso regression with repeated K-fold cross validation
lassocv = LassoCV(alphas=np.arange(0.1, 15, 0.1), cv=cv, tol = 0.3)

In [None]:
# Create a feature selection object
sfs1 = sfs(lassocv, k_features=4, forward=True, verbose=2, scoring='r2')

In [None]:
# Get the features with high predictive power
sfs1 = sfs1.fit(x_train_copy,y_train)

In [None]:
# Store the features in a list
lasso_features = list(sfs1.k_feature_names_)

In [None]:
# Slice training data with the features
x_train_scaled = x_train_copy[lasso_features]
x_test_scaled = x_test_copy[lasso_features]

In [None]:
# Fitting the Lassocv regressor
lassocv.fit(x_train_scaled, y_train)
print("Lasso tuning parameter:", (lassocv.alpha_))

In [None]:
# Predict the y_test values
y_hat = lassocv.predict(x_test_scaled)

#### Evaluate the Algorithm

In [None]:
# Compute the metrics
reg_metrics(y_test, y_hat,x_train)

In [None]:
# Get the weights of each coefficients
relevant_features = pd.DataFrame(lassocv.coef_, index=x_train_scaled.columns, 
                                 columns=['Coefficients']).sort_values(by='Coefficients',ascending=False)

In [None]:
# Display the coefficients of the features
relevant_features

## Non-Parametric Models

### KNN

In [None]:
# Create a KNN regressor object
knn = KNeighborsRegressor()

In [None]:
# Create a feature selection object
sfs1 = sfs(knn, k_features=4, forward=True, verbose=2, scoring='r2')

In [None]:
# Get the features with high predictive power.
# KNN is trained and evaluated on the raw target (y_train_nonp), so the features are
# selected against that same target rather than the log-transformed y_train.
sfs1 = sfs1.fit(x_train_copy,y_train_nonp)

In [None]:
sfs1.k_feature_names_

In [None]:
# Store the features in a list
knn_features = list(sfs1.k_feature_names_)

In [None]:
# Slice training data with the features
x_train_scaled = x_train_copy[knn_features]
x_test_scaled = x_test_copy[knn_features]

In [None]:
# Train the model
knn.fit(x_train_scaled, y_train_nonp)

In [None]:
# Predict the y_test values
y_hat = knn.predict(x_test_scaled)

#### Evaluate Model

In [None]:
# Compute the metrics
reg_metrics(y_test_nonp, y_hat,x_train)

### Decision Tree Algorithm

In [None]:
# Create a DecisionTree object
reg_tree = DecisionTreeRegressor(random_state=365)

In [None]:
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [None]:
# Create a feature selection object
sfs1 = sfs(reg_tree, k_features=4, forward=True, verbose=2, scoring='r2')

In [None]:
# Get the features with high predictive power.
# The decision tree is trained and evaluated on the raw target (y_train_nonp), so the
# features are selected against that same target rather than the log-transformed y_train.
sfs1 = sfs1.fit(x_train_copy,y_train_nonp)

In [None]:
# Store the features in a list
dt_features = list(sfs1.k_feature_names_)

In [None]:
# Slice training data with the features
x_train = x_train_copy[dt_features]
x_test = x_test_copy[dt_features]

In [None]:
reg_tree.fit(x_train, y_train_nonp)

In [None]:
# Predict the y_test values
y_hat = reg_tree.predict(x_test)

In [None]:
# Compute the metrics
reg_metrics(y_test_nonp, y_hat,x_train)

In [None]:
# Get the weights of each coefficients
DT_feature_importance = pd.DataFrame(reg_tree.feature_importances_, index=x_train.columns, 
                                 columns=['Importance']).sort_values(by='Importance',ascending=False)

In [None]:
DT_feature_importance

### Random Forest

In [None]:
# Create a RandomForest object
RF = RandomForestRegressor(n_estimators=200, max_features='sqrt',random_state=365)

In [None]:
# Create a feature selection object
sfs1 = sfs(RF, k_features=4, forward=True, verbose=2, scoring='r2')

In [None]:
# Get the features with high predictive power.
# The random forest is trained and evaluated on the raw target (y_train_nonp), so the
# features are selected against that same target rather than the log-transformed y_train.
sfs1 = sfs1.fit(x_train_copy,y_train_nonp)

In [None]:
sfs1.k_feature_names_

In [None]:
# Store the features in a list
rf_features = list(sfs1.k_feature_names_)

In [None]:
# Slice training data with the features
x_train = x_train_copy[rf_features]
x_test = x_test_copy[rf_features]

In [None]:
# Train the model
RF.fit(x_train, y_train_nonp)

In [None]:
# Predict the y_test values
y_hat = RF.predict(x_test)

#### Evaluate the Model

In [None]:
# Compute the metrics
reg_metrics(y_test_nonp, y_hat,x_train)

In [None]:
# Get the weights of each coefficients
RF_feature_importance = pd.DataFrame(RF.feature_importances_, index=x_train.columns, 
                                 columns=['Importance']).sort_values(by='Importance',ascending=False)

In [None]:
RF_feature_importance

#### Save the best performing model

In [None]:
# Save the RF model
model_save_path = "RF_ward.pkl"

with open(model_save_path,'wb') as file:
    pickle.dump(RF,file)

In [None]:
# Save the feature names of the model
model_save_path = "RF_ward_features.pkl"

with open(model_save_path,'wb') as file:
    pickle.dump(rf_features,file)

#### Generate Fiber Optics Uptake Rate Predictions for all levels of SA Administration

In [None]:
# Get a segment of the ward data
df_ward = data.loc[:,['MUNICNAME','PROVINCE','ward_code','average_income','Area','population','households','uptake_rate_pop','uptake_rate_hh']]

In [None]:
# Set the ward ID as the index
data_ward = data.set_index('ward_code')

In [None]:
# Get the inputs for ward level data
ward_input_RF = data_ward[rf_features]

In [None]:
# Generate fiber uptake rates for wards in SA using the Random Forest algorithm
predicted_ward_uptake_rate_RF = RF.predict(ward_input_RF)

In [None]:
# Convert the predicted uptake rate to a data frame
df_pred_ward_uptake_rate = pd.DataFrame(predicted_ward_uptake_rate_RF, columns=['predicted_uptake_rate'], index=data_ward.index).round(2)

In [None]:
# Reset the index so that `ward_code` becomes a regular column again for the merge
df_pred_ward_uptake_rate = df_pred_ward_uptake_rate.reset_index()
# Discard the old index of `data` instead of inserting it as an extra "index" column.
# This keeps the frame's columns unchanged and the cell safe to re-run (a second plain
# reset_index() would add yet another column).
data = data.reset_index(drop=True)

In [None]:
# Merge the predicted uptake rates to the demographic data
df_ward_uptake_pred = pd.merge(df_ward, df_pred_ward_uptake_rate, on=['ward_code'])

In [None]:
# Preview the sorted predicted uptake rate
df_ward_uptake_pred.sort_values(by='predicted_uptake_rate', ascending=False)

In [None]:
# Rename the ward codes
df_ward_uptake_pred = df_ward_uptake_pred.rename(columns={'ward_code': 'WARD_ID'})

In [None]:
df_ward_uptake_pred.to_csv('ward-pred-uptake-rate.csv')

### Generate Uptake Rate for Municipality

In [None]:
# Make a copy of the computed municipality uptake rate
mun_rate_copy = mun_rate.copy()
# Allign the names of both dataset municipality
mun_rate_copy['municipality'] = mun_rate_copy['municipality'].apply(lambda x: x + " Local Municipality")

In [None]:
# Select columns of interest
mun_rate_copy = mun_rate_copy[['municipality','uptake_rate_hh']]

In [None]:
# Change the case of the municipalities
mun_rate_copy['municipality'] = mun_rate_copy['municipality'].str.title()
# Title-casing also capitalises the word "of"; lower it again.
# Match "Of" only as a whole word so that names merely containing those letters are left intact.
mun_rate_copy['municipality'] = mun_rate_copy['municipality'].str.replace(r'\bOf\b', 'of', regex=True)

In [None]:
# Group the dataset into municipalities. This will be done in two phases as their aggregation methods are different
data_munic_featurs = data.loc[:,['MUNICNAME'] + rf_features]
df_munic_census = data.loc[:,['MUNICNAME','PROVINCE','average_income','Area','population','households']]

In [None]:
# Aggregate the municipality features
municipality_input_RF = data_munic_featurs.groupby('MUNICNAME').mean()

In [None]:
# Aggregate the municipality census data
munic_census = df_munic_census.groupby(['MUNICNAME','PROVINCE']).sum().astype(int)

In [None]:
# # Scale the inputs
# scaler = StandardScaler()

# municipality_input_RF_scaled = scaler.fit_transform(municipality_input_RF)

# municipality_input_RF_scaled = pd.DataFrame(municipality_input_RF_scaled, columns=municipality_input_RF.columns)

In [None]:
# Generate fiber uptake rates for municipalities in SA using the Random Forest algorithm
predicted_municipality_uptake_rate_RF = RF.predict(municipality_input_RF)

In [None]:
# Convert the predicted uptake rate to a data frame
df_pred_munic_uptake_rate = pd.DataFrame(predicted_municipality_uptake_rate_RF, columns=['predicted_uptake_rate'], index=municipality_input_RF.index).round(2)

In [None]:
# Reset the index of the datasets
df_pred_munic_uptake_rate = df_pred_munic_uptake_rate.reset_index()
munic_census = munic_census.reset_index()

In [None]:
# Merge the predicted uptake rates to the demographic data
df_munic_uptake_pred = pd.merge(munic_census, df_pred_munic_uptake_rate, on=['MUNICNAME'])

In [None]:
# Preview the sorted predicted uptake rate
df_munic_uptake_pred.sort_values(by='predicted_uptake_rate', ascending=False)

In [None]:
# Merge the municipality datasets
df_munic_merge = pd.merge(df_munic_uptake_pred, mun_rate_copy, how='left', left_on='MUNICNAME', right_on='municipality')

In [None]:
# Drop the municipality column
df_munic_merge = df_munic_merge.drop('municipality', axis=1)

In [None]:
# Replace missing valuse with zero for the calculated uptake rate
df_munic_merge = df_munic_merge.fillna(0)

In [None]:
df_munic_merge['uptake_rate_hh'] = df_munic_merge['uptake_rate_hh'].round(2)

In [None]:
# Preview the sorted predicted uptake rate
df_munic_merge.sort_values(by='uptake_rate_hh', ascending=False)

In [None]:
df_munic_merge.to_csv('municipality-pred-uptake-rate.csv')

### Province Uptake Rate

In [None]:
# Group the dataset into provinces. This will be done in two phases as their aggregation methods are different
data_province_featurs = data.loc[:,['PROVINCE'] + rf_features]
df_province_census = data.loc[:,['PROVINCE','average_income','Area','population','households']]

In [None]:
# Aggregate the province features
province_input_RF = data_province_featurs.groupby('PROVINCE').mean()

In [None]:
# Aggregate the province census data.
# Use the province-level selection (`df_province_census`): `df_munic_census` still carries the
# MUNICNAME text column, which does not belong in a per-province numeric total.
province_census = df_province_census.groupby(['PROVINCE']).sum()
# Household and population totals are counts, so store them as integers
province_census[['households','population']] = province_census[['households','population']].astype(int)

In [None]:
# Generate fiber uptake rates for province in SA using the Random Forest algorithm
predicted_province_uptake_rate_RF = RF.predict(province_input_RF)

In [None]:
# Convert the predicted uptake rate to a data frame
df_pred_province_uptake_rate = pd.DataFrame(predicted_province_uptake_rate_RF, columns=['predicted_uptake_rate'], index=province_input_RF.index).round(2)

In [None]:
# Reset the index of the datasets
df_pred_province_uptake_rate = df_pred_province_uptake_rate.reset_index()
province_census = province_census.reset_index()

In [None]:
# Merge the predicted uptake rates to the demographic data
df_province_uptake_pred = pd.merge(province_census, df_pred_province_uptake_rate, on=['PROVINCE'])

In [None]:
# Preview the sorted predicted uptake rate
df_province_uptake_pred.sort_values(by='predicted_uptake_rate', ascending=False)

In [None]:
df_province_uptake_pred.to_csv('province-pred-uptake-rate.csv')