In [38]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from math import sqrt
from scipy import stats

In [2]:
# Load the source workbook into a DataFrame (path is given relative to the working directory).
df = pd.read_excel('combined_5features.xlsx')

In [3]:
# List the column names available in the loaded frame.
df.columns

Index(['state_county', 'label', 'state_name', 'county_name',
       'Land_area (sq miles)', 'life_exp', 'income', 'inflow_exmpt_num',
       'housing_units', 'Population_total'],
      dtype='object')

In [4]:
# Keep the four predictor columns plus 'Population_total' (used later as the target).
# The feature 'inflow_exmpt_num' is not used because it contains -1 values.
data = df[['Land_area (sq miles)', 'life_exp', 'income', 'housing_units', 'Population_total']]
data.shape

(3143, 5)

In [5]:
# Preview the first ten rows of the selected columns.
data.head(10)

Unnamed: 0,Land_area (sq miles),life_exp,income,housing_units,Population_total
0,594.44,75.32,24568.0,22135,54571
1,1589.78,77.48,26469.0,104061,182265
2,884.88,73.97,15875.0,11829,27457
3,622.58,73.55,19918.0,8981,22915
4,644.78,76.08,21070.0,23887,57322
5,622.8,77.6,20289.0,4493,10914
6,776.83,73.39,16916.0,9964,20947
7,605.87,73.14,20574.0,53289,118572
8,596.53,73.61,16626.0,17004,34215
9,553.7,74.08,21322.0,16267,25989


In [6]:
# Feature matrix: all columns up to and including 'housing_units'
# (.loc label slices include the end label), which leaves out 'Population_total'.
X = data.loc[:, :'housing_units']
X.shape

(3143, 4)

In [7]:
# Impute missing values with each column's mean.
# The result is reassigned instead of calling fillna(inplace=True) on X[column]:
# that form is chained assignment on a slice of `data`, which triggers
# SettingWithCopyWarning and leaves X unchanged when pandas Copy-on-Write is enabled.
# Note: the means are computed over all rows, i.e. before the train/test split.
X = X.fillna(X.mean())

In [8]:
# Summary statistics of the features after imputation.
X.describe()

Unnamed: 0,Land_area (sq miles),life_exp,income,housing_units
count,3143.0,3143.0,3143.0,3143.0
mean,1123.736958,77.686997,22505.447486,41904.15
std,3611.418224,2.544701,5408.503948,122768.2
min,2.0,69.05,7772.0,50.0
25%,430.725,75.95,19030.0,5416.5
50%,615.63,77.686997,21777.0,12162.0
75%,923.955,79.345,24813.5,30573.5
max,145504.79,89.5,64381.0,3445076.0


In [9]:
# Count remaining missing values per feature column.
X.isna().sum()

Land_area (sq miles)    0
life_exp                0
income                  0
housing_units           0
dtype: int64

In [10]:
# Regression target: the 'Population_total' column.
y = data['Population_total']
y.shape

(3143,)

In [11]:
# Count missing values in the target.
y.isna().sum()

0

In [12]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [13]:
# Take natural logs of features and target, so the model is fitted on the log-log scale.
X_log_train, X_log_test = np.log(X_train), np.log(X_test)
y_log_train, y_log_test = np.log(y_train), np.log(y_test)

In [14]:
# Fit ordinary least squares on the log-transformed training data.
# fit() returns the estimator itself, so `model` and `lm_log` name the same fitted object.
model = linear_model.LinearRegression().fit(X_log_train, y_log_train)
lm_log = model

In [15]:
# Fitted coefficients of the log-scale model, one per column of X_log_train.
lm_log.coef_

array([-0.02530717,  0.51716738, -0.26568577,  1.06098203])

In [16]:
# Fitted intercept of the log-scale model.
lm_log.intercept_

0.7406667423834303

In [17]:
# In-sample predictions, still on the log scale.
y_log_fit = lm_log.predict(X_log_train)

In [18]:
# Exponentiate to undo the log transform, so fitted values are comparable with y_train.
y_fit = np.exp(y_log_fit)

In [19]:
# Training R^2, computed on the original (back-transformed) scale.
r2_train = r2_score(y_train, y_fit)
r2_train

0.9894043995907322

In [20]:
# Training RMSE on the original scale (square root of the mean squared error).
rmse_train = sqrt(mean_squared_error(y_train, y_fit))
rmse_train

32969.375202035466

In [21]:
# Training mean absolute error on the original scale.
mae_train = mean_absolute_error(y_train, y_fit)
mae_train

10166.284339860542

In [22]:
# Predictions for the held-out rows, still on the log scale.
y_log_pred = lm_log.predict(X_log_test)

In [23]:
# Exponentiate to undo the log transform, so predictions are comparable with y_test.
y_pred = np.exp(y_log_pred)

In [24]:
# Test RMSE on the original scale.
rmse_test = sqrt(mean_squared_error(y_test, y_pred))
rmse_test

34348.05385795133

In [25]:
# Test mean absolute error on the original scale.
mae_test = mean_absolute_error(y_test, y_pred)
mae_test

9454.63174673248

In [26]:
# Test R^2 on the original scale.
r2_test = r2_score(y_test, y_pred)
r2_test

0.9864208018258842

In [27]:
# Predict for every row of X (training and test rows together); features are logged to match the fit.
y_log_pred_all = lm_log.predict(np.log(X))

In [28]:
# Exponentiate the log-scale predictions to return to the scale of y.
y_pred_all = np.exp(y_log_pred_all)

In [29]:
# First ten observed target values, for side-by-side comparison with the predictions.
y.head(10)

0     54571
1    182265
2     27457
3     22915
4     57322
5     10914
6     20947
7    118572
8     34215
9     25989
Name: Population_total, dtype: int64

In [30]:
# First ten predicted values (same row order as y).
y_pred_all[:10]

array([ 46316.09961042, 232205.20004881,  26239.74444736,  18554.44844508,
        52470.55197224,   9103.80444898,  21489.97296241, 121401.3739513 ,
        38377.64878059,  34451.2432719 ])

In [31]:
# Descriptive statistics (count, min/max, mean, variance, skewness, kurtosis) of all predictions.
stats.describe(y_pred_all)

DescribeResult(nobs=3143, minmax=(63.30439526588056, 9400995.180035794), mean=98090.86370878018, variance=99903003735.08351, skewness=13.61770586188247, kurtosis=301.0667670318177)

In [32]:
# Attach the predictions to the full frame as a new column, then preview the first ten rows.
df['Population_pred'] = y_pred_all
df.head(10)

Unnamed: 0,state_county,label,state_name,county_name,Land_area (sq miles),life_exp,income,inflow_exmpt_num,housing_units,Population_total,Population_pred
0,1001,United States - Alabama - Autauga County,Alabama,Autauga County,594.44,75.32,24568.0,4293,22135,54571,46316.09961
1,1003,United States - Alabama - Baldwin County,Alabama,Baldwin County,1589.78,77.48,26469.0,9517,104061,182265,232205.200049
2,1005,United States - Alabama - Barbour County,Alabama,Barbour County,884.88,73.97,15875.0,997,11829,27457,26239.744447
3,1007,United States - Alabama - Bibb County,Alabama,Bibb County,622.58,73.55,19918.0,942,8981,22915,18554.448445
4,1009,United States - Alabama - Blount County,Alabama,Blount County,644.78,76.08,21070.0,2572,23887,57322,52470.551972
5,1011,United States - Alabama - Bullock County,Alabama,Bullock County,622.8,77.6,20289.0,403,4493,10914,9103.804449
6,1013,United States - Alabama - Butler County,Alabama,Butler County,776.83,73.39,16916.0,690,9964,20947,21489.972962
7,1015,United States - Alabama - Calhoun County,Alabama,Calhoun County,605.87,73.14,20574.0,4151,53289,118572,121401.373951
8,1017,United States - Alabama - Chambers County,Alabama,Chambers County,596.53,73.61,16626.0,1531,17004,34215,38377.648781
9,1019,United States - Alabama - Cherokee County,Alabama,Cherokee County,553.7,74.08,21322.0,920,16267,25989,34451.243272


In [33]:
# Write the frame, including the prediction column, to CSV without the row index.
df.to_csv('linear_regression_pred_log-transform.csv', index=False)

In [34]:
# Show predictions that are at most 100 (presumably a sanity check for implausibly small values).
y_pred_all[y_pred_all <= 100]

array([63.30439527])

In [37]:
# Mean absolute error over all rows (training and test rows together).
mean_absolute_error(y, y_pred_all)

9952.765919459729

In [39]:
# Median absolute error over all rows (training and test rows together).
median_absolute_error(y,y_pred_all)

2047.909547165429

In [40]:
# R^2 over all rows (training and test rows together).
r2_score(y,y_pred_all)

0.9886098200518557