# Preprocessing and Modelling Pre-Omicron

In [3]:

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
import seaborn as sns

import csv 
from collections import Counter
import datetime
import holidays
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from scipy.sparse import csr_matrix

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices

from lineartree import LinearTreeRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor



In [51]:
# When first modelling, I had one notebook. I ran the first linear models against the original data and just the 
#school-aged population and school status columns. Below, I am running them with all my engineered features.

merged_df_pre_omi = pd.read_csv('data/pre_omi.csv')

merged_df_pre_omi['Test Date'] = pd.to_datetime(merged_df_pre_omi['Test Date'])

merged_df_pre_omi['School Status'] = merged_df_pre_omi['School Status'].astype('object')
merged_df_pre_omi['School-Aged Population']


0        72091.0
1       122994.0
2       117865.0
3       151282.0
4       131881.0
          ...   
1479     46435.0
1480     33636.0
1481     29986.0
1482     74748.0
1483    240219.0
Name: School-Aged Population, Length: 1484, dtype: float64

In [52]:
# Simple preprocessing. Not going to worry about scaling right now. Just attempting to determine if my data is
# strong enough to warrant continuing.

cat_cols = []

for i in merged_df_pre_omi.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Test Date' or i == 'Holiday':
        pass
    elif merged_df_pre_omi[i].dtype == 'object':
        cat_cols.append(i)
    elif merged_df_pre_omi[i].dtype == 'float64' or 'int64':
        print(i)
    else:
        print("error")


cat_transformer = Pipeline(steps=[  
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))                     
])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
    ])

y = merged_df_pre_omi['Gene Copies (N1/L)']

X = merged_df_pre_omi.drop('Gene Copies (N1/L)', axis=1)

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


Per Capita Gene Copies
Population Served, estimated
School-Aged Population


In [53]:
# Setting up columns to be onehot encoded. We will look at numerical transformation later




# Have to drop the NaNs, otherwise linear model won't work
y_train = y_train.dropna()
X_train = X_train.dropna()
X_test = X_test.dropna()
y_test = y_test.dropna()

In [54]:
linreg = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

linreg.fit(X_train, y_train)


y_pred=linreg.predict(X_test)

linreg.score(X_train,y_train)



0.6932029480436093

In [55]:
linear_model = linreg.named_steps['model']

# Print the coefficients along with column names
for feature_name, coef in zip(X.columns, linear_model.coef_):
    print(f"{feature_name}: {coef}")

Sample Date: -242.25819769575236
Test Date: -1213.8680692830374
WRRF Name: -450.36133955081175
Per Capita Gene Copies: -88.97907742483167
Population Served, estimated: -816.8782392129317
School Status: -599.8194016551173
School-Aged Population: 1037.239553925906
Season: -2612.7220380119747
Holiday: -1610.8580814343418


In [57]:
# Let's try again but with log-transformed targets
y_train_2 = np.log(y_train)
y_test_2 = np.log(y_test)

In [58]:
linreg_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

linreg_2.fit(X_train, y_train_2)

y_pred_2=linreg_2.predict(X_test)

linreg_2.score(X_train,y_train_2)



0.7944152204682045

In [59]:
linear_model = linreg_2.named_steps['model']

# Print the coefficients along with column names
for feature_name, coef in zip(X.columns, linear_model.coef_):
    print(f"{feature_name}: {coef}")

Sample Date: 0.021644905949258136
Test Date: -0.95321807547912
WRRF Name: -0.5068181306940807
Per Capita Gene Copies: 0.19383371165303492
Population Served, estimated: -0.038487768314116724
School Status: -0.17055514483224704
School-Aged Population: 0.65869233028895
Season: -0.7364247507400248
Holiday: -0.24013593876863606


In [19]:
# With the addition of jewish holidays, got a 1% boost in the r2 score. Might get a little nudge with islamic, but
# probably not much.

Index(['Sample Date', 'Test Date', 'WRRF Name', 'Gene Copies (N1/L)',
       'Per Capita Gene Copies', 'Population Served, estimated',
       'School Status', 'School-Aged Population', 'Season', 'Holiday'],
      dtype='object')

In [61]:
# Trying without the highly-correlated features
copy_df = merged_df_pre_omi.copy()

copy_df.dropna(inplace=True)

y = copy_df['Gene Copies (N1/L)']


X = copy_df.drop(columns=['Gene Copies (N1/L)','Population Served, estimated', 'Per Capita Gene Copies'], axis=1)

# Split again
X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


# Log transforming the target

y_test = np.log(y_test)
y_train = np.log(y_train)

In [62]:
# 3rd iteration, no adjustments to the model itself

linreg_3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

linreg_3.fit(X_train, y_train)

y_pred = linreg_3.predict(X_test)

linreg_3.score(X_train,y_train)

0.7975940643717525

In [63]:
linear_model = linreg_3.named_steps['model']

# Print the coefficients along with column names
for feature_name, coef in zip(X.columns, linear_model.coef_):
    print(f"{feature_name}: {coef}")

Sample Date: -0.008693670534421008
Test Date: -0.9950867642384545
WRRF Name: -0.5303994956634813
School Status: 0.47678370245459856
School-Aged Population: -0.4366421136657676
Season: 0.05649508602779869
Holiday: 0.6610960179303406


In [23]:
# Slightly worse score without the non-school population data. So even though it's correlated, the general population
# data is not boosting the score much. 

In [67]:
# Let's look at just the school-related data.

copy_df_2 = merged_df_pre_omi.copy()
copy_df_2.dropna(inplace=True)

y = copy_df_2['Gene Copies (N1/L)']

X = copy_df_2[['School-Aged Population', 'School Status']]


# We only have one column to transform in this version

cat_cols_2 = ['School Status']

preprocessor_2 = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols_2),
    ])

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [68]:
# 4th model - this one is actually different!

linreg_4 = Pipeline([
    ('preprocessor', preprocessor_2),
    ('model', LinearRegression())
])

linreg_4.fit(X_train, y_train)

y_pred = linreg_4.predict(X_test)

linreg_4.score(X_train,y_train)

0.46997170371988195

In [69]:
# Ok, so with just the school-aged population and school status, our model explains nearly half the variance! And this
# is before log transformation!


In [70]:
# Same model, log-transformed target data.

y_train = np.log(y_train)
y_test = np.log(y_test)

linreg_5 = Pipeline([
    ('preprocessor', preprocessor_2),
    ('model', LinearRegression())
])

linreg_5.fit(X_train, y_train)

y_pred = linreg_5.predict(X_test)

print(linreg_5.score(X_train,y_train))
print(r2_score(y_test, y_pred))

0.5422711234817856
0.5143954687950946


In [71]:
# Ok, our train tests scores are pretty close! Not too much overfitting.


In [72]:
# New X and y

copy_df_3 = merged_df_pre_omi.copy()
copy_df_3.dropna(inplace=True)
y = copy_df_3['Gene Copies (N1/L)']

X = copy_df_3['School-Aged Population']


# Split again

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# log transform again

y_train = np.log(y_train)
y_test = np.log(y_test)

In [73]:
# using ols because we are not preprocessing this data


X_int = sm.add_constant(X_train)
results = sm.OLS(y_train, X_int).fit()
summary = results.summary()
print(summary)

influence = OLSInfluence(results)
print(influence.resid_studentized)

# well, this isn't very good! This is just school-aged population, though, and not the actual status of schools

                            OLS Regression Results                            
Dep. Variable:     Gene Copies (N1/L)   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     8.238
Date:                Mon, 31 Jul 2023   Prob (F-statistic):            0.00418
Time:                        10:58:54   Log-Likelihood:                -1784.3
No. Observations:                1097   AIC:                             3573.
Df Residuals:                    1095   BIC:                             3583.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      8

In [74]:
# Wonder how predictive the baseline model is. Let's just look at the original population data, which is one of
# the most relevant features. 

copy_df_4 = merged_df_pre_omi.copy()
copy_df_4.dropna(inplace=True)


y = copy_df_4['Gene Copies (N1/L)']

X = copy_df_4['Population Served, estimated']

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

In [75]:
X_int = sm.add_constant(X_train)
model_2 = sm.OLS(y_train, X_int).fit()
summary = model_2.summary()
summary

0,1,2,3
Dep. Variable:,Gene Copies (N1/L),R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,11.33
Date:,"Mon, 31 Jul 2023",Prob (F-statistic):,0.000791
Time:,10:58:56,Log-Likelihood:,-1782.7
No. Observations:,1097,AIC:,3569.0
Df Residuals:,1095,BIC:,3579.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.0670,0.075,106.995,0.000,7.919,8.215
"Population Served, estimated",3.618e-07,1.08e-07,3.365,0.001,1.51e-07,5.73e-07

0,1,2,3
Omnibus:,61.143,Durbin-Watson:,2.034
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.686
Skew:,-0.33,Prob(JB):,3.98e-09
Kurtosis:,2.359,Cond. No.,1420000.0


In [78]:
copy_df_5 = merged_df_pre_omi.copy()
copy_df_5.dropna(inplace=True)
y = copy_df_5['Gene Copies (N1/L)']

X = copy_df_5[['Population Served, estimated', 'WRRF Name', 'Per Capita Gene Copies', 'Test Date']]

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


y_train = np.log(y_train)
y_test = np.log(y_test)

In [79]:
cat_cols_3 = ['WRRF Name']

cat_transformer = Pipeline(steps=[  
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))                     
])

preprocessor_3 = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols_3),
    ])

linreg_6 = Pipeline([
    ('preprocessor', preprocessor_3),
    ('model', LinearRegression())
])

linreg_6.fit(X_train, y_train)

y_pred = linreg_6.predict(X_test)

print(linreg_6.score(X_train,y_train))
print(r2_score(y_test, y_pred))

0.07123753415992828
0.04505102157812135


In [80]:
# So yeah, the basic info from the original dataset explains almost nothing. 

In [92]:
# Want to use the index (sample date) as a feature. Let's also save this data file as a csv, since
# it's what we ultimately want to use.

sample_date_df = merged_df_pre_omi.copy()
sample_date_df['Sample Date'] = pd.to_datetime(sample_date_df['Sample Date'])

# sample_date_df.to_csv('data/master_wastewater.csv', index=False)

#(Ending up abandoning dates at features, but in future work would like to integrate them.)

In [93]:
# Fancier models! Going to move on to more elaborate models, including Random Forest.

# Re-doing the columns for preprocessing since we have different features

date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        


print(date_cols)
print(cat_cols)
print(num_cols)
# leaving "holiday" out because we don't want to transform this binary data

['Sample Date', 'Test Date']
['WRRF Name', 'School Status', 'Season']
['Per Capita Gene Copies', 'Population Served, estimated', 'School-Aged Population']


In [94]:
# Allows for fit and transformation of linear features in a pipeline. Were originally trying to combine
# LinearRegressor with RandomForest using FeatureUnion, but was unsuccessful

class ColumnSelector(BaseEstimator, TransformerMixin): 
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]
    

In [95]:
# New X and y, split

sample_copy_df = sample_date_df.copy()
sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop('Gene Copies (N1/L)', axis=1)

y = sample_copy_df['Gene Copies (N1/L)']


X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)


In [96]:
sample_copy_df.columns

Index(['Sample Date', 'Test Date', 'WRRF Name', 'Gene Copies (N1/L)',
       'Per Capita Gene Copies', 'Population Served, estimated',
       'School Status', 'School-Aged Population', 'Season', 'Holiday'],
      dtype='object')

In [97]:
# New preprocessors

cat_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessor = Pipeline([
    ('selector', ColumnSelector(columns=num_cols)), 
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_preprocessor, cat_cols),
    ('num', numeric_preprocessor, num_cols),
])


rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  RandomForestRegressor(random_state = 42))
])





In [98]:
# Just going with a vanilla Random Forest model after trying to combine a linear regression and random forest.


rf_pipeline.fit(X_train, y_train)


y_pred = rf_pipeline.predict(X_test)

print(f'R-squared score on training data: {rf_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2): {explained_variance_score(y_test, y_pred)}')

R-squared score on training data: 0.9951082673198972
R2 test score: 0.9675638498965449
Mean squared error: 0.052094098364203714
Mean absolute error: 0.15678014488775047
Mean absolute percentage error: 0.019586285838186914
Explained variance score (modified R2): 0.9675841799049261


In [99]:
# Wow, 99.5%! 

In [100]:
# Let's look at feature importances:

rf_model = rf_pipeline.named_steps['model']
features = []
scores = []

# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, rf_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

0.0033534293203989632: Sample Date
0.0013350698361288351: Holiday
0.0013342269200147376: Test Date
0.001027440180672196: School-Aged Population
0.00101529776794821: Season
0.0008777949385026092: Population Served, estimated
0.0006330406638788463: Per Capita Gene Copies
0.00044756918311024147: WRRF Name
0.00036113194992653256: School Status


In [101]:
# Cross-validating, because we ultimately want to use this on other data!
cross_validate(rf_pipeline, X_train, y_train, return_train_score=True)

{'fit_time': array([0.81501698, 0.80086398, 0.79063416, 0.79651999, 0.81004596]),
 'score_time': array([0.01636505, 0.01608706, 0.01679897, 0.01606894, 0.01622081]),
 'test_score': array([0.97273797, 0.96571812, 0.96220805, 0.96419641, 0.95084884]),
 'train_score': array([0.99458214, 0.99494814, 0.99498579, 0.99466468, 0.99549897])}

In [102]:
# Want to try this model again without the date features, since they seem to be over-determining.

sample_copy_df_2 = sample_date_df.copy()

sample_copy_df_2.dropna(inplace=True)

X = sample_copy_df_2.drop(['Gene Copies (N1/L)', 'Sample Date', 'Test Date'], axis=1)

y = sample_copy_df_2['Gene Copies (N1/L)']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

In [None]:
date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        


print(date_cols)
print(cat_cols)
print(num_cols)

In [104]:
cat_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessor = Pipeline([
    ('selector', ColumnSelector(columns=num_cols)), 
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_preprocessor, cat_cols),
    ('num', numeric_preprocessor, num_cols),
])


rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  RandomForestRegressor(random_state = 42))
])



In [105]:
rf_pipeline.fit(X_train, y_train)  # Same pipeline, different data


y_pred = rf_pipeline.predict(X_test)


print(f'R-squared score on training data: {rf_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2): {explained_variance_score(y_test, y_pred)}')

R-squared score on training data: 0.9951082673198972
R2 test score: 0.9675638498965449
Mean squared error: 0.052094098364203714
Mean absolute error: 0.15678014488775047
Mean absolute percentage error: 0.019586285838186914
Explained variance score (modified R2): 0.9675841799049261


In [106]:
# The very same! Our engineered features are very strong all around. Let's look at feature importance.

rf_model = rf_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, rf_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

0.0033534293203989632: WRRF Name
0.0013342269200147376: Per Capita Gene Copies
0.001027440180672196: Holiday
0.0008777949385026092: School-Aged Population
0.0006330406638788463: School Status
0.00044756918311024147: Population Served, estimated
0.00036113194992653256: Season


In [107]:
# Let's try another model type, using all features

sample_copy_df = sample_date_df.copy()

X = sample_copy_df.drop('Gene Copies (N1/L)', axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X.dropna(inplace=True)
y.dropna(inplace=True)

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

In [108]:

grad = GradientBoostingRegressor()
grad_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  grad)
])


grad_pipeline.fit(X_train, y_train)
y_pred = grad_pipeline.predict(X_test)

print(f'R-squared score on training data: {grad_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

R-squared score on training data: 0.9814959729055575
R2 test score: 0.9723183570900568
Mean squared error: 0.04445811922912941
Mean absolute error: 0.14413801082084243
Mean absolute percentage error: 0.01798534397514475
Explained variance score (modified R2 test score): 0.9723229599966545


In [109]:
# Pretty similar!

In [110]:
grad_model = grad_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, grad_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

0.0039642306151674404: Sample Date
0.0018551912240568251: Holiday
0.0010896138197174696: School-Aged Population
0.0007320431865119407: Season
0.0006780813323552669: Population Served, estimated
0.0004589845674212528: Test Date
0.0003816646083538947: Per Capita Gene Copies
1.2022278342754647e-05: School Status
5.498453807413116e-06: WRRF Name


In [111]:
# Very different weighing of features here, and slightly better test score. Let's look at loss scores, too:
train_score = grad_pipeline['model'].train_score_
train_score


# If we ran maybe double the iteratations, our loss score should approach .2

array([1.2546271 , 1.0343755 , 0.85524127, 0.70958707, 0.59109145,
       0.4945883 , 0.41586988, 0.35165381, 0.29935438, 0.25651436,
       0.221891  , 0.19355449, 0.17034788, 0.15123911, 0.13525078,
       0.12200857, 0.11088258, 0.10184361, 0.09408602, 0.08763019,
       0.08227315, 0.07697006, 0.07258206, 0.0690886 , 0.06602022,
       0.06324645, 0.06088238, 0.05882038, 0.05690847, 0.05521488,
       0.05357605, 0.05218039, 0.05038489, 0.04914069, 0.04801434,
       0.04685401, 0.04577433, 0.04487213, 0.04399308, 0.04311993,
       0.0423678 , 0.04161209, 0.04089222, 0.03969664, 0.03909103,
       0.03846022, 0.03794208, 0.03745044, 0.03702689, 0.03632175,
       0.03593234, 0.03559401, 0.03521393, 0.03492283, 0.03463449,
       0.0343394 , 0.03404891, 0.03374259, 0.03347791, 0.03326131,
       0.03307398, 0.03287956, 0.03270017, 0.0324756 , 0.03231803,
       0.03210624, 0.03182856, 0.03168736, 0.03153384, 0.03138392,
       0.0312248 , 0.03109716, 0.03097769, 0.03087304, 0.03076

In [112]:
# Let's get rid of all date features with this model and see what happens

sample_copy_df = sample_date_df.copy()

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X.dropna(inplace=True)
y.dropna(inplace=True)

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

grad = GradientBoostingRegressor()
grad_best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  grad)
])

grad_best_pipeline.fit(X_train, y_train)
y_pred = grad_best_pipeline.predict(X_test)

print(f'R-squared score on training data: {grad_best_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

R-squared score on training data: 0.9814959729055575
R2 test score: 0.972319965822545
Mean squared error: 0.04445553552334996
Mean absolute error: 0.14412556858772274
Mean absolute percentage error: 0.017984162569847115
Explained variance score (modified R2 test score): 0.9723245266981202


In [113]:
# Very similar scores, with less over-fitting. Good! # Let's examine feature importance.

grad_model = grad_best_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, grad_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

0.003964230615167441: WRRF Name
0.0010896138197174707: Holiday
0.0006791720086939195: School-Aged Population
0.0004589845674212526: Per Capita Gene Copies
0.0003816646083538939: School Status
1.1101917699992118e-05: Season
5.498453807413116e-06: Population Served, estimated


In [114]:
# Going to remove highly-correlated non-engineered features, to better see how strong the model is with my contibutions.

sample_copy_df = sample_date_df.copy()

sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date', 'Population Served, estimated', 'Per Capita Gene Copies'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

In [115]:
# Re-do preprocesing as we have different features
date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday' or i == 'Population Served, estimated' or i == 'Per Capita Gene Copies':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        


print(date_cols)
print(cat_cols)
print(num_cols)
# leaving "holiday" out because we don't want to transform this binary data

['Sample Date', 'Test Date']
['WRRF Name', 'School Status', 'Season']
['School-Aged Population']


In [116]:
cat_transformer = Pipeline(steps=[  
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))                     
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
    ])

grad = GradientBoostingRegressor()
grad_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  grad)
])

grad_pipeline.fit(X_train, y_train)
y_pred = grad_pipeline.predict(X_test)

print(f'R-squared score on training data: {grad_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

R-squared score on training data: 0.6596607521088087
R2 test score: 0.6154788665536127
Mean squared error: 0.6175603974263713
Mean absolute error: 0.6082857308635666
Mean absolute percentage error: 0.07673133747053865
Explained variance score (modified R2 test score): 0.6176639745109003


In [117]:
# Looking at feature importances

grad_model = grad_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, grad_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

0.01684000976720728: School-Aged Population
0.007501852048518024: Holiday
0.005086662556497535: School Status
0.0019261258027081182: WRRF Name
0.0015815597356382753: Season


In [118]:
# By removing correlated features, our score was reduced pretty dramatically. However, there seem to be important
# elements of the non-date correlated features that we should keep, because although they are similar, they tell us
# important things about the data at particular times. For instance, sudden wastewater spikes around school events
# in places with high school-aged populations. Over time, these effects are expected to flatten out and/or become
# more cyclical. One avenue to pursue later would be population estimates of those who left the city during 2020/2021
# but who were never officially non-residents, and thus not reflected in the Census Bureau's data. We would expect much more of this in wealthier zip codes and in zip codes
# where there are fewer children. 

In [119]:
# Setting up for grid search of our best model so far. Re-doing preprocessing since using different features

date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        
print(date_cols)
print(cat_cols)
print(num_cols)

cat_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessor = Pipeline([
    ('selector', ColumnSelector(columns=num_cols)), 
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer([
    ('cat', cat_preprocessor, cat_cols),
    ('num', numeric_preprocessor, num_cols),
])


['Sample Date', 'Test Date']
['WRRF Name', 'School Status', 'Season']
['Per Capita Gene Copies', 'Population Served, estimated', 'School-Aged Population']


In [120]:
# Removing the datetime info and re-running grid search on our best model.

sample_copy_df = sample_date_df.copy()
sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']


X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)
grad = GradientBoostingRegressor()

param_grid = {
    'model__learning_rate': [.01, .1, .3],
    'model__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'model__n_estimators': [100, 150, 200, 250],
    'model__subsample': [.3, .5, .7, 1.0],
    'model__criterion': ['friedman_mse', 'squared_error']
    
}
                            
grid_search_gbc = GridSearchCV(
    estimator = grad_best_pipeline,  # pipeline 
    param_grid = param_grid,
    cv= 5,
    scoring='explained_variance'  # internal scoring term
)

grid_search_gbc.fit(X_train, y_train)

cv_score = grid_search_gbc.best_score_
test_score = r2_score(y_test, grid_search_gbc.predict(X_test))

print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print(grid_search_gbc.best_params_)

Cross-validation score: 0.9728682096542398
Test score: 0.9723028481450525
{'model__criterion': 'friedman_mse', 'model__learning_rate': 0.1, 'model__loss': 'absolute_error', 'model__n_estimators': 250, 'model__subsample': 0.5}


In [121]:
# Thought I was done, but want to try one more model type: LinearTrees!

sample_copy_df = sample_date_df.copy()

sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


y_train = np.log(y_train)
y_test = np.log(y_test)

In [122]:

lintree = LinearTreeRegressor(base_estimator=LinearRegression())

lintree_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('to_sparse', FunctionTransformer(csr_matrix, validate=False)),  # convert to sparse
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), validate=False)), # convert to dense. lintree requires this
    ('model',  lintree)
])

lintree_pipeline.fit(X_train, y_train)

y_pred = lintree_pipeline.predict(X_test)

print(f'R-squared score on training data: {lintree_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

# Well, that was a terrible model for this data, but seemed interesting! 

R-squared score on training data: 0.9802052054633759
R2 test score: -1184.0720230715035
Mean squared error: 1903.285634751599
Mean absolute error: 3.5056331972161354
Mean absolute percentage error: 0.4220202108653636
Explained variance score (modified R2 test score): -1177.0278641925847
