# Preprocessing and Modelling Pre-Omicron

In [1]:

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
import seaborn as sns

import csv 
from collections import Counter
import datetime
import holidays
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from scipy.sparse import csr_matrix

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices

from lineartree import LinearTreeRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor



#### When first modelling, I had one notebook. I ran the first linear models against the original data and just the school-aged population and school status columns. Below, I am running them with all my engineered features.


In [2]:
merged_df_pre_omi = pd.read_csv('data/pre_omi.csv')

merged_df_pre_omi['Test Date'] = pd.to_datetime(merged_df_pre_omi['Test Date'])

merged_df_pre_omi['School Status'] = merged_df_pre_omi['School Status'].astype('object')
merged_df_pre_omi['School-Aged Population']

0        72091.0
1       122994.0
2       117865.0
3       151282.0
4       131881.0
          ...   
1479     46435.0
1480     33636.0
1481     29986.0
1482     74748.0
1483    240219.0
Name: School-Aged Population, Length: 1484, dtype: float64

#### Simple preprocessing. Not going to worry about scaling right now. Just attempting to determine if my features are  strong enough to warrant continuing.

In [3]:
cat_cols = []

for i in merged_df_pre_omi.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Test Date' or i == 'Holiday' or i == 'Sample Date':
        pass
    elif merged_df_pre_omi[i].dtype == 'object':
        cat_cols.append(i)
        print(i)
    elif merged_df_pre_omi[i].dtype == 'float64' or merged_df_pre_omi[i].dtype == 'int64':
        print(i)
    else:
        print("error")


cat_transformer = Pipeline(steps=[  
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))                     
])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
    ])

y = merged_df_pre_omi['Gene Copies (N1/L)']

X = merged_df_pre_omi.drop('Gene Copies (N1/L)', axis=1)

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


WRRF Name
Per Capita Gene Copies
Population Served, estimated
School Status
School-Aged Population
Season


In [5]:
# Setting up columns to be onehot encoded. We will look at numerical transformation later

Have to drop the NaNs, otherwise linear model won't work
y_train = y_train.dropna()
X_train = X_train.dropna()
X_test = X_test.dropna()
y_test = y_test.dropna()

linreg = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

linreg.fit(X_train, y_train)


y_pred=linreg.predict(X_test)

linreg.score(X_train,y_train)



0.5587991188685084

In [6]:
linear_model = linreg.named_steps['model']

# Print the coefficients along with column names
for feature_name, coef in zip(X.columns, linear_model.coef_):
    print(f"{feature_name}: {coef}")

Sample Date: -1070.2982270060952
Test Date: 1000.1580037146778
WRRF Name: 1133.043333951316
Per Capita Gene Copies: -729.9289403163428
Population Served, estimated: 3534.929886399773
School Status: -1595.292150257837
School-Aged Population: -2604.654484580207
Season: 1246.104726306294
Holiday: 6245.974160965021


#### Let's try again but with log-transformed targets

In [10]:
linear_model = linreg_2.named_steps['model']

# Print the coefficients along with column names
for feature_name, coef in zip(X.columns, linear_model.coef_):
    print(f"{feature_name}: {coef}")

Sample Date: -0.0440007780691526
Test Date: 0.20732123218866916
WRRF Name: 0.19162232107698013
Per Capita Gene Copies: -0.14624733932704054
Population Served, estimated: 0.31736815887438546
School Status: -0.1297000508508879
School-Aged Population: -0.3831365358061128
Season: 0.4055824921338149
Holiday: 0.7947331217220195


#### With the addition of jewish holidays, got a 1% boost in the r2 score. Might get a little nudge with Islamic, but probably not much.

#### Trying without the highly-correlated features

In [11]:
copy_df = merged_df_pre_omi.copy()

copy_df.dropna(inplace=True)

y = copy_df['Gene Copies (N1/L)']


X = copy_df.drop(columns=['Gene Copies (N1/L)','Population Served, estimated', 'Per Capita Gene Copies'], axis=1)

# Split again
X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


# Log transforming the target

y_test = np.log(y_test)
y_train = np.log(y_train)

# 3rd iteration, no adjustments to the model itself

linreg_3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

linreg_3.fit(X_train, y_train)

y_pred = linreg_3.predict(X_test)

linreg_3.score(X_train,y_train)

0.6177258476117596

In [12]:
linear_model = linreg_3.named_steps['model']

# Print the coefficients along with column names
for feature_name, coef in zip(X.columns, linear_model.coef_):
    print(f"{feature_name}: {coef}")

Sample Date: -0.14757150645203634
Test Date: 0.17730798873563733
WRRF Name: 0.21434747417862648
School Status: -0.1206607271795389
School-Aged Population: 0.28797817715926266
Season: -0.1298456319366709
Holiday: -0.3610885764908265


#### Slightly worse score without the non-school population data. So even though it's correlated, the general population data is not boosting the score much. 

#### Let's look at just the school-related data.

In [13]:
copy_df_2 = merged_df_pre_omi.copy()
copy_df_2.dropna(inplace=True)

y = copy_df_2['Gene Copies (N1/L)']

X = copy_df_2[['School-Aged Population', 'School Status']]


# We only have one column to transform in this version

cat_cols_2 = ['School Status']

preprocessor_2 = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols_2),
    ])

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [14]:
# 4th model - this one is actually different!

linreg_4 = Pipeline([
    ('preprocessor', preprocessor_2),
    ('model', LinearRegression())
])

linreg_4.fit(X_train, y_train)

y_pred = linreg_4.predict(X_test)

linreg_4.score(X_train,y_train)

0.46997170371988195

#### With just the school-aged population and school status, our model explains nearly half the variance! And this is before log transformation!


In [15]:
# Same model, log-transformed target data.
y_train = np.log(y_train)
y_test = np.log(y_test)

linreg_5 = Pipeline([
    ('preprocessor', preprocessor_2),
    ('model', LinearRegression())
])

linreg_5.fit(X_train, y_train)

y_pred = linreg_5.predict(X_test)

print(linreg_5.score(X_train,y_train))
print(r2_score(y_test, y_pred))

0.5422711234817856
0.5143954687950946


#### Ok, our train tests scores are pretty close! Not too much overfitting.


In [None]:
# New X and y

copy_df_3 = merged_df_pre_omi.copy()
copy_df_3.dropna(inplace=True)
y = copy_df_3['Gene Copies (N1/L)']

X = copy_df_3['School-Aged Population']


# Split again

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# log transform again

y_train = np.log(y_train)
y_test = np.log(y_test)

# using ols because we are not preprocessing this data
X_int = sm.add_constant(X_train)
results = sm.OLS(y_train, X_int).fit()
summary = results.summary()
print(summary)

influence = OLSInfluence(results)
print(influence.resid_studentized)

#### Well, this isn't very good! This is just school-aged population, though, and not the actual status of schools

#### Wonder how predictive the baseline model is. Let's just look at the original population data, which is one ofthe most relevant features. 

In [None]:
copy_df_4 = merged_df_pre_omi.copy()
copy_df_4.dropna(inplace=True)

y = copy_df_4['Gene Copies (N1/L)']
X = copy_df_4['Population Served, estimated']

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

X_int = sm.add_constant(X_train)
model_2 = sm.OLS(y_train, X_int).fit()

summary = model_2.summary()
summary

In [None]:
copy_df_5 = merged_df_pre_omi.copy()
copy_df_5.dropna(inplace=True)
y = copy_df_5['Gene Copies (N1/L)']

X = copy_df_5[['Population Served, estimated', 'WRRF Name', 'Per Capita Gene Copies', 'Sample Date', 'Test Date']]

X_train ,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


y_train = np.log(y_train)
y_test = np.log(y_test)

In [None]:
cat_cols_3 = ['WRRF Name']

cat_transformer = Pipeline(steps=[  
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))                     
])

preprocessor_3 = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols_3),
    ])

linreg_6 = Pipeline([
    ('preprocessor', preprocessor_3),
    ('model', LinearRegression())
])

linreg_6.fit(X_train, y_train)

y_pred = linreg_6.predict(X_test)

print(linreg_6.score(X_train,y_train))
print(r2_score(y_test, y_pred))

#### So yeah, the basic info from the original dataset explains almost nothing. 

#### Want to use the index (sample date) as a feature*. Let's also save this data file as a csv, since it's what we ultimately want to use.
* Was originally attempting to model as time-series

In [None]:
sample_date_df = merged_df_pre_omi.copy()
sample_date_df['Sample Date'] = pd.to_datetime(sample_date_df['Sample Date'])

# sample_date_df.to_csv('data/master_wastewater.csv', index=False)

#(Ending up abandoning dates as features, but in future work would like to integrate them.)

#### Fancier models! Going to move on to more elaborate models, including Random Forest. Re-doing the columns for preprocessing since we have different features


In [None]:
date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        


print(date_cols)
print(cat_cols)
print(num_cols)
# leaving "holiday" out because we don't want to transform this binary data

#### Custom function Allows for fit and transformation of linear features in a pipeline. Were originally trying to combineLinearRegressor with RandomForest using FeatureUnion, but was unsuccessful

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin): 
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]
    

#### Just going with a vanilla Random Forest model after trying to combine a linear regression and random forest.


In [None]:
# New X and y, split

sample_copy_df = sample_date_df.copy()
sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop('Gene Copies (N1/L)', axis=1)

y = sample_copy_df['Gene Copies (N1/L)']


X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

# New preprocessors

cat_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessor = Pipeline([
    ('selector', ColumnSelector(columns=num_cols)), 
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_preprocessor, cat_cols),
    ('num', numeric_preprocessor, num_cols),
])


rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  RandomForestRegressor(random_state = 42))
])


rf_pipeline.fit(X_train, y_train)


y_pred = rf_pipeline.predict(X_test)

print(f'R-squared score on training data: {rf_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2): {explained_variance_score(y_test, y_pred)}')

#### Wow, 99.5%!  Let's look at feature importances.

In [None]:
rf_model = rf_pipeline.named_steps['model']
features = []
scores = []

# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, rf_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

In [None]:
cross_validate(rf_pipeline, X_train, y_train, return_train_score=True)

#### Want to try this model again without the date features, since they seem to be over-determining.


In [None]:
sample_copy_df_2 = sample_date_df.copy()

sample_copy_df_2.dropna(inplace=True)

X = sample_copy_df_2.drop(['Gene Copies (N1/L)', 'Sample Date', 'Test Date'], axis=1)

y = sample_copy_df_2['Gene Copies (N1/L)']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        


print(date_cols)
print(cat_cols)
print(num_cols)

In [None]:
# Adding in numeric scaling

cat_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessor = Pipeline([
    ('selector', ColumnSelector(columns=num_cols)), 
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_preprocessor, cat_cols),
    ('num', numeric_preprocessor, num_cols),
])


rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  RandomForestRegressor(random_state = 42))
])



rf_pipeline.fit(X_train, y_train)  # Same pipeline, different data


y_pred = rf_pipeline.predict(X_test)


print(f'R-squared score on training data: {rf_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2): {explained_variance_score(y_test, y_pred)}')

#### The very same! Our engineered features are very strong all around. Let's look at feature importance.


In [None]:
rf_model = rf_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, rf_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

#### Let's try another model type, using all features

In [None]:
sample_copy_df = sample_date_df.copy()

X = sample_copy_df.drop('Gene Copies (N1/L)', axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X.dropna(inplace=True)
y.dropna(inplace=True)

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

grad = GradientBoostingRegressor()

grad_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  grad)
])


grad_pipeline.fit(X_train, y_train)
y_pred = grad_pipeline.predict(X_test)

print(f'R-squared score on training data: {grad_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

#### Pretty similar!

In [None]:
grad_model = grad_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, grad_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

#### Very different weighing of features here, and slightly better test score. Let's look at loss scores, too:

In [None]:
train_score = grad_pipeline['model'].train_score_
train_score

# If we ran maybe double the iteratations, our loss score should approach .2

#### Let's get rid of all date features with this model and see what happens

In [None]:
sample_copy_df = sample_date_df.copy()

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X.dropna(inplace=True)
y.dropna(inplace=True)

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

grad = GradientBoostingRegressor()
grad_best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  grad)
])

grad_best_pipeline.fit(X_train, y_train)
y_pred = grad_best_pipeline.predict(X_test)

print(f'R-squared score on training data: {grad_best_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

#### Very similar scores, with less over-fitting. Good! Let's examine feature importance.

In [None]:
grad_model = grad_best_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, grad_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

#### Going to remove highly-correlated non-engineered features, to better see how strong the model is with my contibutions.


In [None]:
sample_copy_df = sample_date_df.copy()

sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date', 'Population Served, estimated', 'Per Capita Gene Copies'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)
y_test = np.log(y_test)

# Re-do preprocesing as we have different features
date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday' or i == 'Population Served, estimated' or i == 'Per Capita Gene Copies':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        


print(date_cols)
print(cat_cols)
print(num_cols)
# leaving "holiday" out because we don't want to transform this binary data

In [None]:
cat_transformer = Pipeline(steps=[  
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))                     
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
    ])

grad = GradientBoostingRegressor()
grad_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',  grad)
])

grad_pipeline.fit(X_train, y_train)
y_pred = grad_pipeline.predict(X_test)

print(f'R-squared score on training data: {grad_pipeline.score(X_train, y_train)}')
print(f'R2 test score: {r2_score(y_test, y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')

#### Looking at feature importances

In [None]:
grad_model = grad_pipeline.named_steps['model']
features = []
scores = []
# Print the coefficients along with column names
for feature_name, importance in zip(X.columns, grad_model.feature_importances_):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')

#### Analysis: 

By removing correlated features, our score was reduced dramatically. However, there seem to be important elements of the non-date correlated features that we should keep, because although they are similar, they tell us important things about the data at particular times (as opposed to our scores, which cover the entire data period). For instance, sudden wastewater spikes around school events in places with high school-aged populations. Over time, these effects are expected to flatten out and/or become more cyclical. One avenue to pursue later would be population estimates of those who left the city during 2020/2021 but who were never officially non-residents, and thus not reflected in the Census Bureau's data. We would expect much more of this in wealthier zipcodes and in zipcodes where there are fewer children. 

#### Setting up for grid search of our best model so far. Re-doing preprocessing since using different features


In [None]:
date_cols = []
cat_cols = []  
num_cols = []   

for i in sample_date_df.columns:
    if i == 'Gene Copies (N1/L)' or i == 'Holiday':
        pass
    elif sample_date_df[i].dtype == 'datetime64[ns]':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'object':
        cat_cols.append(i)
    elif i == 'Sample Date':
        date_cols.append(i)
    elif sample_date_df[i].dtype == 'float64' or sample_date_df[i].dtype == 'int64':
        num_cols.append(i)
    else:
        print("error")
        
print(date_cols)
print(cat_cols)
print(num_cols)

cat_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessor = Pipeline([
    ('selector', ColumnSelector(columns=num_cols)), 
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer([
    ('cat', cat_preprocessor, cat_cols),
    ('num', numeric_preprocessor, num_cols),
])


#### Removing the datetime info and running grid search on our best model.


In [None]:
sample_copy_df = sample_date_df.copy()
sample_copy_df.dropna(inplace=True)

X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date'], axis=1)

y = sample_copy_df['Gene Copies (N1/L)']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

y_train = np.log(y_train)

y_test = np.log(y_test)

grad = GradientBoostingRegressor()

param_grid = {
    'model__learning_rate': [.01, .1, .3],
    'model__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'model__n_estimators': [100, 150, 200, 250],
    'model__subsample': [.3, .5, .7, 1.0],
    'model__criterion': ['friedman_mse', 'squared_error']
    
}
                            
grid_search_gbc = GridSearchCV(
    estimator = grad_best_pipeline, 
    param_grid = param_grid,
    cv= 5,
    scoring='explained_variance' 
)

grid_search_gbc.fit(X_train, y_train)

y_pred = grid_search_gbc.predict(X_test)

cv_score = grid_search_gbc.best_score_
test_score = r2_score(y_test, grid_search_gbc.predict(X_test))


print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print(grid_search_gbc.best_params_)

#### Grid Search feature importances

In [None]:
best_model = grid_search_gbc.best_estimator_

model_step = best_model.named_steps['model']
print(model_step)

feature_importances = model_step.feature_importances_

features = []
scores = []

for feature_name, importance in zip(X.columns, feature_importances):
    features.append(feature_name)
    scores.append(importance)
    
ranked_scores = sorted(zip(scores, features), reverse=True)
for score, feature in ranked_scores:
    print(f'{score}: {feature}')


#### Visualize residuals for our best model

In [None]:
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, color='blue')
sns.regplot(x=y_test, y=y_pred, color='red', scatter=False)

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Pre-Omicron Actual vs. Predicted Values')
plt.show()


# Normalize the residuals
scaler = StandardScaler()
residuals_normalized = scaler.fit_transform(residuals.values.reshape(-1, 1))

# Create a linear regression object
linreg = LinearRegression()

# Fit the linear regression to the data
linreg.fit(y_pred.reshape(-1,1), residuals_normalized)

# Create predicted values for the trendline
trendline = linreg.predict(y_pred.reshape(-1,1))

# Create the plot
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals_normalized, alpha=0.5)
plt.plot(y_pred, trendline, color='red')
plt.title('Normalized Residuals vs Fitted Values')
plt.xlabel('Fitted values')
plt.ylabel('Normalized residuals')
plt.show()


plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.title('Histogram of Residuals (Pre-Omicron)')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.show()


plt.figure(figsize=(10, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals (Pre-Omicron)')
plt.show()


plt.figure(figsize=(10, 6))
plt.scatter(y_pred, np.sqrt(np.abs(residuals)), alpha=0.5)
plt.title('Scale-Location Plot (Pre-Omicron)')
plt.xlabel('Fitted values')
plt.ylabel('sqrt(|standardized residuals|)')
plt.show()


#### Thought I was done, but want to try one more model type: LinearTrees!

In [None]:
# sample_copy_df = sample_date_df.copy()

# sample_copy_df.dropna(inplace=True)

# X = sample_copy_df.drop(columns=['Gene Copies (N1/L)','Sample Date', 'Test Date'], axis=1)

# y = sample_copy_df['Gene Copies (N1/L)']

# X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


# y_train = np.log(y_train)
# y_test = np.log(y_test)

In [None]:
# lintree = LinearTreeRegressor(base_estimator=LinearRegression())

# lintree_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('to_sparse', FunctionTransformer(csr_matrix, validate=False)),  # convert to sparse
#     ('to_dense', FunctionTransformer(lambda x: x.toarray(), validate=False)), # convert to dense. lintree requires this
#     ('model',  lintree)
# ])

# lintree_pipeline.fit(X_train, y_train)

# y_pred = lintree_pipeline.predict(X_test)

# print(f'R-squared score on training data: {lintree_pipeline.score(X_train, y_train)}')
# print(f'R2 test score: {r2_score(y_test, y_pred)}')
# print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
# print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
# print(f'Mean absolute percentage error: {mean_absolute_percentage_error(y_test, y_pred)}')
# print(f'Explained variance score (modified R2 test score): {explained_variance_score(y_test, y_pred)}')



### Well, that was a terrible model for this data, but seemed interesting! 