In [1]:
import sys

# Put the current working directory at the front of the module search path
# (only once) so that packages living next to this notebook can be imported.
if not any(entry == '.' for entry in sys.path):
    sys.path.insert(0, '.')

In [2]:
# Enable IPython's autoreload extension; mode 2 lets edits made to imported
# modules be picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path
from typing import List, Dict
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from patsy import dmatrices
from pymongo import MongoClient
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score

from utils.summarize import (
    GLMEstimatorSummary,
    FeatureSummary
)


ModuleNotFoundError: No module named 'utils.summary'

# Save N Training Data Sets

In [51]:
# Registry of model summaries, keyed by model name (populated further down).
models = {}

# Seed for the synthetic data. Fixing it makes this data set reproducible on a
# fresh kernel; pick a different value for every data set that is generated,
# otherwise each run produces the same rows.
DATA_SET_SEED = 5

# Simulate a regression problem: 1,000,000 rows and 20 features, of which only
# 5 are informative. `coef=True` additionally returns the coefficients of the
# underlying linear model.
X, y, coef_sklearn = datasets.make_regression(
    n_samples=1_000_000,
    n_features=20,
    n_informative=5,
    noise=1,
    coef=True,
    random_state=DATA_SET_SEED,
)


In [52]:
# Assemble features and response into a single frame: one column per feature
# (X0, X1, ...) followed by the response column "y".
data = pd.DataFrame(
    np.append(X, y.reshape(-1, 1), axis=1),
    columns=[*[f"X{i}" for i in range(X.shape[1])], "y"],
)

# store name of target column (stated explicitly instead of being popped off
# the end of the feature list, which silently depended on column order)
target = "y"

# save names of features whose correlation with the target exceeds .1
# (positive correlations only); the target itself is excluded by name
target_corr = data.corr().loc[target]
features = [name for name in target_corr.index[target_corr > 0.1] if name != target]

# add record weights (every record gets the same weight)
data["weight"] = 1


In [53]:
# Persist the generated data set. The output directory is created first so the
# write does not fail when ./data does not exist yet (e.g. on a fresh checkout).
Path('./data').mkdir(parents=True, exist_ok=True)
data.to_parquet('./data/data_set_5.parquet')

# Create Model Summary Payload

In [57]:
# load a previously saved training set
data = pd.read_parquet('./data/data_set_1.parquet')

# store name of target column (stated explicitly instead of being popped off
# the end of the feature list, which silently depended on column order)
target = "y"

# save names of features whose correlation with the target exceeds .1
# (positive correlations only); the target itself is excluded by name
target_corr = data.corr().loc[target]
features = [name for name in target_corr.index[target_corr > 0.1] if name != target]

In [58]:
# Model specification for the conditional mean of the response,
# E[Y|X] = `formula`: the target on the left, the selected features summed on
# the right. The leading "0 +" asks patsy not to add an intercept column.
rhs_terms = '+'.join(features)
formula = f"{target} ~ 0 + {rhs_terms}"

# Build the response and design matrices described by `formula`.
y, D = dmatrices(formula, data=data)

In [59]:
# fit a linear regression without an intercept term on the design matrix
linreg = LinearRegression(fit_intercept=False).fit(
    X=D,
    y=y.ravel(),
)

# attach predictions to our data; a failure here (for example a row-count
# mismatch between `D` and `data`) is allowed to surface immediately rather
# than being swallowed, because later cells require the `prediction` column
data['prediction'] = linreg.predict(D)

# the design matrices are no longer needed once predictions are stored
del D, y


In [60]:
# Build one summary entry per selected feature.
# NOTE(review): `get_feature_summary_data` is not among the names imported in
# the visible import cell — confirm where it is expected to come from.

summary: List[FeatureSummary] = []

# Column-name arguments shared by every call; only "feature" changes per pass.
colnames = dict(
    feature=None,
    target=target,
    prediction="prediction",
    weight="weight",
)

for feature in features:
    colnames.update(feature=feature)
    feature_data = get_feature_summary_data(df=data, **colnames)
    summary.append(dict(name=feature, data=feature_data))
    print(f'finished summarizing {feature}')

finished summarizing X1
finished summarizing X4
finished summarizing X5
finished summarizing X11
finished summarizing X12
finished summarizing y


In [62]:
# Assemble the summary payload for this model and register it under its name.

# identifiers (the target column comes from `target` so the payload cannot
# drift from the column the model was actually fitted on)
desc = {
    "name": "model A",
    "desc": formula,
    "target": target,
    "prediction": "prediction",
}

# model error structure
error = {
    "var_weights": "weight",
    "link_function": "identity",
    "error_dist": "gaussian",
}

# scoring: explained variance of the predictions against the observed target
scores = {
    "explained_variance": explained_variance_score(data[target], data["prediction"])
}

# add to our set of models if this model does not exist yet; an existing
# entry with the same name is left untouched
if desc["name"] not in models:
    models[desc["name"]] = GLMEstimatorSummary[str](
        **desc, **error, **scores, feature_summary=summary
    )



In [40]:
# NOTE(review): this cell is disabled. A database password used to be
# hardcoded in the commented-out code below; it has been redacted here. Treat
# the old value as compromised (rotate it) and read credentials from the
# environment instead of committing them.
# import os
# username = quote_plus("root")
# password = quote_plus(os.environ["MONGO_PASSWORD"])
# uri = (
#     f"mongodb://{username}:{password}@localhost:27017/?uuidRepresentation=standard"
# )
# client = MongoClient(uri)

# mydb = client['models']

# # get collection
# clcn = mydb['models']

# clcn.find_one()

{'_id': ObjectId('61c15466fceee143e9f0bcdc'),
 'uuid': UUID('54ec40fa-d0fe-4c60-851b-72d69a44053e'),
 'created_time': 1640060006.702961,
 'name': 'model A',
 'desc': 'y ~ 0 + X1+X4+X5+X11+X12',
 'target': 'y',
 'prediction': 'prediction',
 'var_weights': 'weight',
 'link_function': 'identity',
 'error_dist': 'gaussian',
 'feature_summary': [{'name': 'X1',
   'data': {'sum_target': [-562.0633149869575,
     -1975.9327705243581,
     -4562.732260118195,
     -10412.674356306838,
     -16324.510725989068,
     -33489.012450070346,
     -67240.97845810017,
     -118728.6307971745,
     -210658.07319466502,
     -344012.01216306037,
     -558867.2450880862,
     -851346.3601409114,
     -1210912.3061249782,
     -1723784.8243024163,
     -2260708.5076381825,
     -2764463.7586278473,
     -3276515.5515575805,
     -3639756.647518958,
     -3787398.249324983,
     -3661601.2630464872,
     -3166806.441305351,
     -2372215.3214719235,
     -1287430.1975848053,
     -106024.66050607362,
     