In [1]:
import os
import pandas as pd

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [2]:
# Directory that holds the parquet inputs; every path below is derived from it
# so the location is defined in exactly one place.
base = "/home/george/code/Fitness/metadata/linear_regression"

df = (
    pd.read_parquet(os.path.join(base, "02-08-Dec-2019-daily-binary-mon.parquet"))
    .fillna(0)  # missing values become 0
    .set_index(["caid"])
    # Remove the column used as the regression target further down, plus
    # `true_state` and `day`, leaving only the columns fed to the VIF check.
    .drop(columns=["Fitness and Recreational Sports Centers", "true_state", "day"])
)

In [3]:
# Variance inflation factor for every column of the constant-augmented design.
X = add_constant(df)
design_values = X.values
vif_by_column = pd.Series(
    [variance_inflation_factor(design_values, col_idx) for col_idx in range(X.shape[1])],
    index=X.columns,
)
print(vif_by_column)

const                                                         3.578879
Hardware Stores                                               1.005189
All Other General Merchandise Stores                          1.022057
Full-Service Restaurants                                      1.036892
Supermarkets and Other Grocery (except Convenience) Stores    1.017329
Limited-Service Restaurants                                   1.018819
Pet and Pet Supplies Stores                                   1.002205
Nature Parks and Other Similar Institutions                   1.034654
Religious Organizations                                       1.012466
Gasoline Stations with Convenience Stores                     1.008703
Snack and Nonalcoholic Beverage Bars                          1.009686
Used Merchandise Stores                                       1.001898
Convenience Stores                                            1.001535
Sporting Goods Stores                                         1.001797
Colleg

In [1]:
import statsmodels.api as sm
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import StratifiedKFold

In [2]:
class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        Invoked as ``model_class(y, X)`` to build the underlying model.
    fit_intercept : bool, default=True
        When true, ``sm.add_constant`` is applied to the design matrix before
        fitting, and the same intercept column is supplied at prediction time.

    Attributes
    ----------
    model_ : object
        The object returned by ``model_class(y, X)``; set by ``fit``.
    results_ : object
        The object returned by ``model_.fit()``; set by ``fit``.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Build the model from ``(y, X)``, fit it, and return ``self``."""
        if self.fit_intercept:
            # Default has_constant='skip': nothing is added when X already
            # carries a non-zero constant column.
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return ``results_.predict`` for ``X`` using the same design layout as ``fit``.

        The intercept column is added if and only if ``fit`` added one. Relying
        on ``add_constant``'s default ``'skip'`` here would silently omit the
        intercept whenever the prediction data happens to contain a constant
        column (e.g. a single row, or a fold where some feature never varies),
        leaving the design out of step with the fitted coefficients.
        """
        if self.fit_intercept:
            with_const = sm.add_constant(X, has_constant="add")
            # Forcing the constant always yields one extra column; that matches
            # the fitted design only when fit() actually added the intercept.
            # NOTE(review): assumes the fitted model exposes its design matrix
            # as a 2-D `exog` attribute, as statsmodels models do — confirm for
            # any custom `model_class`.
            if with_const.shape[1] == self.model_.exog.shape[1]:
                X = with_const
        return self.results_.predict(X)

    def summary(self):
        """Print the summary of the fitted results."""
        print(self.results_.summary())


In [3]:
# Directory that holds the parquet inputs; all paths below are built from it
# instead of repeating the absolute location.
base = "/home/george/code/Fitness/metadata/linear_regression"

# Full path of every entry found in the data directory.
files = [os.path.join(base, name) for name in os.listdir(base)]

# The two files combined in the next cell.
file_0 = pd.read_parquet(os.path.join(base, "07-13-Oct-2019-daily-binary-mon.parquet"))
file_1 = pd.read_parquet(os.path.join(base, "07-13-Oct-2019-daily-binary-tue.parquet"))

In [6]:
# Stack both files, zero-fill gaps, index by `caid`, then one-hot encode
# `day` and `true_state` (first level of each dropped).
df = pd.get_dummies(
    pd.concat([file_0, file_1]).fillna(0).set_index(["caid"]),
    columns=['day', 'true_state'],
    drop_first=True,
)

In [7]:
# Response: the fitness-centre column. Predictors: every other column,
# with a constant column added by statsmodels.
Y = df['Fitness and Recreational Sports Centers']
X = sm.add_constant(
    df.loc[:, df.columns != "Fitness and Recreational Sports Centers"]
)

In [10]:
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.linear_model import LinearRegression

In [12]:
# `needs_proba=False` was the default and the keyword is deprecated/removed in
# newer scikit-learn releases; omitting it builds the same scorer on old
# versions and keeps working on new ones.
auc_scorer = make_scorer(roc_auc_score)

# 5 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# One ROC AUC value per fold, scoring LinearRegression's predictions against Y.
auc_scores = cross_val_score(
    LinearRegression(),
    X.astype('float32'),
    Y.astype('float32'),
    cv=skf,
    scoring=auc_scorer,
)



In [13]:
# Show the array of scores returned by cross_val_score (one entry per fold).
auc_scores

array([0.702355  , 0.70275228, 0.70303648, 0.70332259, 0.70265893])