# StateFarm MLE Take Home

Start Time: 10:30am

End Time:  

In [None]:
import numpy as np
import pandas as pd
import sys
import statsmodels.api as sm
import bokeh
import collections as ccc
import logging

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import logging

# strftime pattern for log timestamps: year-month-day hour:minute:second.
# The day field must be %d; %y is the two-digit year and only looks like a
# day when the two happen to coincide.
date_strftime_format = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt=date_strftime_format,
)


class DataSplitter:
    """
    A class for splitting data into training, validation, and optionally test sets.

    Attributes:
        df (pandas.DataFrame): The complete dataset.
        y_vars (list of str): Column names of the target variables.
        splits (dict): Maps "train", "valid" and "test" to a pair of index
            objects (feature index labels, target index labels), or None when
            that split has not been created.
        X_train, X_valid, X_test (pandas.DataFrame): Training, validation, and test features.
        y_train, y_valid, y_test (pandas.DataFrame): Training, validation, and test targets.
    """

    def __init__(self, df, y_vars):
        self.df = df
        self.y_vars = y_vars
        self.splits = {"train": None, "valid": None, "test": None}
        self.X_train = self.X_valid = self.X_test = None
        self.y_train = self.y_valid = self.y_test = None

    def split_data(
        self, test_size=0.2, val_size=0.1, random_state=13, create_test_set=False
    ):
        """
        Splits the dataset into training, validation, and optionally test sets.

        The validation set is carved out of the full dataset first. If a test
        set is requested, the remaining rows are split again into the final
        training set and the test set, so a fractional 'test_size' is a
        proportion of the rows left after the validation split, not of the
        whole dataset.

        Parameters:
            test_size (float or int): Passed to train_test_split for the second
                                      split; a float is a proportion, an int an
                                      absolute number of samples.
            val_size (float or int): Passed to train_test_split for the first
                                     (validation) split.
            random_state (int): Seed used for both shuffles.
            create_test_set (bool): Whether to create a separate test set. If False,
                                    the test set-related attributes (X_test, y_test)
                                    are None after the call.
        """
        x_train_full, x_val, y_train_full, y_val = train_test_split(
            self.df.drop(columns=self.y_vars),
            self.df[self.y_vars],
            test_size=val_size,
            random_state=random_state,
        )

        if create_test_set:
            x_train, x_test, y_train, y_test = train_test_split(
                x_train_full,
                y_train_full,
                test_size=test_size,
                random_state=random_state,
            )
            self.splits["test"] = (x_test.index, y_test.index)
        else:
            x_train, y_train = x_train_full, y_train_full
            # Clear any test split left over from an earlier call.
            self.splits["test"] = None

        self.splits["train"] = (x_train.index, y_train.index)
        self.splits["valid"] = (x_val.index, y_val.index)

        self._extract_splits()

    def _extract_splits(self):
        """
        Internal method to extract features and targets for each set based on splits.

        The stored splits are index *labels*, so rows are selected with .loc.
        Positional .iloc only gives the same rows when the frame has a default
        0..n-1 index. Index labels are assumed to be unique.
        """
        features = self.df.drop(columns=self.y_vars)
        targets = self.df[self.y_vars]

        train_x_labels, train_y_labels = self.splits["train"]
        valid_x_labels, valid_y_labels = self.splits["valid"]
        self.X_train = features.loc[train_x_labels]
        self.y_train = targets.loc[train_y_labels]
        self.X_valid = features.loc[valid_x_labels]
        self.y_valid = targets.loc[valid_y_labels]

        logging.info(f"Training set: {self.X_train.shape}, {self.y_train.shape}")
        logging.info(f"Validation set: {self.X_valid.shape}, {self.y_valid.shape}")

        if self.splits["test"]:
            test_x_labels, test_y_labels = self.splits["test"]
            self.X_test = features.loc[test_x_labels]
            self.y_test = targets.loc[test_y_labels]
            logging.info(f"Test set: {self.X_test.shape}, {self.y_test.shape}")
        else:
            # Do not leave frames from a previous split behind.
            self.X_test = self.y_test = None
            logging.info("No test set created.")


class DataPreprocessor:
    """
    A class for preprocessing data for machine learning tasks.

    This class handles the conversion of monetary and percentage string values to floats,
    imputation of missing values, scaling of features, and the creation of dummy variables
    for categorical columns. It can be used for both fitting and transforming training data,
    as well as transforming new data with the same transformations applied to the training data.

    Attributes:
        columns_to_convert (list of str): Columns that contain monetary or percentage string values.
        columns_to_impute (list of str): Stored for reference only; imputation and scaling
            are applied to every column that is neither a dummy column nor the target.
        columns_to_dummy (list of str): Categorical columns to be converted to dummy variables.
        target_column (str, optional): The name of the target variable column. Default is None.
        imputer (SimpleImputer): Mean imputer used for missing value imputation.
        scaler (StandardScaler): The scaler object used for feature scaling.
        dummy_columns (dict): Maps each categorical column to the dummy column names
            produced when fitting.

    Methods:
        fit_transform(df): Fits the preprocessor to the data and transforms the data.
        transform(df): Transforms a new dataset using the transformations fitted on the training data.
    """

    def __init__(
        self,
        columns_to_convert,
        columns_to_impute,
        columns_to_dummy,
        target_column=None,
    ):
        """
        Initializes the DataPreprocessor with specified columns for conversion, imputation,
        dummy variable creation, and optionally a target column.

        Parameters:
            columns_to_convert (list of str): Columns with monetary or percentage string values to convert.
            columns_to_impute (list of str): Columns for which missing values will be imputed.
            columns_to_dummy (list of str): Categorical columns to be converted into dummy variables.
            target_column (str, optional): The name of the target variable column. Default is None.
        """
        self.columns_to_convert = columns_to_convert
        self.columns_to_impute = columns_to_impute
        self.columns_to_dummy = columns_to_dummy
        self.target_column = target_column
        self.imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
        self.scaler = StandardScaler()
        self.dummy_columns = {}

    def _convert_columns(self, df):
        """
        Converts monetary and percentage values in specified columns from string to float.

        Works on a copy so the caller's dataframe is left untouched. Mutating
        the input made a second call on the same frame skip the conversion and
        changed the raw inputs the caller still holds.

        Parameters:
            df (pandas.DataFrame): The dataframe to process.

        Returns:
            pandas.DataFrame: A new dataframe with converted columns.
        """
        df = df.copy()
        for col in self.columns_to_convert:
            if df[col].dtype == object:
                # Strip "$", "," and "%"; "(1.5)" becomes "-1.5".
                df[col] = (
                    df[col]
                    .replace(
                        {r"\$": "", ",": "", "%": "", r"\(": "-", r"\)": ""}, regex=True
                    )
                    .astype(float)
                )
            else:
                logging.warning(
                    f"Column {col} is not of string type and will not be converted."
                )
        return df

    def _numeric_features(self, df):
        """Returns df without the dummy columns and, when present, the target column."""
        columns_to_drop = self.columns_to_dummy[:]
        if self.target_column and self.target_column in df.columns:
            columns_to_drop.append(self.target_column)
        return df.drop(columns=columns_to_drop)

    def _encode_dummies(self, df, fit):
        """
        Builds one dummy-variable frame per categorical column.

        When fitting, the first level is dropped and the resulting column names
        are recorded. When transforming, every level present is encoded and the
        frame is then aligned to the recorded names. Dropping the first level
        of the *new* data instead would drop whatever level sorts first in that
        batch, which for a single row is the row's own category.

        Parameters:
            df (pandas.DataFrame): Frame containing the categorical columns.
            fit (bool): True to record the dummy columns, False to reuse them.

        Returns:
            list of pandas.DataFrame: Dummy frames in columns_to_dummy order.
        """
        frames = []
        for col in self.columns_to_dummy:
            dummies = pd.get_dummies(
                df[col], drop_first=fit, prefix=col, prefix_sep="_", dummy_na=True
            )
            if fit:
                self.dummy_columns[col] = dummies.columns.tolist()
            else:
                dummies = dummies.reindex(
                    columns=self.dummy_columns[col], fill_value=0
                )
            frames.append(dummies)
        return frames

    def fit_transform(self, df):
        """
        Fits the preprocessor to the data and transforms the data.
        This includes converting specified columns, imputing missing values,
        scaling, and creating dummy variables.

        Parameters:
            df (pandas.DataFrame): The training dataset to fit and transform.

        Returns:
            pandas.DataFrame: The transformed dataframe.
        """
        df = self._convert_columns(df)
        numeric = self._numeric_features(df)

        df_imputed = pd.DataFrame(
            self.imputer.fit_transform(numeric),
            columns=numeric.columns,
            index=df.index,
        )
        df_imputed_std = pd.DataFrame(
            self.scaler.fit_transform(df_imputed),
            columns=df_imputed.columns,
            index=df.index,
        )

        dummy_frames = self._encode_dummies(df, fit=True)
        return pd.concat([df_imputed_std] + dummy_frames, axis=1, sort=False)

    def transform(self, df):
        """
        Transforms a dataset using the transformations fitted on the training data.
        This includes converting specified columns, imputing missing values,
        scaling, and creating dummy variables based on the training data.

        Parameters:
            df (pandas.DataFrame): The new dataset to transform.

        Returns:
            pandas.DataFrame: The transformed dataframe.
        """
        df = self._convert_columns(df)
        numeric = self._numeric_features(df)

        df_imputed = pd.DataFrame(
            self.imputer.transform(numeric),
            columns=numeric.columns,
            index=df.index,
        )
        df_imputed_std = pd.DataFrame(
            self.scaler.transform(df_imputed),
            columns=df_imputed.columns,
            index=df.index,
        )

        dummy_frames = self._encode_dummies(df, fit=False)
        return pd.concat([df_imputed_std] + dummy_frames, axis=1, sort=False)


class LogisticRegressionAnalysis:
    """
    A class for conducting logistic regression analysis.

    This class simplifies the process of fitting a logistic regression model,
    selecting important variables based on coefficients, and evaluating the model's
    performance using the C-statistic (ROC AUC score).

    Attributes:
        exploratory_LR (LogisticRegression): The initial logistic regression model.
        variables (list): List of selected variables based on the model's coefficients.
        final_model (statsmodels.Logit): The final logistic regression model after variable selection.
        final_result (statsmodels.LogitResults): Results of the final logistic regression model.
    """

    def __init__(self):
        """Initializes the LogisticRegressionAnalysis class with default values."""
        self.exploratory_LR = None
        self.variables = []
        self.final_model = None
        self.final_result = None

    @staticmethod
    def _rank_variables(names, coefs, top_n):
        """
        Returns the names whose coefficients have the largest squared value.

        Args:
            names (iterable of str): Feature names, aligned with coefs.
            coefs (array-like of float): One coefficient per feature.
            top_n (int): How many names to keep.

        Returns:
            list: Up to top_n names, largest squared coefficient first.
        """
        results = pd.DataFrame({"name": list(names), "coefs": coefs})
        results["coefs_squared"] = results["coefs"] ** 2
        return results.nlargest(top_n, "coefs_squared")["name"].tolist()

    def fit_exploratory_model(self, df, target_column, top_n=25):
        """
        Fits an exploratory logistic regression model to identify important variables.

        The model is L1-penalised and fitted without an intercept; variables are
        ranked by squared coefficient.

        Args:
            df (pandas.DataFrame): The dataset to fit the model on.
            target_column (str): The name of the target variable in the dataset.
            top_n (int): Number of variables to select. Defaults to 25.

        Returns:
            list: The top_n variables selected based on the coefficients.
        """
        features = df.drop(columns=[target_column])
        self.exploratory_LR = LogisticRegression(
            penalty="l1", fit_intercept=False, solver="liblinear"
        )
        self.exploratory_LR.fit(features, df[target_column])

        self.variables = self._rank_variables(
            features.columns, self.exploratory_LR.coef_[0], top_n
        )

        logging.info("Selected Variables: %s", self.variables)
        return self.variables

    def fit_final_model(self, df, target_column):
        """
        Fits the final logistic regression model using selected variables.

        Args:
            df (pandas.DataFrame): The dataset to fit the model on.
            target_column (str): The name of the target variable in the dataset.

        Returns:
            The summary object produced by the fitted statsmodels result
            (also written to the log).
        """
        self.final_model = sm.Logit(df[target_column], df[self.variables])
        self.final_result = self.final_model.fit()

        logging.info(self.final_result.summary())
        return self.final_result.summary()

    def evaluate_model(self, df, target_column):
        """
        Evaluates the model's performance using the C-statistic (ROC AUC score).

        fit_final_model must have been called first.

        Args:
            df (pandas.DataFrame): The dataset to evaluate the model on.
            target_column (str): The name of the target variable in the dataset.

        Returns:
            tuple: (outcomes, grouped_outcomes), where outcomes is a DataFrame with
            columns "probs", "y" and "prob_bin" (one of 20 quantile bins of the
            predicted probability), and grouped_outcomes is a Series holding the
            sum of the target in each probability bin.
        """
        outcomes = pd.DataFrame(self.final_result.predict(df[self.variables])).rename(
            columns={0: "probs"}
        )
        outcomes["y"] = df[target_column]

        roc_auc = roc_auc_score(outcomes["y"], outcomes["probs"])
        logging.info("The C-Statistics is %s", roc_auc)

        outcomes["prob_bin"] = pd.qcut(outcomes["probs"], q=20)
        grouped_outcomes = outcomes.groupby(["prob_bin"])["y"].sum()
        return outcomes, grouped_outcomes


# Load the labelled training file.
train_val = pd.read_csv("statefarm/files/data/exercise_26_train.csv")

# Hold out 10% of the rows for validation, then 4000 of the remaining rows
# for testing.
data_splitter = DataSplitter(df=train_val, y_vars=["y"])
data_splitter.split_data(
    val_size=0.1, test_size=4000, create_test_set=True, random_state=13
)

X_train, X_valid, X_test = (
    data_splitter.X_train,
    data_splitter.X_valid,
    data_splitter.X_test,
)
y_train, y_valid, y_test = (
    data_splitter.y_train,
    data_splitter.y_valid,
    data_splitter.y_test,
)

# Column roles: string-encoded numbers, categoricals, and everything else
# except the target.
columns_to_convert = ["x12", "x63"]
columns_to_dummy = ["x5", "x31", "x81", "x82"]
_not_imputed = {"y", *columns_to_dummy, *columns_to_convert}
columns_to_impute = [col for col in train_val.columns if col not in _not_imputed]

# Fit the preprocessing on the training rows only, then reuse it unchanged
# for the validation and test rows.
preprocessor = DataPreprocessor(
    columns_to_convert=columns_to_convert,
    columns_to_impute=columns_to_impute,
    columns_to_dummy=columns_to_dummy,
    target_column="y",
)
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
X_test = preprocessor.transform(X_test)

# Re-attach the targets; the original row labels are discarded.
train_df = pd.concat([X_train, y_train], axis=1, sort=False).reset_index(drop=True)
valid_df = pd.concat([X_valid, y_valid], axis=1, sort=False).reset_index(drop=True)
test_df = pd.concat([X_test, y_test], axis=1, sort=False).reset_index(drop=True)

logging.basicConfig(level=logging.INFO)
logging.info("Data Set Sizes After Preprocessing..")
logging.info(f"Training set: {train_df.shape}")
logging.info(f"Validation set: {valid_df.shape}")
logging.info(f"Test set: {test_df.shape}")

# Variable selection uses the training rows only.
lr_analysis = LogisticRegressionAnalysis()
important_variables = lr_analysis.fit_exploratory_model(train_df, "y")

# NOTE(review): the final model is fitted on train + validation + test and then
# evaluated on that same combined frame, so the reported C-statistic is
# in-sample. Confirm this is intended.
combined_df = pd.concat([train_df, valid_df, test_df])
final_model_summary = lr_analysis.fit_final_model(combined_df, "y")
result, evaluation_result = lr_analysis.evaluate_model(combined_df, "y")

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


2023-11-23 13:47:28 Training set: (32000, 100), (32000, 1)
2023-11-23 13:47:28 Validation set: (4000, 100), (4000, 1)
2023-11-23 13:47:28 Test set: (4000, 100), (4000, 1)
2023-11-23 13:47:29 Data Set Sizes After Preprocessing..
2023-11-23 13:47:29 Training set: (32000, 122)
2023-11-23 13:47:29 Validation set: (4000, 122)
2023-11-23 13:47:29 Test set: (4000, 122)
2023-11-23 13:47:29 Selected Variables: ['x5_saturday', 'x81_July', 'x81_December', 'x31_japan', 'x81_October', 'x5_sunday', 'x31_asia', 'x81_February', 'x91', 'x81_May', 'x5_monday', 'x81_September', 'x81_March', 'x53', 'x81_November', 'x44', 'x81_June', 'x12', 'x5_tuesday', 'x81_August', 'x81_January', 'x62', 'x31_germany', 'x58', 'x56']
Optimization terminated successfully.
         Current function value: 0.536475
         Iterations 6
2023-11-23 13:47:30                            Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                40000
Model:       

In [None]:
# Display the columns that are neither the target, a dummy column, nor a
# string-to-float conversion column.
columns_to_impute

['x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x6',
 'x7',
 'x8',
 'x9',
 'x10',
 'x11',
 'x13',
 'x14',
 'x15',
 'x16',
 'x17',
 'x18',
 'x19',
 'x20',
 'x21',
 'x22',
 'x23',
 'x24',
 'x25',
 'x26',
 'x27',
 'x28',
 'x29',
 'x30',
 'x32',
 'x33',
 'x34',
 'x35',
 'x36',
 'x37',
 'x38',
 'x39',
 'x40',
 'x41',
 'x42',
 'x43',
 'x44',
 'x45',
 'x46',
 'x47',
 'x48',
 'x49',
 'x50',
 'x51',
 'x52',
 'x53',
 'x54',
 'x55',
 'x56',
 'x57',
 'x58',
 'x59',
 'x60',
 'x61',
 'x62',
 'x64',
 'x65',
 'x66',
 'x67',
 'x68',
 'x69',
 'x70',
 'x71',
 'x72',
 'x73',
 'x74',
 'x75',
 'x76',
 'x77',
 'x78',
 'x79',
 'x80',
 'x83',
 'x84',
 'x85',
 'x86',
 'x87',
 'x88',
 'x89',
 'x90',
 'x91',
 'x92',
 'x93',
 'x94',
 'x95',
 'x96',
 'x97',
 'x98',
 'x99']

In [None]:
import joblib

In [None]:
import json

test = pd.read_csv("statefarm/files/data/exercise_26_test.csv")
MODEL_PATH = "statefarm/files/models/logistic_regression_model.pkl"
PREPROCESSOR_PATH = "statefarm/files/models/preprocessor.pkl"

# NOTE: joblib.load unpickles the files; only load artifacts from a trusted source.
model = joblib.load(MODEL_PATH)
preprocessor = joblib.load(PREPROCESSOR_PATH)

# Keep the first ten rows of the test file as sample request data.
# No column is dropped here.
sample_data = test.head(10)

In [None]:
# Round-trip the first sample row through JSON so it looks like a single
# API request payload.
input_df = pd.DataFrame([json.loads(sample_data.iloc[0].to_json())])

# Score the record that was just built (input_df), not a frame left over from
# another cell.
response = pd.DataFrame(
    model.final_result.predict(preprocessor.transform(input_df)[model.variables])
).rename(columns={0: "phat"})

# NOTE(review): the debrief asks for a cutoff at the 75th percentile of the
# predicted probabilities; 0.75 here is a raw probability threshold. Confirm
# which one is intended.
response["business_outcome"] = np.where(response["phat"] >= 0.75, 1, 0)

response = response.to_dict(orient="records")[0]
response

2023-11-23 14:27:40 Column x12 is not of string type and will not be converted.
2023-11-23 14:27:40 Column x63 is not of string type and will not be converted.


{'phat': 0.34711484677709, 'business_outcome': 0}

In [None]:
# Serialize all sample rows as one JSON string with orient="records".
sample_json_batch = sample_data.to_json(orient="records")

In [None]:
# Flag an event where the predicted probability reaches the 0.75 cutoff.
response["business_outcome"] = np.where(response["phat"] >= 0.75, 1, 0)

2023-11-23 14:19:00 Column x12 is not of string type and will not be converted.
2023-11-23 14:19:00 Column x63 is not of string type and will not be converted.


'{"phat":{"0":0.3471148468},"variable_name":{"0":"business_outcome"}}'

In [None]:
# Report which columns of the test file contain at least one missing value.
is_null_df = test.isna()
columns_with_null = is_null_df.any()
columns_with_null_values = test.columns[columns_with_null]

print(columns_with_null_values)

Index(['x0', 'x1', 'x2', 'x3', 'x4', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
       'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22',
       'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32',
       'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42',
       'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49', 'x50', 'x51', 'x52',
       'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59', 'x60', 'x61', 'x62',
       'x64', 'x65', 'x66', 'x67', 'x68', 'x69', 'x70', 'x71', 'x72', 'x73',
       'x74', 'x75', 'x76', 'x77', 'x78', 'x79', 'x80', 'x83', 'x84', 'x85',
       'x86', 'x87', 'x88', 'x89', 'x90', 'x91', 'x92', 'x93', 'x94', 'x95',
       'x96', 'x97', 'x98', 'x99'],
      dtype='object')


In [None]:
# List the distinct values of the categorical column x5 in the test file.
test["x5"].unique()

array(['monday', 'tuesday', 'friday', 'saturday', 'sunday', 'thursday',
       'wednesday'], dtype=object)

In [None]:
test = pd.read_csv("statefarm/files/data/exercise_26_test.csv")

# Keep the first ten rows of the test file as sample request data.
# No column is dropped here.
sample_data = test.head(10)

# Convert these rows to JSON: the first row as a single object, all ten rows
# as a list of records.
sample_json_single = sample_data.iloc[0].to_json()
sample_json_batch = sample_data.to_json(orient="records")
# One-row frame rebuilt from the single-record JSON.
sample_df = pd.DataFrame([json.loads(sample_json_single)])
sample_json_single

'{"x0":0.042317,"x1":-3.344721,"x2":4.6351242122,"x3":-0.5983959993,"x4":-0.6477715046,"x5":"monday","x6":0.184902,"x7":46.690015,"x8":3.034132,"x9":0.364704,"x10":14.260733,"x11":-1.559332,"x12":"$5,547.78","x13":0.520324,"x14":31.212255,"x15":4.891671,"x16":0.357763,"x17":14.766366,"x18":-17.467243,"x19":0.224628,"x20":0.096752,"x21":1.305564,"x22":0.353632,"x23":3.909028,"x24":-91.273052,"x25":1.396952,"x26":4.401593,"x27":0.443086,"x28":14.048787,"x29":-0.932243,"x30":5.255472,"x31":"germany","x32":0.54199153,"x33":2.98948039,"x34":-1.78334189,"x35":0.80127315,"x36":-2.60231221,"x37":3.39682926,"x38":-1.22322646,"x39":-2.20977636,"x40":-68.69,"x41":522.25,"x42":-428.69,"x43":381.37,"x44":0.0197503,"x45":0.75116479,"x46":0.8630479008,"x47":-1.0383166613,"x48":-0.2726187635,"x49":-0.3430207259,"x50":0.3109008666,"x51":-0.797841974,"x52":-2.0390175153,"x53":0.87182889,"x54":0.14373012,"x55":-1.15212514,"x56":-2.1703139704,"x57":-0.267842962,"x58":0.212110633,"x59":1.6926559407,"x60":-

In [None]:
# Capture the raw model inputs before preprocessing so the response echoes
# what the caller sent.
model_inputs = sample_df.to_dict(orient="records")[0]

# Predicted probability for each row, and the 0/1 outcome at the 0.75 cutoff.
probability = np.asarray(
    model.final_result.predict(preprocessor.transform(sample_df)[model.variables]),
    dtype=float,
)
prediction = np.where(probability >= 0.75, 1, 0)

# Preparing the final response (plain Python scalars so it serializes to JSON)
response = {
    **model_inputs,
    "business_outcome": int(prediction[0]),
    "phat": float(probability[0]),
}

# Sorting the keys alphabetically
response = dict(sorted(response.items()))

response

NameError: name 'prediction' is not defined

In [None]:
# Locations of the persisted model and preprocessor.
MODEL_PATH = "statefarm/files/models/logistic_regression_model.pkl"
PREPROCESSOR_PATH = "statefarm/files/models/preprocessor.pkl"

# NOTE: joblib.load unpickles the files; only load artifacts from a trusted source.
model = joblib.load(MODEL_PATH)
preprocessor = joblib.load(PREPROCESSOR_PATH)

In [None]:
# predict() needs the transformed feature frame restricted to the model's
# selected variables, not the transform method itself.
model.final_result.predict(preprocessor.transform(sample_df)[model.variables])

In [None]:
# Reload the labelled training file and display it.
train_val = pd.read_csv("statefarm/files/data/exercise_26_train.csv")

train_val

## Debrief
In the final discussion with the business partner, the partner was thrilled with the rank-order ability of the model.  Based on a combination of capacity and accuracy, the partner would like to classify any observation that would fall in the top 5 bins as an event; for simplicity we will say the cutoff is at the 75th percentile.  For the API, please return the predicted outcome (variable name is business_outcome), predicted probability (variable name is phat), and all model inputs; the variables should be returned in alphabetical order in the API return.

In [None]:
# Overview of data types
print("object dtype:", raw_train.columns[raw_train.dtypes == "object"].tolist())
# Compare against "int64" explicitly: the bare "int" alias is platform-dependent
# and would not always match the int64 columns this line is labelled with.
print("int64 dtype:", raw_train.columns[raw_train.dtypes == "int64"].tolist())
print("The rest of the columns have float64 dtypes.")

# Investigate Object Columns
def investigate_object(df, max_unique=13):
    """
    Print the unique values of every object-dtype column of df.

    One line is printed per column. When a column has more than max_unique
    distinct values, only the first max_unique are shown, followed by '...'.

    Parameters:
        df (pandas.DataFrame): The dataframe to inspect.
        max_unique (int): Largest number of distinct values printed in full.
            Defaults to 13.
    """
    for col in df.columns[df.dtypes == "object"]:
        uniques = df[col].unique()
        if len(uniques) > max_unique:
            uniques = np.append(uniques[:max_unique], "...")
        print(f"{col}:", "Unique Values:", uniques)


# Print the unique values of each object-dtype column in the raw training data.
investigate_object(raw_train)