In [0]:
%pip install scikit-learn==1.1.0

In [0]:
from collections import Counter, defaultdict
from datetime import datetime, timedelta
import heapq
from multiprocessing import Pool

import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from pyspark.sql.functions import countDistinct, col, sum as sum_, concat, lit, avg, when, udf
from pyspark.sql.types import IntegerType, StringType
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


In [0]:
def comparison_mode(month_to_compare, test_end_date):
  """Derive the train and test date windows for a look-back comparison.

  Both windows span 30 calendar days (end date minus 29 days, inclusive).
  The train window ends ``30 * month_to_compare`` days before the test
  window ends.

  Args:
    month_to_compare: number of 30-day steps to look back.
    test_end_date: last day of the test window, as a 'YYYY-MM-DD' string.

  Returns:
    Tuple ``(train_start_date, train_end_date, test_start_date, test_end_date)``.
    The first three are 'YYYY-MM-DD' strings; ``test_end_date`` is returned
    as the parsed ``datetime`` object, not a string.
  """
  date_format = '%Y-%m-%d'
  window_span = timedelta(days=29)

  test_end_date = datetime.strptime(test_end_date, date_format)
  train_end = test_end_date - timedelta(days=30*month_to_compare)

  test_start_date = (test_end_date - window_span).strftime(date_format)
  train_end_date = train_end.strftime(date_format)
  train_start_date = (train_end - window_span).strftime(date_format)
  return train_start_date, train_end_date, test_start_date, test_end_date

In [0]:
def preprocess_income(income):
  """Encode a household-income bracket label as an ordinal code.

  Returns 1-4 for the four known brackets (lowest to highest) and None for
  any other value.
  """
  brackets = (('0-40k', 1), ('40k-100k', 2), ('100k-200k', 3), ('200k+', 4))
  return next((rank for label, rank in brackets if income == label), None)
    
def process_gender(gender):
  """Encode gender as a signed indicator: 'MALE' -> 1, 'FEMALE' -> -1, anything else -> 0."""
  if gender == 'MALE':
    return 1
  return -1 if gender == 'FEMALE' else 0

def process_urbanization(urbanization):
  """Encode urbanization as a signed indicator: 'urban' -> 1, 'rural' -> -1, anything else -> 0."""
  return 1 if urbanization == 'urban' else (-1 if urbanization == 'rural' else 0)

def process_presence_of_children(children):
  """Encode the children bucket label as an ordinal code 0-3.

  Unrecognised labels share code 0 with 'no_children'.
  """
  code_by_label = (
      ('no_children', 0),
      ('1_child', 1),
      ('2-3_children', 2),
      ('>3_children', 3),
  )
  for label, code in code_by_label:
    if children == label:
      return code
  return 0

def process_education_level(level):
  """Encode an education label as an ordinal code 1-4, or 0 when unrecognised."""
  ordinal_by_label = (
      ('BelowHighSchool', 1),      # less than a high school degree
      ('HighSchool', 2),           # high school degree
      ('AssociateOrBachelor', 3),  # associate or bachelor's degree
      ('MasterOrDoctor', 4),       # master's or doctoral degree
  )
  for label, ordinal in ordinal_by_label:
    if level == label:
      return ordinal
  return 0
  
def process_marital_status(level):
  """Encode living arrangement: 'LiveAlone' -> 1, 'LiveWithPartners' -> 2, anything else -> 0."""
  if level == 'LiveAlone':
    return 1
  return 2 if level == 'LiveWithPartners' else 0
  
def process_age(level):
  """Encode an age-bucket label as an ordinal code 1-4.

  Any label outside the four known buckets falls back to 2, the same code
  as the '28-40' bucket.
  """
  buckets = (('<28', 1), ('28-40', 2), ('41-58', 3), ('>58', 4))
  return next((code for label, code in buckets if level == label), 2)

def determine_ds(ds, train_start_date, test_start_date, test_end_date, month_to_compare):
  """Label a date with the 'year-month' of the window it belongs to.

  Args:
    ds: the date to classify; compared against ``date`` objects.
    train_start_date: train window start, 'YYYY-MM-DD' string.
    test_start_date: test window start, 'YYYY-MM-DD' string.
    test_end_date: object exposing ``.year`` and ``.month`` (not parsed here).
    month_to_compare: accepted but not used.

  Returns:
    '<year>-<month>' (month not zero-padded) of ``train_start_date`` when
    ``train_start_date <= ds < test_start_date``; otherwise that of
    ``test_end_date``.
  """
  train_start = datetime.strptime(train_start_date, '%Y-%m-%d').date()
  test_start = datetime.strptime(test_start_date, '%Y-%m-%d').date()
  in_train_side = ds >= train_start and ds < test_start
  anchor = train_start if in_train_side else test_end_date
  return '{}-{}'.format(anchor.year, anchor.month)
  
# Spark UDF wrappers around the encoders above. The functions are passed
# directly; wrapping each one in a pass-through lambda added nothing.
# All demographic encoders produce integer codes; determine_ds produces a
# 'year-month' string label.
preprocess_income_UDF = udf(preprocess_income, IntegerType())
process_gender_UDF = udf(process_gender, IntegerType())
process_urbanization_UDF = udf(process_urbanization, IntegerType())
process_education_level_UDF = udf(process_education_level, IntegerType())
process_marital_status_UDF = udf(process_marital_status, IntegerType())
process_presence_of_children_UDF = udf(process_presence_of_children, IntegerType())
process_age_UDF = udf(process_age, IntegerType())
ds_UDF = udf(determine_ds, StringType())

In [0]:
# Load the demographic Delta table and replace each raw label column with its
# numeric code, column by column, via the UDFs defined above.
dashboard_demo_data_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/dashboard_demo_data"
demo_data_df = (
    spark.read
    .option("header", "true")
    .format("delta")
    .load(dashboard_demo_data_delta_file_path)
)

processed_demo_data_df = (
    demo_data_df
    .withColumn('household_income', preprocess_income_UDF(col('household_income')))
    .withColumn('gender', process_gender_UDF(col('gender')))
    .withColumn('age', process_age_UDF(col('age')))
    .withColumn('education_level', process_education_level_UDF(col('education_level')))
    .withColumn('urbanization', process_urbanization_UDF(col('urbanization')))
    .withColumn('marital_status', process_marital_status_UDF(col('marital_status')))
    .withColumn('presence_of_children', process_presence_of_children_UDF(col('presence_of_children')))
)

In [0]:
def create_standard_data(temp_df, columns_to_encode, numerical_columns, factor_column, target_column):
  """Build the design matrix and target used by the importance models.

  The design matrix holds, in order: the numerical columns, the one-hot
  encoded categorical columns, the factor column, and then one interaction
  column ``<feature>_<factor_column>`` (feature * factor) for every
  non-factor feature.

  Args:
    temp_df: pandas DataFrame containing every referenced column.
    columns_to_encode: categorical column names to one-hot encode.
    numerical_columns: column names used as-is.
    factor_column: name of the single column the interactions are built with.
    target_column: column selector for the target (indexed as ``temp_df[target_column]``).

  Returns:
    Tuple ``(train_X, y)``, both re-indexed from 0.
  """
  # One-hot encode the categoricals; 'if_binary' keeps a single indicator
  # for two-valued categories.
  encoder = OneHotEncoder(drop='if_binary')
  dense_codes = encoder.fit_transform(temp_df[columns_to_encode]).toarray()
  categorical_part = pd.DataFrame(
      dense_codes,
      columns=encoder.get_feature_names_out(columns_to_encode),
  ).reset_index(drop=True)

  numerical_part = temp_df[numerical_columns].reset_index(drop=True)
  factor_part = temp_df[factor_column].reset_index(drop=True)
  train_X = pd.concat([numerical_part, categorical_part, factor_part], axis=1)

  # Interaction terms: snapshot the base feature names first so the columns
  # added below are not themselves interacted.
  base_features = [name for name in train_X.columns if name != factor_column]
  for name in base_features:
    train_X[name+'_'+factor_column] = train_X[name]*train_X[factor_column]

  y = temp_df[target_column].reset_index(drop=True)
  return train_X, y

In [0]:
def aggregate_importance_score(columns, importances_score, numerical_columns, columns_to_encode, factor_column):
  """Collapse per-column importances into one score per original feature.

  One-hot dummy columns are grouped under the categorical feature whose name
  they contain (first match in ``columns_to_encode`` wins); dummy columns
  that also contain ``factor_column`` are grouped under
  ``<feature>_<factor_column>``. Each group keeps the maximum score of its
  members. Every other column except 'const' keeps its own score.

  The group maximum is the true maximum of the member scores. Previously it
  was seeded with 0 by the defaultdict, so a categorical feature whose
  dummies all had negative importance was reported as 0 while numerical
  features kept their negative values.

  Args:
    columns: iterable of design-matrix column names.
    importances_score: iterable of scores aligned with ``columns``.
    numerical_columns: accepted for signature parity; not used.
    columns_to_encode: names of the one-hot encoded categorical features.
    factor_column: name of the interaction factor column.

  Returns:
    ``defaultdict(int)`` mapping feature name to aggregated score.
  """
  scores = defaultdict(int)
  for col_, score in zip(columns, importances_score):
    group = next((each for each in columns_to_encode if each in col_), None)
    if group is None:
      if col_ != 'const':
        scores[col_] = score
      continue
    key = group + '_' + factor_column if factor_column in col_ else group
    # Membership test (not scores[key]) so the defaultdict's 0 never takes part in the max.
    scores[key] = max(score, scores[key]) if key in scores else score
  return scores

def lr_aggregate_importance_score(columns, importances_score, numerical_columns, columns_to_encode, factor_column):
  """Group regression coefficients by the original feature they belong to.

  A column containing a name from ``columns_to_encode`` (first match wins)
  is filed under that name, or under ``<name>_<factor_column>`` when it also
  contains ``factor_column``. Every other column except 'const' is filed
  under its own name.

  Returns:
    ``defaultdict(list)`` mapping feature name to its coefficients, in input order.
  """
  grouped = defaultdict(list)
  for name, coef in zip(columns, importances_score):
    owner = next((each for each in columns_to_encode if each in name), None)
    if owner is not None:
      key = owner + '_' + factor_column if factor_column in name else owner
    elif name != 'const':
      key = name
    else:
      continue  # the intercept is not a feature
    grouped[key].append(coef)
  return grouped
  
def lr_aggregate_p_value(columns, p_value_scores, numerical_columns, columns_to_encode, factor_column):
  """Group p-values, and the column names they came from, by original feature.

  Uses the same grouping rule as the coefficient aggregation: categorical
  dummies go under their feature name (or ``<name>_<factor_column>`` for
  interaction dummies), other columns under their own name, 'const' is skipped.

  Returns:
    Tuple ``(p_values, values_significant_dict)`` of ``defaultdict(list)``;
    both share keys and ordering, the second holding the source column names.
  """
  p_values = defaultdict(list)
  values_significant_dict = defaultdict(list)
  for term, p_value in zip(columns, p_value_scores):
    for each in columns_to_encode:
      if each in term:
        bucket = each + '_' + factor_column if factor_column in term else each
        break
    else:
      # No categorical feature matched this term.
      if term == 'const':
        continue
      bucket = term
    p_values[bucket].append(p_value)
    values_significant_dict[bucket].append(term)
  return p_values, values_significant_dict
  
def finalize_importance_score_and_p_value(scores, p_values, values_significant, threshold):
  """Reduce grouped regression results to one row per feature.

  For every feature the entry with the smallest p-value is selected; its
  p-value and the coefficient at the same position are reported. The names
  of all entries whose p-value is ``<= threshold`` are joined with ';'.

  ``threshold`` is now actually applied; the previous implementation ignored
  it and always compared against a hard-coded 0.1.

  Args:
    scores: mapping feature -> list of coefficients.
    p_values: mapping feature -> list of p-values, aligned with ``scores``.
    values_significant: mapping feature -> list of source column names,
      aligned with ``p_values``.
    threshold: significance cut-off (inclusive) for ``significant_values``.

  Returns:
    Tuple ``(final_scores, final_p_values, final_values_significant)``; the
    first two are ``defaultdict(int)``, the last a plain dict of strings.
  """
  final_scores = defaultdict(int)
  final_p_values = defaultdict(int)
  final_values_significant = {}

  for column, values in p_values.items():
    min_p_value_idx = int(np.argmin(np.array(values)))
    final_p_values[column] = values[min_p_value_idx]
    final_scores[column] = scores[column][min_p_value_idx]
    final_values_significant[column] = ';'.join(
        name
        for name, p_value in zip(values_significant[column], values)
        if p_value <= threshold
    )
  return final_scores, final_p_values, final_values_significant

def find_min_p_value(columns, scores, numerical_columns, columns_to_encode, factor_column):
  """Return the smallest p-value seen per feature.

  A column that is exactly one of ``numerical_columns`` is tracked under its
  own name. Any other column is tracked under every name in
  ``columns_to_encode`` that it contains. Columns matching neither rule are
  ignored. ``factor_column`` is accepted but not used.

  Returns:
    ``defaultdict(int)`` mapping feature name to its minimum p-value.
  """
  min_p_values = defaultdict(int)

  def _keep_smaller(key, p_value):
    # First sighting stores the value as-is; later ones keep the minimum.
    if key in min_p_values:
      min_p_values[key] = min(min_p_values[key], p_value)
    else:
      min_p_values[key] = p_value

  for col_, p_value in zip(columns, scores):
    if col_ in numerical_columns:
      _keep_smaller(col_, p_value)
    else:
      for each in columns_to_encode:
        if each in col_:
          _keep_smaller(each, p_value)
  return min_p_values

def get_permutation_importance(X_df, y, n_repeats, random_state, numerical_columns, columns_to_encode, factor_column):
  """Fit a random forest and report permutation importance per original feature.

  The forest (100 trees) is fitted on ``X_df``/``y`` and the permutation
  importance is computed on that same data. Mean importances are then
  collapsed per feature with ``aggregate_importance_score``.

  Args:
    X_df: design-matrix DataFrame.
    y: target with a ``.values`` array; flattened to 1-D before fitting.
    n_repeats: permutation repeats per column.
    random_state: seed used for both the forest and the permutations.
    numerical_columns, columns_to_encode, factor_column: forwarded to the
      aggregation step.

  Returns:
    DataFrame with columns ``features`` and ``permutation_importance``.
  """
  target = y.values.ravel()

  forest = RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=24).fit(X_df, target)
  permuted = permutation_importance(forest, X_df, target, n_repeats=n_repeats, random_state=random_state, n_jobs=24)

  per_feature = aggregate_importance_score(X_df.columns, permuted.importances_mean, numerical_columns, columns_to_encode, factor_column)
  return pd.DataFrame(per_feature.items(), columns=['features', 'permutation_importance'])

def get_lr_importance(X_df, y, numerical_columns, columns_to_encode, factor_column):
  """Fit an OLS model (with intercept) and summarise it per original feature.

  Coefficients and p-values are read from the second table of the model's
  ``summary2()`` output, grouped per feature, and reduced so each feature
  reports the coefficient and p-value of its most significant term plus the
  ';'-joined names of all terms with p-value <= 0.1.

  Returns:
    DataFrame with columns ``features``, ``lr_importance``, ``min_p_value``
    and ``significant_values``.
  """
  fitted = sm.OLS(y.values.ravel(), sm.add_constant(X_df)).fit()
  coef_table = fitted.summary2().tables[1]

  # reset_index() exposes the term names as an 'index' column.
  coefs = coef_table['Coef.'].reset_index()
  pvals = coef_table['P>|t|'].reset_index()

  lr_importance_scores_dict = lr_aggregate_importance_score(coefs['index'], coefs['Coef.'], numerical_columns, columns_to_encode, factor_column)
  lr_p_values_dict, values_significant_dict = lr_aggregate_p_value(pvals['index'], pvals['P>|t|'], numerical_columns, columns_to_encode, factor_column)

  p_value_threshold = 0.1
  lr_importance_scores, min_p_values, values_significant = finalize_importance_score_and_p_value(
      lr_importance_scores_dict, lr_p_values_dict, values_significant_dict, p_value_threshold)

  lr_importance_df = pd.DataFrame(lr_importance_scores.items(), columns=['features', 'lr_importance'])
  lr_min_p_value_df = pd.DataFrame(min_p_values.items(), columns=['features', 'min_p_value'])
  lr_values_significant_df = pd.DataFrame(values_significant.items(), columns=['features', 'significant_values'])

  p_value_details_df = lr_min_p_value_df.merge(lr_values_significant_df, on=['features'])
  return lr_importance_df.merge(p_value_details_df, on=['features'])

In [0]:
# Categorical demographics that get one-hot encoded in create_standard_data.
columns_to_encode = ['employment_status', 'living_status', 'ethnicity', 'region']

# Demographics already converted to integer codes by the UDFs above.
numerical_columns = [
    'gender',
    'marital_status',
    'education_level',
    'presence_of_children',
    'urbanization',
    'age',
    'household_income',
]

# Indicator column added to the event data: 1 inside the test window, 0 otherwise.
factor_column = 'for_comparison'

# Kept as a one-element list so selecting it from a DataFrame yields a DataFrame.
target_column = ['total_spent_weighted']

In [0]:
def cal_feature_importance(event_data_df, processed_demo_data_df, category, month_back, test_end_date, columns_to_encode, numerical_columns, factor_column, target_column):
  """Run the full feature-importance analysis for one category and one comparison.

  Selects the category's events that fall in the train or test window, flags
  test-window rows in ``factor_column``, joins the demographics on
  ``user_id``, and computes permutation and OLS importances.

  Returns:
    DataFrame of per-feature importances tagged with ``category``,
    ``target_year``, ``target_month`` and ``month_back``; or None when 10 or
    fewer complete rows remain after dropping rows with missing values.
  """
  train_start_date, train_end_date, test_start_date, test_end_date = comparison_mode(month_back, test_end_date)

  date_col = col('date')
  in_test_window = (date_col >= lit(test_start_date)) & (date_col <= lit(test_end_date))
  in_train_window = (date_col >= lit(train_start_date)) & (date_col <= lit(train_end_date))

  selected_event_data_df = (
      event_data_df
      .filter((in_test_window | in_train_window) & (col('category') == category))
      # 1 when the purchase falls in the target (test) window, else 0
      .withColumn(factor_column, when(in_test_window, 1).otherwise(0))
  )

  ll = selected_event_data_df.join(processed_demo_data_df, 'user_id', 'inner').toPandas().dropna()
  if ll.shape[0] <= 10:
    return None

  X, y = create_standard_data(ll, columns_to_encode, numerical_columns, factor_column, target_column)
  permutation_importance_df = get_permutation_importance(X, y, 10, 0, numerical_columns, columns_to_encode, factor_column)
  lr_importance_df = get_lr_importance(X, y, numerical_columns, columns_to_encode, factor_column)

  target_end = pd.to_datetime(test_end_date)
  return permutation_importance_df.merge(lr_importance_df, on=['features']).assign(
      category=category,
      target_year=str(target_end.year),
      target_month=str(target_end.month),
      month_back=month_back,
  )

In [0]:
# Per-person, per-category transaction data for all retailers.
all_retalers_transaction_data_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_transaction_per_category_per_person"
event_data_df = spark.read.option("header", "true").format("delta").load(all_retalers_transaction_data_delta_file_path)

# Each target window ends on the last day of a month, computed as
# "first day of the following month minus one day".
target_test_end_dates = [
    (pd.to_datetime(first_of_next_month) - timedelta(days=1)).strftime("%Y-%m-%d")
    for first_of_next_month in ('2022-09-01', '2022-10-01', '2022-11-01')
]

# Look-back distances, in 30-day steps, passed to comparison_mode.
month_back_list = [1, 2, 3, 12]

# Distinct non-empty categories. The truthiness test also skips NULL
# categories, which previously crashed on len(None).
categories = [
    row['category']
    for row in event_data_df.select('category').distinct().collect()
    if row['category']
]


In [0]:
def cal_feature_importance_(ll, columns_to_encode, numerical_columns, factor_column, target_column,
                            category=None, test_end_date=None, month_back=None):
  """Compute feature importances for one already-prepared pandas frame.

  Args:
    ll: pandas DataFrame holding the feature, factor and target columns.
    columns_to_encode: categorical columns to one-hot encode.
    numerical_columns: columns used as-is.
    factor_column: name of the interaction factor column.
    target_column: column selector for the target.
    category: label written to the ``category`` output column.
    test_end_date: end of the target window; anything ``pd.to_datetime``
      accepts. Its year and month are written as strings.
    month_back: label written to the ``month_back`` output column.

    The last three are optional for backward compatibility: when left as
    None the value is read from the module-level variable of the same name,
    which is how this function originally obtained them. Prefer passing them
    explicitly so the result does not depend on notebook state.

  Returns:
    DataFrame of per-feature importances with the tag columns added, or
    None when ``ll`` has 10 or fewer rows.

  Raises:
    NameError: a tag was not passed and no module-level variable with that
      name exists.
  """
  if ll.shape[0] <= 10:
    return None

  # Resolve the tags before the expensive model fits so a missing value
  # fails immediately instead of after the computation.
  explicit_tags = {'category': category, 'test_end_date': test_end_date, 'month_back': month_back}
  module_state = globals()
  tags = {}
  for tag_name, tag_value in explicit_tags.items():
    if tag_value is None:
      if tag_name not in module_state:
        raise NameError("name '{}' is not defined; pass it explicitly to cal_feature_importance_".format(tag_name))
      tag_value = module_state[tag_name]
    tags[tag_name] = tag_value

  X, y = create_standard_data(ll, columns_to_encode, numerical_columns, factor_column, target_column)
  permutation_importance_df = get_permutation_importance(X, y, 10, 0, numerical_columns, columns_to_encode, factor_column)

  lr_importance_df = get_lr_importance(X, y, numerical_columns, columns_to_encode, factor_column)

  feature_importance_df = permutation_importance_df.merge(lr_importance_df, on=['features'])
  target_end = pd.to_datetime(tags['test_end_date'])
  feature_importance_df['category'] = tags['category']
  feature_importance_df['target_year'] = str(target_end.year)
  feature_importance_df['target_month'] = str(target_end.month)
  feature_importance_df['month_back'] = tags['month_back']
  return feature_importance_df

In [0]:
# Run the analysis for every (target window, look-back, category) combination
# and collect the per-feature results into one pandas DataFrame.
results = []
for target_test_end_date in target_test_end_dates:
  for month_back in month_back_list:
    # select data for the specified window time
    train_start_date, train_end_date, test_start_date, test_end_date = comparison_mode(month_back, target_test_end_date)
    in_test_window = (col('date') >= lit(test_start_date)) & (col('date') <= lit(test_end_date))
    in_train_window = (col('date') >= lit(train_start_date)) & (col('date') <= lit(train_end_date))
    selected_event_data_df = event_data_df.filter(in_test_window | in_train_window)

    # add another feature that indicates if the people make purchases in the target time window of the comparison
    selected_event_data_df = selected_event_data_df.withColumn(factor_column, when(in_test_window, 1).otherwise(0))
    joined_df = selected_event_data_df.join(processed_demo_data_df, 'user_id', 'inner')
    prepared_df = joined_df.toPandas().dropna()

    # loop all categories
    for i, category in enumerate(categories):
        print("Target window: {} ({}/{})  Now processing feature importance analysis for [{}] ===========>  Analyzing {}-month comparison".format(test_end_date, i+1, len(categories), category, month_back))
        # The pool is created inside this loop on purpose: when not given
        # explicit tags, cal_feature_importance_ reads `category`,
        # `test_end_date` and `month_back` from module globals, and the
        # workers only see the values that exist when the pool starts
        # (presumably via the fork start method - confirm on this platform).
        # NOTE(review): each pool receives exactly one task, so categories
        # run one at a time; the 24 workers add no parallelism here.
        with Pool(processes = 24) as pool:
            # apply() blocks until the task finishes and re-raises any worker
            # error right away, instead of after the whole run has completed.
            category_result = pool.apply(cal_feature_importance_, (prepared_df[prepared_df.category==category],
                                                                   columns_to_encode,
                                                                   numerical_columns,
                                                                   factor_column,
                                                                   target_column))
        # The worker returns None for categories with too few rows. The old
        # filter tested the AsyncResult handle, which is never None.
        if category_result is not None:
            results.append(category_result)
    print("="*100)
  print("*"*200)

if not results:
  raise ValueError("No category produced a feature-importance result; nothing to concatenate.")
feature_importance_collection = pd.concat(results, ignore_index=True)


In [0]:
# feature_importance_collection = pd.DataFrame()
# #####
# for target_test_end_date in target_test_end_dates:
#   for i, category in enumerate(categories):  
#     for month_back in month_back_list:
#         print("Target window: {} ({}/{})  Now processing feature importance analysis for [{}] ===========>  Analyzing {}-month comparison".format(target_test_end_date, i+1, len(categories), category, month_back))
#         new_feature_importance = cal_feature_importance(event_data_df, processed_demo_data_df, category, month_back, target_test_end_date, columns_to_encode, numerical_columns, factor_column, target_column)
#         if new_feature_importance is not None:
#           feature_importance_collection = pd.concat([feature_importance_collection, new_feature_importance])
#     print("="*100)
#   print("*"*200)

In [0]:
# Convert the collected pandas results to a Spark DataFrame and persist them as Delta.
dashboard_feature_importance_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_dashboard_feature_importance"
feature_importance_py_df = spark.createDataFrame(feature_importance_collection)

# for initiation: remove whatever is at the target path, then write from scratch
dbutils.fs.rm(dashboard_feature_importance_delta_file_path, True)
(
    feature_importance_py_df.write
    .format("delta")
    .save(dashboard_feature_importance_delta_file_path)
)

# for update
# feature_importance_py_df.write.format("delta").mode('append').save(dashboard_feature_importance_delta_file_path)


In [0]:
%sql
-- Re-register the dashboard table so it points at the Delta files stored at the LOCATION below.
-- The drop comes first so this cell can be re-run without a "table already exists" error.
drop table if exists disqo_dashboard_db.all_retailers_dashboard_feature_importance;
-- The LOCATION is the same path the feature-importance DataFrame is saved to by the preceding write cell.
create TABLE disqo_dashboard_db.all_retailers_dashboard_feature_importance
USING delta
LOCATION "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_dashboard_feature_importance"