In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [None]:
# Load the pre-filtered dataset and preview it.
# NOTE(review): "/content/..." is a hard-coded absolute path (looks like a Colab
# working directory — confirm); adjust it when running in another environment.
data = pd.read_csv("/content/filtered_data7_withoutSMOTE.csv")
print(data.shape)
data.head()

(2052, 27)


Unnamed: 0,price,retail_price,units_sold,uses_ad_boosts,rating,rating_count,badges_count,product_variation_inventory,shipping_option_price,countries_shipped_to,...,product_color_blue,product_color_green,product_color_grey,product_color_pink,product_color_purple,product_color_red,product_color_white,product_color_yellow,variation_size_label_encoded,has_urgency_banner
0,16.0,14,100,0,3.76,54,0,50,4,34,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
1,8.0,22,20000,1,3.45,6135,0,50,2,41,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1
2,8.0,43,100,0,3.57,14,0,1,3,36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1
3,8.0,8,5000,1,4.03,579,0,50,2,41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,2.72,3,100,1,3.1,20,0,1,1,35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1


In [None]:
# List every column name (the head() preview above elides the middle columns).
data.columns

Index(['price', 'retail_price', 'units_sold', 'uses_ad_boosts', 'rating',
       'rating_count', 'badges_count', 'product_variation_inventory',
       'shipping_option_price', 'countries_shipped_to', 'inventory_total',
       'merchant_rating_count', 'merchant_rating',
       'merchant_has_profile_picture', 'origin_country_CN',
       'shipping_option_Livraison', 'product_color_black',
       'product_color_blue', 'product_color_green', 'product_color_grey',
       'product_color_pink', 'product_color_purple', 'product_color_red',
       'product_color_white', 'product_color_yellow',
       'variation_size_label_encoded', 'has_urgency_banner'],
      dtype='object')

In [None]:
# Check dtypes and non-null counts for every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052 entries, 0 to 2051
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   price                         2052 non-null   float64
 1   retail_price                  2052 non-null   int64  
 2   units_sold                    2052 non-null   int64  
 3   uses_ad_boosts                2052 non-null   int64  
 4   rating                        2052 non-null   float64
 5   rating_count                  2052 non-null   int64  
 6   badges_count                  2052 non-null   int64  
 7   product_variation_inventory   2052 non-null   int64  
 8   shipping_option_price         2052 non-null   int64  
 9   countries_shipped_to          2052 non-null   int64  
 10  inventory_total               2052 non-null   int64  
 11  merchant_rating_count         2052 non-null   int64  
 12  merchant_rating               2052 non-null   float64
 13  mer

In [None]:
# Summary statistics, transposed so that each variable is one row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,2052.0,8.288557,3.654338,1.0,5.89,8.0,11.0,49.0
retail_price,2052.0,23.282651,29.351948,1.0,7.0,10.0,26.0,252.0
units_sold,2052.0,4168.074561,8839.072943,1.0,100.0,1000.0,5000.0,100000.0
uses_ad_boosts,2052.0,0.374756,0.484178,0.0,0.0,0.0,1.0,1.0
rating,2052.0,3.702557,0.712796,0.0,3.510771,3.81,4.07,5.0
rating_count,2052.0,863.618421,1909.948057,0.0,26.0,160.5,829.25,20744.0
badges_count,2052.0,0.071637,0.277965,0.0,0.0,0.0,0.0,2.0
product_variation_inventory,2052.0,32.892788,20.505442,1.0,9.0,50.0,50.0,50.0
shipping_option_price,2052.0,2.233431,0.953627,1.0,2.0,2.0,3.0,12.0
countries_shipped_to,2052.0,40.296296,19.224987,6.0,31.75,39.0,43.0,140.0


# Helper Functions

In [None]:
def _fit_univariate_ols(df, target, predictor):
    """Fit ``target ~ const + predictor`` by OLS and summarise the fit.

    Shared implementation behind ``regress_on_outcome`` and
    ``regress_on_treatment``, which previously duplicated this code.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    target : str
        Name of the dependent-variable column.
    predictor : str
        Name of the single explanatory column.

    Returns
    -------
    list
        ``[slope, p_val, r_squared]`` where ``slope`` and ``p_val`` belong to
        ``predictor``; every value is rounded to 2 decimal places.
    """
    design = sm.add_constant(df[predictor])
    fit = sm.OLS(df[target], design).fit()

    slope = round(fit.params[predictor], 2)
    # Read the p-value by label, exactly like the slope, so both numbers always
    # describe the same term. A positional lookup (``iloc[1]``) silently depends
    # on the constant being present and placed first in the design matrix.
    p_val = round(fit.pvalues[predictor], 2)
    r_squared = round(fit.rsquared, 2)

    return [slope, p_val, r_squared]


def regress_on_outcome(df, outcome, confounder):
    """Regress the outcome column on one candidate confounder.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    outcome : str
        Name of the outcome (dependent variable) column.
    confounder : str
        Name of the candidate confounder (explanatory variable) column.

    Returns
    -------
    list
        ``[slope, p_val, r_squared]``, each rounded to 2 decimal places.
    """
    return _fit_univariate_ols(df, outcome, confounder)


def regress_on_treatment(df, treatment, confounder):
    """Regress the treatment column on one candidate confounder.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    treatment : str
        Name of the treatment (dependent variable) column.
    confounder : str
        Name of the candidate confounder (explanatory variable) column.

    Returns
    -------
    list
        ``[slope, p_val, r_squared]``, each rounded to 2 decimal places.
    """
    return _fit_univariate_ols(df, treatment, confounder)



In [None]:
def _univariate_ols_raw(df, target, predictor):
    """Fit ``target ~ const + predictor`` by OLS; return unrounded statistics.

    Returns a ``(slope, p_value, r_squared)`` tuple where ``slope`` and
    ``p_value`` belong to ``predictor``.
    """
    design = sm.add_constant(df[predictor])
    fit = sm.OLS(df[target], design).fit()
    return fit.params[predictor], fit.pvalues[predictor], fit.rsquared


def univariate_regressions_for_confounder_id(df, outcome, treatment, alpha=0.05):
    """Screen every other column of ``df`` as a potential confounder.

    Each remaining column is used as the single regressor in two separate OLS
    fits, one with ``outcome`` and one with ``treatment`` as the dependent
    variable. A column is flagged when its coefficient is significant in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the outcome, the treatment and all candidate columns.
    outcome : str
        Name of the outcome column.
    treatment : str
        Name of the treatment column.
    alpha : float, default 0.05
        Significance level; a column is flagged when both p-values are
        strictly below it.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the columns ``Variable``,
        ``Slope_with_Outcome``, ``P_Val_with_Outcome``, ``R_Sq_with_Outcome``,
        ``Slope_with_Treatment``, ``P_Val_with_Treatment``,
        ``R_Sq_with_Treatment`` and ``Is_Potential_Confounder`` ("Yes"/"No").
        Numeric values are rounded to 2 decimal places for display.
    """
    possible_confounders_list = [c for c in df.columns.tolist() if c not in [outcome, treatment]]
    print(f"Possible Confounders for Outcome y = {outcome} and Treatment t = {treatment}")
    print(possible_confounders_list)

    result_columns = [
        "Variable",
        "Slope_with_Outcome", "P_Val_with_Outcome", "R_Sq_with_Outcome",
        "Slope_with_Treatment", "P_Val_with_Treatment", "R_Sq_with_Treatment",
        "Is_Potential_Confounder",
    ]

    rows = []
    for current_var in possible_confounders_list:
        out_slope, out_p, out_r2 = _univariate_ols_raw(df, outcome, current_var)
        trt_slope, trt_p, trt_r2 = _univariate_ols_raw(df, treatment, current_var)

        # Test significance on the UNROUNDED p-values. Rounding first turns
        # e.g. p = 0.047 into 0.05, which then fails the `< alpha` check and
        # wrongly drops a significant variable.
        is_confounder = "Yes" if (out_p < alpha and trt_p < alpha) else "No"

        rows.append([
            current_var,
            round(out_slope, 2), round(out_p, 2), round(out_r2, 2),
            round(trt_slope, 2), round(trt_p, 2), round(trt_r2, 2),
            is_confounder,
        ])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=result_columns)



# Univariate Regressions for Confounder Id

In [None]:
# Re-display the column names for reference when choosing outcome/treatment below.
data.columns

Index(['price', 'retail_price', 'units_sold', 'uses_ad_boosts', 'rating',
       'rating_count', 'badges_count', 'product_variation_inventory',
       'shipping_option_price', 'countries_shipped_to', 'inventory_total',
       'merchant_rating_count', 'merchant_rating',
       'merchant_has_profile_picture', 'origin_country_CN',
       'shipping_option_Livraison', 'product_color_black',
       'product_color_blue', 'product_color_green', 'product_color_grey',
       'product_color_pink', 'product_color_purple', 'product_color_red',
       'product_color_white', 'product_color_yellow',
       'variation_size_label_encoded', 'has_urgency_banner'],
      dtype='object')

## Average product rating (`rating`) as Treatment

In [None]:
# Screen all remaining columns as confounders of rating -> units_sold.
# `outcome` and `treatment` are module-level names that the next cell reads as well.
outcome = "units_sold"
treatment = "rating"

res_1 = univariate_regressions_for_confounder_id(data, outcome, treatment)
res_1


Possible Confounders for Outcome y = units_sold and Treatment t = rating
['price', 'retail_price', 'uses_ad_boosts', 'rating_count', 'badges_count', 'product_variation_inventory', 'shipping_option_price', 'countries_shipped_to', 'inventory_total', 'merchant_rating_count', 'merchant_rating', 'merchant_has_profile_picture', 'origin_country_CN', 'shipping_option_Livraison', 'product_color_black', 'product_color_blue', 'product_color_green', 'product_color_grey', 'product_color_pink', 'product_color_purple', 'product_color_red', 'product_color_white', 'product_color_yellow', 'variation_size_label_encoded', 'has_urgency_banner']


Unnamed: 0,Variable,Slope_with_Outcome,P_Val_with_Outcome,R_Sq_with_Outcome,Slope_with_Treatment,P_Val_with_Treatment,R_Sq_with_Treatment,Is_Potential_Confounder
0,price,-16.28,0.76,0.0,0.02,0.0,0.01,No
1,retail_price,8.88,0.18,0.0,0.0,0.27,0.0,No
2,uses_ad_boosts,-388.64,0.34,0.0,-0.05,0.12,0.0,No
3,rating_count,4.23,0.0,0.83,0.0,0.0,0.01,Yes
4,badges_count,1347.85,0.05,0.0,0.41,0.0,0.03,No
5,product_variation_inventory,65.07,0.0,0.02,0.0,0.0,0.01,Yes
6,shipping_option_price,-234.72,0.25,0.0,0.05,0.01,0.0,No
7,countries_shipped_to,-12.61,0.21,0.0,0.0,0.36,0.0,No
8,inventory_total,-0.89,0.99,0.0,-0.01,0.2,0.0,No
9,merchant_rating_count,0.03,0.0,0.05,0.0,0.01,0.0,Yes


In [None]:
# Keep only the flagged rows; relies on `outcome`, `treatment` and `res_1`
# still holding the values set in the preceding cell.
print(f"Confounders Identified for Outcome y = {outcome} and Treatment t = {treatment}")
res_1[res_1["Is_Potential_Confounder"] == "Yes"]

Confounders Identified for Outcome y = units_sold and Treatment t = rating


Unnamed: 0,Variable,Slope_with_Outcome,P_Val_with_Outcome,R_Sq_with_Outcome,Slope_with_Treatment,P_Val_with_Treatment,R_Sq_with_Treatment,Is_Potential_Confounder
3,rating_count,4.23,0.0,0.83,0.0,0.0,0.01,Yes
5,product_variation_inventory,65.07,0.0,0.02,0.0,0.0,0.01,Yes
9,merchant_rating_count,0.03,0.0,0.05,0.0,0.01,0.0,Yes
10,merchant_rating,4866.23,0.0,0.01,0.76,0.0,0.05,Yes
11,merchant_has_profile_picture,3051.07,0.0,0.01,0.1,0.04,0.0,Yes
14,product_color_black,1498.05,0.0,0.0,0.11,0.01,0.0,Yes
22,product_color_yellow,-2928.4,0.0,0.01,-0.2,0.0,0.0,Yes
23,variation_size_label_encoded,-871.64,0.0,0.01,-0.04,0.01,0.0,Yes


## uses_ad_boosts as Treatment

In [None]:
# Repeat the screening with uses_ad_boosts as the treatment and show only the flagged rows.
# Note: this overwrites the module-level `outcome` / `treatment` set by earlier cells.
outcome = "units_sold"
treatment = "uses_ad_boosts"

res_2 = univariate_regressions_for_confounder_id(data, outcome, treatment)
print(f"Confounders Identified for Outcome y = {outcome} and Treatment t = {treatment}")
res_2[res_2["Is_Potential_Confounder"] == "Yes"]


Possible Confounders for Outcome y = units_sold and Treatment t = uses_ad_boosts
['price', 'retail_price', 'rating', 'rating_count', 'badges_count', 'product_variation_inventory', 'shipping_option_price', 'countries_shipped_to', 'inventory_total', 'merchant_rating_count', 'merchant_rating', 'merchant_has_profile_picture', 'origin_country_CN', 'shipping_option_Livraison', 'product_color_black', 'product_color_blue', 'product_color_green', 'product_color_grey', 'product_color_pink', 'product_color_purple', 'product_color_red', 'product_color_white', 'product_color_yellow', 'variation_size_label_encoded', 'has_urgency_banner']
Confounders Identified for Outcome y = units_sold and Treatment t = uses_ad_boosts


Unnamed: 0,Variable,Slope_with_Outcome,P_Val_with_Outcome,R_Sq_with_Outcome,Slope_with_Treatment,P_Val_with_Treatment,R_Sq_with_Treatment,Is_Potential_Confounder
3,rating_count,4.23,0.0,0.83,-0.0,0.03,0.0,Yes
5,product_variation_inventory,65.07,0.0,0.02,-0.0,0.0,0.02,Yes
11,merchant_has_profile_picture,3051.07,0.0,0.01,0.08,0.02,0.0,Yes


## has_urgency_banner as Treatment

In [None]:
# Repeat the screening with has_urgency_banner as the treatment and show only the flagged rows.
# Note: this overwrites the module-level `outcome` / `treatment` set by earlier cells.
outcome = "units_sold"
treatment = "has_urgency_banner"

res_3 = univariate_regressions_for_confounder_id(data, outcome, treatment)
print(f"Confounders Identified for Outcome y = {outcome} and Treatment t = {treatment}")
res_3[res_3["Is_Potential_Confounder"] == "Yes"]


Possible Confounders for Outcome y = units_sold and Treatment t = has_urgency_banner
['price', 'retail_price', 'uses_ad_boosts', 'rating', 'rating_count', 'badges_count', 'product_variation_inventory', 'shipping_option_price', 'countries_shipped_to', 'inventory_total', 'merchant_rating_count', 'merchant_rating', 'merchant_has_profile_picture', 'origin_country_CN', 'shipping_option_Livraison', 'product_color_black', 'product_color_blue', 'product_color_green', 'product_color_grey', 'product_color_pink', 'product_color_purple', 'product_color_red', 'product_color_white', 'product_color_yellow', 'variation_size_label_encoded']
Confounders Identified for Outcome y = units_sold and Treatment t = has_urgency_banner


Unnamed: 0,Variable,Slope_with_Outcome,P_Val_with_Outcome,R_Sq_with_Outcome,Slope_with_Treatment,P_Val_with_Treatment,R_Sq_with_Treatment,Is_Potential_Confounder
12,merchant_has_profile_picture,3051.07,0.0,0.01,-0.18,0.0,0.01,Yes
13,origin_country_CN,3094.19,0.0,0.0,-0.21,0.0,0.01,Yes
15,product_color_black,1498.05,0.0,0.0,0.08,0.01,0.0,Yes
