In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
# Location of the input CSV that the next cell passes to pd.read_csv.
# NOTE(review): this is an absolute path under one user's Documents folder, so
# the notebook will only run on a machine with the same directory layout.
path = "C:/Users/jacob.carey/Documents/consentv2/dataset_revisions/MASTERv7_50.csv"

In [None]:
# Columns to load and the dtype to parse each one as. The column selection is
# derived from this single mapping so the list of columns and the list of dtypes
# cannot drift apart. (The original separate `usecols` list was missing a comma
# after 'Unnamed: 0', which silently fused it with 'ClearLanguage' into the
# non-existent column name 'Unnamed: 0ClearLanguage'.)
column_dtypes = {
    'Unnamed: 0': np.int32,
    'ClearLanguage': np.int8,
    'ConsentBool': np.int8,
    'ConsentDetailCategories': np.int8,
    'ConsentDetailVendors': np.int8,
    'ConsentTypeId': np.int8,
    'CountryId': np.int8,
    #'Date': np.object,
    'GDPREnabled': np.int8,
    'IABEnabled': np.int8,
    'Industry': np.int8,
    'L2Enabled': np.int8,
    'OnBrand': np.int8,
    'PrivacyAccessTypeId': np.int8,
}

# Read only the listed columns; read_csv ignores the order of `usecols`, so the
# resulting frame keeps the column order of the file.
df = pd.read_csv(path, usecols=list(column_dtypes), dtype=column_dtypes)

In [None]:
# Column roles used by the cells below.
id_col = 'Unnamed: 0'        # column that gets counted within each group
outcome_col = 'ConsentBool'  # column whose True rows are the "positives"

# Columns profiled one at a time. 'ConsentBool' (the outcome column) and 'Date'
# are left out of this list.
feature_cols = [
    'ClearLanguage', 'ConsentDetailCategories', 'ConsentDetailVendors',
    'ConsentTypeId', 'CountryId',
    'GDPREnabled', 'IABEnabled', 'Industry', 'L2Enabled', 'OnBrand',
    'PrivacyAccessTypeId',
]

In [None]:
# Univariate lift: for each feature, take its most frequent values and compare
# the share of positive outcomes within each value to the overall share.

# How many of the most frequent values to keep per feature.
TOP_N_VALUES = 20

# Mask of positive-outcome rows, computed once and reused for every feature.
# (`== 1` matches exactly the rows that `== True` matches, since True == 1.)
is_positive = df[outcome_col] == 1

# global counts
total_count = len(df.index)
total_positives = int(is_positive.sum())

# One dict per (feature, value) pair; turned into a DataFrame in a single step.
count_list = []

# Iterate over each feature to collect individual counts per value
for feature in feature_cols:
    # Rows per value, restricted to the TOP_N_VALUES most frequent values.
    counts = (
        df.groupby(feature)[id_col].count().nlargest(TOP_N_VALUES).rename('count')
    )
    # Positive rows per value (all values; the join below keeps only the top ones).
    positives = (
        df[is_positive].groupby(feature)[id_col].count().rename('positives')
    )
    # Left join so that a frequent value with *no* positive rows is kept with
    # positives == 0. The previous inner merge silently dropped such values,
    # hiding exactly the values with the lowest consent rate.
    merge_df = counts.to_frame().join(positives, how='left')
    merge_df['positives'] = merge_df['positives'].fillna(0).astype('int64')

    for value, n_rows, n_positive in zip(
        merge_df.index, merge_df['count'], merge_df['positives']
    ):
        count_list.append({
            'feature': feature,
            'value': str(value),
            'total_count': total_count,
            'total_positives': total_positives,
            'count': n_rows,
            'positives': n_positive,
        })

# Create a new dataframe from the aggregated list. Passing `columns` fixes the
# column order and keeps the columns present even if no rows were collected.
lift_df = pd.DataFrame(
    count_list,
    columns=[
        'feature', 'value', 'total_count', 'total_positives', 'count', 'positives',
    ],
)

# Now let's add some calculations for probabilities and lift per row
lift_df['total_prob'] = lift_df['total_positives'] / lift_df['total_count']
lift_df['prob'] = lift_df['positives'] / lift_df['count']
lift_df['lift - more likely'] = lift_df['prob'] / lift_df['total_prob']
# A value with zero positives has lift 0, so its reciprocal here is inf.
lift_df['lift - less likely'] = 1 / lift_df['lift - more likely']
lift_df['prct_total'] = lift_df['count'] / lift_df['total_count']

In [None]:
# Write the lift table to a CSV whose name carries a run timestamp.
# The stamp is month-day-hourminute (no year, no seconds), so two runs within
# the same minute write to the same file name.
date_time = datetime.today().strftime('%m-%d-%H%M')
# The original literal opened with a double quote and closed with a single
# quote, which is a SyntaxError; both delimiters are now double quotes.
output_filepath = "C:/Users/jacob.carey/Documents/consentv2/univariate/consentv2_univariate_{0}.csv".format(date_time)
lift_df.to_csv(output_filepath, index=False)