In [1]:
import pandas as pd
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

cred = credentials.Certificate("cert.json")
firebase_admin.initialize_app(cred)

LOCATION_PROCESSED = 'data_processed'
LOCATION_SURVEY = f'{LOCATION_PROCESSED}/survey_responses.csv'
LOCATION_QUESTIONS = f'{LOCATION_PROCESSED}/questions.json'

In [2]:
# load firestore data
db = firestore.client()

In [3]:
# load all documents from the collection
docs = db.collection("responses").stream()
docs = list(docs)
print("n responses: ", len(docs))

# create empty dataframe
df = pd.DataFrame()

for doc in docs:
    data = doc.to_dict()["responses"]
    data = dict(data)
    print(data)
    
    # get key value pairs
    kv_pairs = []
    for key, value in data.items():
        kv_pairs.append([key, value])
    
    print(kv_pairs)

    # convert keys to ints
    kv_pairs = [[int(x[0]), x[1]] for x in kv_pairs]
    print(kv_pairs)

    # set data on dataframe
    for kv in kv_pairs:
        df.at[doc.id, kv[0]] = kv[1]

# order columns
df = df.reindex(sorted(df.columns), axis=1)

# remove index column
df.reset_index(drop=True, inplace=True)

df.head()


n responses:  221
{'37': '2', '31': '4', '46': 'Yes', '5': 'Finance / Accounting', '49': '3', '59': '4', '35': '3', '50': 'Yes', '34': 'No', '42': 'No', '7': 'Strongly disagree', '9': 'Neither agree nor disagree', '55': '3', '22': 'Yes', '2': 'Male', '57': '1', '8': 'Agree', '3': 'Denmark', '30': 'Yes', '51': '3', '56': 'No', '24': 'Yes', '52': 'No', '29': '3', '26': 'No', '10': 'Food and Beverages', '32': 'Yes', '6': 'Once every three months', '61': '3', '25': '5', '48': 'Maybe', '28': 'Yes', '54': 'Maybe', '60': 'No', '23': '4', '33': '4', '11': 'Agree', '38': 'Yes', '41': '5', '44': 'No', '58': 'Yes', '43': '4', '1': '18-24', '39': '3', '36': 'Yes', '21': '5', '4': 'Undergraduate', '20': 'Yes', '47': '4', '45': '4', '40': 'No', '53': '2', '27': '2'}
[['37', '2'], ['31', '4'], ['46', 'Yes'], ['5', 'Finance / Accounting'], ['49', '3'], ['59', '4'], ['35', '3'], ['50', 'Yes'], ['34', 'No'], ['42', 'No'], ['7', 'Strongly disagree'], ['9', 'Neither agree nor disagree'], ['55', '3'], ['22

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,52,53,54,55,56,57,58,59,60,61
0,18-24,Male,Denmark,Undergraduate,Finance / Accounting,Once every three months,Strongly disagree,Agree,Neither agree nor disagree,Food and Beverages,...,No,2,Maybe,3,No,1,Yes,4,No,3
1,25-34,Female,United States,Part-time Employee,Other,Once every three months,Agree,Strongly agree,Agree,Toys and Games|Books and Media,...,Maybe,3,Yes,5,Yes,1,Yes,1,No,3
2,25-34,Male,Denmark,Unemployed,Technology / IT,Rarely / Never,Strongly disagree,Strongly disagree,Strongly disagree,,...,Yes,5,Maybe,3,Yes,5,No,3,No,3
3,25-34,Male,Denmark,Graduate / Postgraduate,Technology / IT,Once a month,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Electronics and Gadgets|Toys and Games|Health ...,...,No,3,Maybe,3,Maybe,2,Maybe,2,No,3
4,25-34,Male,Finland,Undergraduate,Finance / Accounting,Once every three months,Disagree,Agree,Agree,Electronics and Gadgets|Clothing and Accessories,...,Yes,4,No,3,Yes,4,Maybe,3,No,3


In [4]:
# load questions
df_questions = pd.read_json(LOCATION_QUESTIONS)
print("n questions: ", len(df_questions))

# convert id to int
df_questions["id"] = df_questions["id"].astype(int)

# use id as index
df_questions = df_questions.set_index("id")

# conver required from 1.0/NaN to bool
df_questions["required"] = df_questions["required"].map(lambda x: x == 1.0)

# count required questions
n_required = df_questions["required"].sum()
print("n required questions: ", n_required)

df_questions.tail()


n questions:  54
n required questions:  31


Unnamed: 0_level_0,title,type,options,required,placeholder,subtitle,shuffle,strategy,taxonomicalParentIndex,shortStrategy,example
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
47,How effective do you perceive this strategy to...,RangeInput,"[1, 2, 3, 4, 5]",False,,Rate on scale from 1 (Not effective) to 5 (Ver...,,"Create a budget, and limit spending on specifi...",0.0,Create Budget,"Before shopping, you put down a budget and lim..."
22,Have you tried this strategy before?,RadioInput,"[Yes, No, Maybe]",True,,,,"After committing to purchasing a product, enfo...",4.0,Enforce Wait Time,"In the midst of buying the product, you either..."
23,How effective do you perceive this strategy to...,RangeInput,"[1, 2, 3, 4, 5]",False,,Rate on scale from 1 (Not effective) to 5 (Ver...,,"After committing to purchasing a product, enfo...",4.0,Enforce Wait Time,"In the midst of buying the product, you either..."
52,Have you tried this strategy before?,RadioInput,"[Yes, No, Maybe]",True,,,,Stop yourself in midst of shopping to find alt...,1.0,Do Alternative Activity,"After browsing a webshop/app for some time, yo..."
53,How effective do you perceive this strategy to...,RangeInput,"[1, 2, 3, 4, 5]",False,,Rate on scale from 1 (Not effective) to 5 (Ver...,,Stop yourself in midst of shopping to find alt...,1.0,Do Alternative Activity,"After browsing a webshop/app for some time, yo..."


In [5]:
required_question_ids = list(df_questions[df_questions["required"]].index)
required_question_ids = sorted(required_question_ids)
print("required question ids: ", required_question_ids)

required question ids:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60]


In [6]:
# filter out responses that do not have all required questions
df_filtered = df.dropna(subset=required_question_ids, how="any")
print("n responses after filtering out partial responses: ", len(df_filtered))
print(f"dropped {len(df) - len(df_filtered)} partial responses")

n responses after filtering out partial responses:  215
dropped 6 partial responses


In [7]:
# throw out all "Under 18" responses
df_filtered = df_filtered[df_filtered[1] != "Under 18"]
print("n responses after filtering out under 18: ", len(df_filtered))
df_filtered.head()

n responses after filtering out under 18:  204


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,52,53,54,55,56,57,58,59,60,61
0,18-24,Male,Denmark,Undergraduate,Finance / Accounting,Once every three months,Strongly disagree,Agree,Neither agree nor disagree,Food and Beverages,...,No,2,Maybe,3,No,1,Yes,4,No,3
1,25-34,Female,United States,Part-time Employee,Other,Once every three months,Agree,Strongly agree,Agree,Toys and Games|Books and Media,...,Maybe,3,Yes,5,Yes,1,Yes,1,No,3
2,25-34,Male,Denmark,Unemployed,Technology / IT,Rarely / Never,Strongly disagree,Strongly disagree,Strongly disagree,,...,Yes,5,Maybe,3,Yes,5,No,3,No,3
3,25-34,Male,Denmark,Graduate / Postgraduate,Technology / IT,Once a month,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Electronics and Gadgets|Toys and Games|Health ...,...,No,3,Maybe,3,Maybe,2,Maybe,2,No,3
4,25-34,Male,Finland,Undergraduate,Finance / Accounting,Once every three months,Disagree,Agree,Agree,Electronics and Gadgets|Clothing and Accessories,...,Yes,4,No,3,Yes,4,Maybe,3,No,3


In [8]:
# save to csv
df_filtered.to_csv(LOCATION_SURVEY)