In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pandas.core.dtypes.common import is_numeric_dtype

# Load the raw project data, index it by project id and discard the 'drives' column.
data = (
    pd.read_csv("data.csv")
    .set_index('projectID')
    .drop(columns=['drives'])
)
data

Unnamed: 0_level_0,success,amount,female_creator,goal,words,backers_count,serial_entrepreneur,duration,category_art,category_comics,...,year_2020,year_2021,year_2022,media,sustainability,story,url,business,empathic,social
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3731940,0,203.0,0,6000.0,71,5,0,60.000000,0,0,...,0,0,0,0,0,app go benefit trainer personnel look get fit ...,https://www.kickstarter.com/projects/fitnesspo...,1.408,0.000,8.451
3451605,1,116639.0,0,10000.0,497,2003,1,14.583148,0,0,...,0,0,0,1,0,even miss kickstarter still get hand papillon ...,https://www.kickstarter.com/projects/kolossalg...,3.414,2.610,9.839
3722583,0,28.0,0,6000.0,443,6,0,30.000000,0,1,...,0,0,0,1,0,journey blak blak blak officially entitle blak...,https://www.kickstarter.com/projects/myheart/b...,0.903,0.677,13.318
3745862,1,1846.0,1,1500.0,474,48,0,39.687488,0,0,...,0,0,0,1,0,isabel jazz folk musical teach empathy metoo s...,https://www.kickstarter.com/projects/annawestb...,1.477,1.899,17.511
3735376,1,1001.0,1,1000.0,130,17,0,25.989560,0,0,...,0,0,0,0,0,main stage kids head new york young performer ...,https://www.kickstarter.com/projects/mainstage...,0.000,0.769,16.154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378823,1,62669.0,0,28000.0,837,232,1,30.000000,0,0,...,0,0,1,1,0,book maui mauka makai companion book maui coas...,https://www.kickstarter.com/projects/danielsul...,0.239,1.912,6.810
4394019,1,11083.0,0,10000.0,167,80,0,26.377384,0,0,...,0,0,1,1,0,myrcene ale co dream mike goergen cy higgins m...,https://www.kickstarter.com/projects/myrceneal...,1.198,3.593,9.581
4385597,1,25265.0,0,25000.0,138,73,0,14.147211,0,0,...,0,0,1,1,0,firefly theatrical need help build theatre spa...,https://www.kickstarter.com/projects/fireflyth...,2.899,0.725,7.971
4367598,1,4225.0,1,4000.0,304,94,0,36.537940,0,0,...,0,0,1,1,0,big feelings coloring book come idea recovery ...,https://www.kickstarter.com/projects/bigfeelin...,1.645,4.276,14.803


In [2]:
# Normalize data
# Box-Cox lambda values
# amount        0.0915075719918227
# goal          0.02059050704045948
# words         0.19404918878650212
# backers_count 0.010289890837174583
# duration      0.4346871019563134
# business      0.039097024902746386
# empathic      0.05775409842214908
# social        0.23771984812837177
# drives        0.22950948679659838
# NOTE(review): presumably these lambdas motivate the choice of log10 for the
# variables below and a square root for duration -- confirm with the author.

logVariables = ['amount', 'goal', 'words', 'backers_count', 'business', 'empathic', 'social'] # + drives

normData = data.copy()

for variable in logVariables:
    # The two smallest distinct values of the raw column.
    minimum, secondMinimum = data[variable].drop_duplicates().nsmallest(2)

    shifted = normData[variable]
    if minimum == 0:
        # log10(0) is undefined: shift the column by half of its smallest non-zero value.
        shifted = shifted + 0.5*secondMinimum
    normData[variable] = np.log10(shifted)

normData['duration'] = np.sqrt(normData['duration'])

# Prefix every transformed column with "norm_" (column order is unchanged).
normData = normData.rename(columns={name: "norm_" + name for name in logVariables + ['duration']})

# normData.to_csv('normalisedData.csv')
normData

Unnamed: 0_level_0,success,norm_amount,female_creator,norm_goal,norm_words,norm_backers_count,serial_entrepreneur,norm_duration,category_art,category_comics,...,year_2020,year_2021,year_2022,media,sustainability,story,url,norm_business,norm_empathic,norm_social
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3731940,0,2.307496,0,3.778151,1.851258,0.698970,0,7.745967,0,0,...,0,0,0,0,0,app go benefit trainer personnel look get fit ...,https://www.kickstarter.com/projects/fitnesspo...,0.165393,-1.207608,0.938495
3451605,1,5.066844,0,4.000000,2.696356,3.301681,1,3.818789,0,0,...,0,0,0,1,0,even miss kickstarter still get hand papillon ...,https://www.kickstarter.com/projects/kolossalg...,0.540267,0.426836,1.002922
3722583,0,1.447158,0,3.778151,2.646404,0.778151,0,5.477226,0,1,...,0,0,0,1,0,journey blak blak blak officially entitle blak...,https://www.kickstarter.com/projects/myheart/b...,-0.018408,-0.131356,1.131827
3745862,1,3.266232,1,3.176091,2.675778,1.681241,0,6.299801,0,0,...,0,0,0,1,0,isabel jazz folk musical teach empathy metoo s...,https://www.kickstarter.com/projects/annawestb...,0.185400,0.292478,1.248941
3735376,1,3.000434,1,3.000000,2.113943,1.230449,0,5.097996,0,0,...,0,0,0,0,0,main stage kids head new york young performer ...,https://www.kickstarter.com/projects/mainstage...,-1.255707,-0.080399,1.214380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378823,1,4.797053,0,4.447158,2.922725,2.365488,1,5.477226,0,0,...,0,0,1,1,0,book maui mauka makai companion book maui coas...,https://www.kickstarter.com/projects/danielsul...,-0.530915,0.295347,0.847480
4394019,1,4.044657,0,4.000000,2.222716,1.903090,0,5.135892,0,0,...,0,0,1,1,0,myrcene ale co dream mike goergen cy higgins m...,https://www.kickstarter.com/projects/myrceneal...,0.098124,0.562887,0.991647
4385597,1,4.402519,0,4.397940,2.139879,1.863323,0,3.761278,0,0,...,0,0,1,1,0,firefly theatrical need help build theatre spa...,https://www.kickstarter.com/projects/fireflyth...,0.470484,-0.104025,0.913787
4367598,1,3.625827,1,3.602060,2.482874,1.973128,0,6.044662,0,0,...,0,0,1,1,0,big feelings coloring book come idea recovery ...,https://www.kickstarter.com/projects/bigfeelin...,0.230577,0.637290,1.177002


In [3]:
# Number of decimal places used when formatting the numbers written to the tables.
decimals = 3

In [4]:
# Table 1
# Split the raw data by creator gender: female_creator == 0 -> mData, == 1 -> fData.
mData = data.loc[data['female_creator'].eq(0)]
fData = data.loc[data['female_creator'].eq(1)]

# Dummy columns that flag the category of a project.
categories = [
    'category_art',
    'category_comics',
    'category_crafts',
    'category_dance',
    'category_design',
    'category_fashion',
    'category_film&video',
    'category_food',
    'category_games',
    'category_journalism',
    'category_music',
    'category_photography',
    'category_publishing',
    'category_technology',
    'category_theater',
]


def cleanPValue(pValue):
    """Format a p-value for display in a table.

    Returns the string "< 0.001" for values below 0.001; every other value is
    rendered in fixed-point notation with `decimals` (the notebook-level
    setting) digits after the decimal point.
    """
    if not pValue < 0.001:
        return f"{pValue:.{decimals}f}"
    return "< 0.001"


# Success rate of female vs. male creators within each category, compared with a
# two-sample t-test (two-sided, and one-sided with H1: female mean > male mean).
numberFormat = '{:.%if}' % decimals  # built once instead of on every append

mMeans = []
fMeans = []
tValues = []
pValuesTwo = []
pValuesOne = []
catNames = []
for category in categories:
    mCatData = mData[mData[category] == 1]
    fCatData = fData[fData[category] == 1]
    mSuccess = mCatData['success']
    fSuccess = fCatData['success']

    mMeans.append(numberFormat.format(np.mean(mSuccess)))
    fMeans.append(numberFormat.format(np.mean(fSuccess)))

    # Equal variance check: the variances count as equal when the larger one is
    # less than 4 times the smaller one. Written as a product rather than the
    # ratio bigger / smaller so that a group with zero variance no longer causes
    # a division by zero; the decision is the same (Welch's test is used).
    mVar = np.var(mSuccess)
    fVar = np.var(fSuccess)
    bigger = np.maximum(mVar, fVar)
    smaller = np.minimum(mVar, fVar)
    equalVariance = bool(bigger < 4 * smaller)

    # Perform two-sample t-test (female sample first, so t > 0 means females score higher)
    resultTwo = st.ttest_ind(a=fSuccess, b=mSuccess, equal_var=equalVariance)
    resultOne = st.ttest_ind(a=fSuccess, b=mSuccess, equal_var=equalVariance, alternative="greater")
    tValues.append(numberFormat.format(resultTwo[0]))

    pValuesTwo.append(cleanPValue(resultTwo[1]))
    pValuesOne.append(cleanPValue(resultOne[1]))

    # 'category_art' -> 'art'
    catNames.append(category.split('_')[1])

table1 = {"Category": catNames, "Female (mean)": fMeans, "Male (mean)": mMeans, "t-Value": tValues, "p-Value (two-sided)": pValuesTwo, "p-Value (one-sided)": pValuesOne}
table1 = pd.DataFrame.from_dict(table1)
table1 = table1.set_index('Category')
table1.to_csv('Tables/success rate per category.csv')
table1

Unnamed: 0_level_0,Female (mean),Male (mean),t-Value,p-Value (two-sided),p-Value (one-sided)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
art,0.865,0.809,2.389,0.017,0.009
comics,0.884,0.934,-2.131,0.033,0.983
crafts,0.512,0.565,-0.661,0.509,0.745
dance,0.846,0.75,0.417,0.683,0.341
design,0.905,0.945,-1.804,0.072,0.964
fashion,0.809,0.578,5.433,< 0.001,< 0.001
film&video,0.623,0.542,1.959,0.051,0.025
food,0.422,0.482,-1.336,0.182,0.909
games,0.9,0.926,-1.3,0.194,0.903
journalism,0.226,0.415,-1.693,0.095,0.953


In [5]:
# Table 2 (disabled: the male/female comparison of success is now reported as the 'success' row of Table 7)
# mVar = np.var(mData['success'])
# fVar = np.var(fData['success'])
#
# bigger = np.maximum(mVar, fVar)
# smaller = np.minimum(mVar, fVar)
#
# equalVariance = False
# if bigger / smaller < 4:
#     equalVariance = True
#
# result = st.ttest_ind(a=mData['success'], b=fData['success'], equal_var=equalVariance)
# if result[1] < 0.001:
#     result[1] = "< 0.001"
#
# table2 = {'Variable': ['success'], 'Male (mean)': [np.mean(mData['success'])], 'Female (mean)': [np.mean(fData['success'])], 't-Value': [result[0]], 'p-Value': [result[1]]}
# table2 = pd.DataFrame.from_dict(table2)
# table2 = table2.set_index('Variable')
# table2.to_csv('Tables/gender breakdown.csv')
# table2

In [6]:
# Table 3
# Descriptive statistics of every numeric column of the raw (untransformed) data.
numberFormat = '{:.%if}' % decimals

# Names of the numeric columns, in column order; later cells reuse this list.
numVars = [column for column in data.columns.tolist() if is_numeric_dtype(data[column])]

means = [numberFormat.format(np.mean(data[column])) for column in numVars]
medians = [numberFormat.format(np.median(data[column])) for column in numVars]
minima = [numberFormat.format(np.min(data[column])) for column in numVars]
maxima = [numberFormat.format(np.max(data[column])) for column in numVars]
SDs = [numberFormat.format(np.std(data[column])) for column in numVars]

table3 = pd.DataFrame.from_dict({
    "Variable": numVars,
    "Mean": means,
    "Median": medians,
    "Minimum": minima,
    "Maximum": maxima,
    "SD": SDs,
}).set_index("Variable")
table3.to_csv('Tables/descriptive statistics.csv')
table3

Unnamed: 0_level_0,Mean,Median,Minimum,Maximum,SD
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
success,0.763,1.0,0.0,1.0,0.425
amount,66149.066,3838.0,1.0,272707148.204,2984093.96
female_creator,0.327,0.0,0.0,1.0,0.469
goal,19854.147,4000.0,1.0,14000000.0,188831.801
words,408.228,305.0,2.0,2588.0,362.641
backers_count,345.802,65.0,1.0,55024.0,1595.292
serial_entrepreneur,0.252,0.0,0.0,1.0,0.434
duration,32.19,30.0,1.0,73.958,12.615
category_art,0.121,0.0,0.0,1.0,0.327
category_comics,0.112,0.0,0.0,1.0,0.315


In [7]:
# Table 4
# Compare successful and failed projects on every numeric variable (normalised
# where a normalised version exists): group means with a two-sample t-test and,
# for the variables in moodsVars, group medians with Mood's median test.
succData = normData[normData['success'] == 1]
failData = normData[normData['success'] == 0]

numberFormat = '{:.%if}' % decimals  # built once instead of on every append

succMeans = []
failMeans = []
succMedians = []
failMedians = []

meanTValues = []
meanPValuesTwo = []
meanPValuesOne = []

moodsVars = ['norm_amount', 'norm_goal', 'norm_words', 'norm_backers_count', 'norm_duration', 'norm_business', 'norm_empathic', 'norm_social'] # + norm_drives
medianTValues = []
medianPValues = []

# Switch numVars to the column names of normData: transformed columns carry a
# "norm_" prefix there. Names that are already prefixed are left untouched, so
# re-running this cell does not change the list again.
numVars_ = []
for var in numVars:
    if var in ['amount', 'goal', 'words', 'backers_count', 'business', 'empathic', 'social', 'duration']:
        var = "norm_" + var
    numVars_.append(var)
numVars = numVars_[:]

# The first entry of numVars is skipped here.
# NOTE(review): presumably that entry is 'success', the grouping variable -- confirm.
for variable in numVars[1:]:
    varDataSucc = succData[variable]
    varDataFail = failData[variable]

    # Compute means and medians
    succMeans.append(numberFormat.format(np.mean(varDataSucc)))
    failMeans.append(numberFormat.format(np.mean(varDataFail)))
    succMedians.append(numberFormat.format(np.median(varDataSucc)))
    failMedians.append(numberFormat.format(np.median(varDataFail)))

    # Check variance ratio: the variances count as equal when the larger one is
    # less than 4 times the smaller one. Written as a product rather than
    # bigger / smaller so that a group with zero variance (e.g. a dummy that is
    # constant within one group) no longer causes a division by zero; the
    # decision is the same (Welch's test is used).
    succVar = np.var(varDataSucc)
    failVar = np.var(varDataFail)
    bigger = np.maximum(succVar, failVar)
    smaller = np.minimum(succVar, failVar)
    equalVariance = bool(bigger < 4 * smaller)

    # Perform two sample t-test (one-sided H1: success mean > failure mean)
    resultTwo = st.ttest_ind(a=varDataSucc, b=varDataFail, equal_var=equalVariance)
    resultOne = st.ttest_ind(a=varDataSucc, b=varDataFail, equal_var=equalVariance, alternative='greater')
    meanTValues.append(numberFormat.format(resultTwo[0]))

    meanPValuesTwo.append(cleanPValue(resultTwo[1]))
    meanPValuesOne.append(cleanPValue(resultOne[1]))

    # Perform Mood's median test (only for the variables listed in moodsVars)
    if variable in moodsVars:
        result = st.median_test(varDataSucc, varDataFail)
        medianTValues.append(numberFormat.format(result[0]))
        medianPValues.append(cleanPValue(result[1]))
    else:
        medianTValues.append('N/A')
        medianPValues.append('N/A')

table4 = {'Variable': numVars[1:], 'Success (mean)': succMeans, 'Failure (mean)': failMeans, 't-Value': meanTValues, 'p-Value (two-sided)': meanPValuesTwo, 'p-Value (one-sided)': meanPValuesOne}
table4 = pd.DataFrame.from_dict(table4)
table4 = table4.set_index('Variable')
table4.to_csv('Tables/mean per variable (success).csv')
table4

Unnamed: 0_level_0,Success (mean),Failure (mean),t-Value,p-Value (two-sided),p-Value (one-sided)
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
norm_amount,3.835,2.064,78.91,< 0.001,< 0.001
female_creator,0.328,0.324,0.33,0.741,0.371
norm_goal,3.433,3.998,-31.108,< 0.001,1.000
norm_words,2.504,2.274,22.815,< 0.001,< 0.001
norm_backers_count,2.071,0.783,82.97,< 0.001,< 0.001
serial_entrepreneur,0.324,0.019,46.108,< 0.001,< 0.001
norm_duration,5.359,6.234,-32.865,< 0.001,1.000
category_art,0.134,0.08,6.466,< 0.001,< 0.001
category_comics,0.136,0.035,12.578,< 0.001,< 0.001
category_crafts,0.013,0.036,-6.76,< 0.001,1.000


In [8]:
# Table 4 (continued)
# Median half of Table 4, built from the lists filled while computing the mean half.
# Note: this rebinds table4, replacing the mean table.
table4 = pd.DataFrame.from_dict({
    'Variable': numVars[1:],
    'Success (median)': succMedians,
    'Failure (median)': failMedians,
    'χ²-Value': medianTValues,
    'p-Value': medianPValues,
}).set_index('Variable')
table4.to_csv('Tables/median per variable (success).csv')
table4

Unnamed: 0_level_0,Success (median),Failure (median),χ²-Value,p-Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
norm_amount,3.787,2.233,1737.998,< 0.001
female_creator,0.0,0.0,,
norm_goal,3.477,4.0,490.965,< 0.001
norm_words,2.538,2.291,373.029,< 0.001
norm_backers_count,2.017,0.699,2105.926,< 0.001
serial_entrepreneur,0.0,0.0,,
norm_duration,5.477,5.571,343.548,< 0.001
category_art,0.0,0.0,,
category_comics,0.0,0.0,,
category_crafts,0.0,0.0,,


In [9]:
# Table 5
# Ten example entries for each word category, one column per category.
table5 = pd.DataFrame.from_dict({
    'money': [
        'cash', 'fines', 'invest', 'irs', 'market',
        'profit', 'rich', 'spend', 'usd', 'worth',
    ],
    'emotion': [
        'admire', 'clinical depression', 'envies', 'furious', 'heartbreaking',
        'misery', 'not a good idea', 'rage', 'tantrum', "you'd love",
    ],
    'social': [
        'admittedly', 'consoling', 'ex-wife', 'ladies man', 'mob',
        'papa', 'stalking', 'transgender', 'village', 'white guy',
    ],
    # 'drives': ['abuser', 'chat', 'dear', 'denies', 'excelling', 'law', 'mamacita', 'perfection', 'sergeant', 'sultan']
})
table5.to_csv('Tables/category examples.csv', index=False)
table5

Unnamed: 0,money,emotion,social
0,cash,admire,admittedly
1,fines,clinical depression,consoling
2,invest,envies,ex-wife
3,irs,furious,ladies man
4,market,heartbreaking,mob
5,profit,misery,papa
6,rich,not a good idea,stalking
7,spend,rage,transgender
8,usd,tantrum,village
9,worth,you'd love,white guy


In [10]:
# Table 7
# Compare female and male creators on every numeric variable except
# female_creator itself, using the normalised data: group means plus a
# two-sample t-test (two-sided, and one-sided with H1: female mean > male mean).
mNormData = normData[normData['female_creator'] == 0]
fNormData = normData[normData['female_creator'] == 1]

numberFormat = '{:.%if}' % decimals  # built once instead of on every append

mMeans = []
fMeans = []
tValues = []
pValuesTwo = []
pValuesOne = []

numVars_ = numVars[:]
numVars_.remove('female_creator')
for variable in numVars_:
    mVarData = mNormData[variable]
    fVarData = fNormData[variable]

    # Calculate the means
    mMeans.append(numberFormat.format(np.mean(mVarData)))
    fMeans.append(numberFormat.format(np.mean(fVarData)))

    # Check for equal variance: the variances count as equal when the larger one
    # is less than 4 times the smaller one. Written as a product rather than
    # bigger / smaller so that a group with zero variance no longer causes a
    # division by zero; the decision is the same (Welch's test is used).
    mVar = np.var(mVarData)
    fVar = np.var(fVarData)
    bigger = np.maximum(mVar, fVar)
    smaller = np.minimum(mVar, fVar)
    equalVariance = bool(bigger < 4 * smaller)

    # Perform two-sample t-test (female sample first, so t > 0 means females score higher)
    resultTwo = st.ttest_ind(a=fVarData, b=mVarData, equal_var=equalVariance)
    resultOne = st.ttest_ind(a=fVarData, b=mVarData, equal_var=equalVariance, alternative="greater")
    tValues.append(numberFormat.format(resultTwo[0]))

    pValuesTwo.append(cleanPValue(resultTwo[1]))
    pValuesOne.append(cleanPValue(resultOne[1]))

table7 = {"Variable": numVars_, "Female (mean)": fMeans, "Male (mean)": mMeans, "t-Value": tValues, "p-Value (two-sided)": pValuesTwo, "p-Value (one-sided)": pValuesOne}
table7 = pd.DataFrame.from_dict(table7)
table7 = table7.set_index('Variable')
table7.to_csv('Tables/mean per variable (female_creator).csv')
table7

Unnamed: 0_level_0,Female (mean),Male (mean),t-Value,p-Value (two-sided),p-Value (one-sided)
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
success,0.766,0.762,0.33,0.741,0.371
norm_amount,3.333,3.457,-4.637,< 0.001,1.000
norm_goal,3.469,3.614,-8.387,< 0.001,1.000
norm_words,2.421,2.464,-4.58,< 0.001,1.000
norm_backers_count,1.679,1.809,-6.889,< 0.001,1.000
serial_entrepreneur,0.199,0.278,-7.851,< 0.001,1.000
norm_duration,5.556,5.57,-0.541,0.588,0.706
category_art,0.234,0.067,22.725,< 0.001,< 0.001
category_comics,0.054,0.14,-11.902,< 0.001,1.000
category_crafts,0.031,0.012,6.112,< 0.001,< 0.001


In [11]:
# Determine which numerical variables are linearly related to amount
amountData = normData['norm_amount']

# Pearson correlation of the normalised amount with every numeric variable.
pearsonResults = [st.pearsonr(amountData, normData[variable]) for variable in numVars]

rValues = [rValue for rValue, pValue in pearsonResults]
pValues = [pValue for rValue, pValue in pearsonResults]
# A variable is flagged as linear when the correlation is significant at the
# 5% level and its absolute value is at least 0.4.
linears = [pValue < 0.05 and abs(rValue) >= 0.4 for rValue, pValue in pearsonResults]

correlations = pd.DataFrame({
    'Variable': numVars,
    'r-Value': rValues,
    'p-Value': pValues,
    'Linear': linears,
}).set_index('Variable')
correlations

Unnamed: 0_level_0,r-Value,p-Value,Linear
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
success,0.65278,0.0,True
norm_amount,1.0,0.0,True
female_creator,-0.050573,3.585913e-06,False
norm_goal,0.187455,3.35655e-67,False
norm_words,0.406203,0.0,True
norm_backers_count,0.910818,0.0,True
serial_entrepreneur,0.263426,3.596709e-133,False
norm_duration,-0.187853,1.750429e-67,False
category_art,-0.055437,3.765543e-07,False
category_comics,0.057758,1.200299e-07,False


In [12]:
# Establish a baseline model
# Start from every numeric variable, then drop the outcome-related variables,
# the text variables and all category/month/year dummies.
independentVariables = list(numVars)
variablesToRemove = ['success', 'female_creator', 'norm_amount', 'norm_duration', 'norm_business', 'norm_empathic', 'norm_social']

for variable in independentVariables:
    isDummy = any(term in variable for term in ('category_', 'month_', 'year_'))
    if isDummy and variable not in variablesToRemove:
        variablesToRemove.append(variable)

for variable in variablesToRemove:
    independentVariables.remove(variable)

# Three of the dummies are added back as controls.
independentVariables.extend(['category_comics', 'category_fashion', 'year_2021'])

y = normData['norm_amount']
X = sm.add_constant(normData[independentVariables])
baseModel = sm.OLS(endog=y, exog=X).fit()

baseModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,5112.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:30,Log-Likelihood:,-5251.4
No. Observations:,8388,AIC:,10520.0
Df Residuals:,8378,BIC:,10590.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5010,0.037,13.415,0.000,0.428,0.574
norm_goal,0.1133,0.007,16.110,0.000,0.100,0.127
norm_words,0.0416,0.014,2.969,0.003,0.014,0.069
norm_backers_count,1.2409,0.008,164.022,0.000,1.226,1.256
serial_entrepreneur,-0.1105,0.013,-8.785,0.000,-0.135,-0.086
media,0.2691,0.017,15.426,0.000,0.235,0.303
sustainability,0.1267,0.026,4.884,0.000,0.076,0.178
category_comics,-0.1444,0.016,-8.927,0.000,-0.176,-0.113
category_fashion,-0.0474,0.023,-2.103,0.036,-0.091,-0.003

0,1,2,3
Omnibus:,495.066,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1313.716
Skew:,-0.328,Prob(JB):,5.3699999999999995e-286
Kurtosis:,4.825,Cond. No.,39.1


In [13]:
# Baseline controls plus the business score.
y = normData['norm_amount']
X = sm.add_constant(normData[[*independentVariables, 'norm_business']])
businessModel = sm.OLS(endog=y, exog=X).fit()

businessModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,4604.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:30,Log-Likelihood:,-5248.7
No. Observations:,8388,AIC:,10520.0
Df Residuals:,8377,BIC:,10600.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4869,0.038,12.877,0.000,0.413,0.561
norm_goal,0.1133,0.007,16.111,0.000,0.099,0.127
norm_words,0.0498,0.014,3.449,0.001,0.022,0.078
norm_backers_count,1.2394,0.008,163.349,0.000,1.225,1.254
serial_entrepreneur,-0.1110,0.013,-8.821,0.000,-0.136,-0.086
media,0.2682,0.017,15.377,0.000,0.234,0.302
sustainability,0.1281,0.026,4.937,0.000,0.077,0.179
category_comics,-0.1488,0.016,-9.141,0.000,-0.181,-0.117
category_fashion,-0.0457,0.023,-2.027,0.043,-0.090,-0.002

0,1,2,3
Omnibus:,495.896,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1313.505
Skew:,-0.329,Prob(JB):,5.97e-286
Kurtosis:,4.824,Cond. No.,39.6


In [14]:
# Baseline controls plus the empathic score.
y = normData['norm_amount']
X = sm.add_constant(normData[[*independentVariables, 'norm_empathic']])
empathicModel = sm.OLS(endog=y, exog=X).fit()

empathicModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,4605.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:30,Log-Likelihood:,-5248.0
No. Observations:,8388,AIC:,10520.0
Df Residuals:,8377,BIC:,10600.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5085,0.037,13.582,0.000,0.435,0.582
norm_goal,0.1146,0.007,16.263,0.000,0.101,0.128
norm_words,0.0337,0.014,2.351,0.019,0.006,0.062
norm_backers_count,1.2415,0.008,164.084,0.000,1.227,1.256
serial_entrepreneur,-0.1079,0.013,-8.549,0.000,-0.133,-0.083
media,0.2694,0.017,15.451,0.000,0.235,0.304
sustainability,0.1274,0.026,4.909,0.000,0.077,0.178
category_comics,-0.1446,0.016,-8.940,0.000,-0.176,-0.113
category_fashion,-0.0500,0.023,-2.219,0.027,-0.094,-0.006

0,1,2,3
Omnibus:,487.249,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1306.857
Skew:,-0.318,Prob(JB):,1.66e-284
Kurtosis:,4.826,Cond. No.,39.3


In [15]:
# Baseline controls plus the social score.
y = normData['norm_amount']
X = sm.add_constant(normData[[*independentVariables, 'norm_social']])
socialModel = sm.OLS(endog=y, exog=X).fit()

socialModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,4600.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:31,Log-Likelihood:,-5251.3
No. Observations:,8388,AIC:,10520.0
Df Residuals:,8377,BIC:,10600.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4907,0.041,11.936,0.000,0.410,0.571
norm_goal,0.1131,0.007,16.068,0.000,0.099,0.127
norm_words,0.0400,0.014,2.800,0.005,0.012,0.068
norm_backers_count,1.2414,0.008,162.968,0.000,1.226,1.256
serial_entrepreneur,-0.1099,0.013,-8.699,0.000,-0.135,-0.085
media,0.2701,0.018,15.404,0.000,0.236,0.305
sustainability,0.1270,0.026,4.893,0.000,0.076,0.178
category_comics,-0.1451,0.016,-8.947,0.000,-0.177,-0.113
category_fashion,-0.0467,0.023,-2.071,0.038,-0.091,-0.002

0,1,2,3
Omnibus:,494.082,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1312.528
Skew:,-0.326,Prob(JB):,9.729999999999999e-286
Kurtosis:,4.825,Cond. No.,44.1


In [16]:
# Baseline controls plus all three text scores at once.
y = normData['norm_amount']
X = sm.add_constant(normData[[*independentVariables, 'norm_business', 'norm_empathic', 'norm_social']])
combinedModel = sm.OLS(endog=y, exog=X).fit()

combinedModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,3839.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:31,Log-Likelihood:,-5245.2
No. Observations:,8388,AIC:,10520.0
Df Residuals:,8375,BIC:,10610.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4915,0.043,11.551,0.000,0.408,0.575
norm_goal,0.1145,0.007,16.218,0.000,0.101,0.128
norm_words,0.0416,0.015,2.804,0.005,0.013,0.071
norm_backers_count,1.2402,0.008,162.524,0.000,1.225,1.255
serial_entrepreneur,-0.1082,0.013,-8.555,0.000,-0.133,-0.083
media,0.2688,0.018,15.333,0.000,0.234,0.303
sustainability,0.1288,0.026,4.964,0.000,0.078,0.180
category_comics,-0.1492,0.016,-9.135,0.000,-0.181,-0.117
category_fashion,-0.0481,0.023,-2.128,0.033,-0.092,-0.004

0,1,2,3
Omnibus:,488.029,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1306.686
Skew:,-0.319,Prob(JB):,1.8099999999999998e-284
Kurtosis:,4.825,Cond. No.,46.1


In [17]:
# Table 8
# One column per fitted OLS model; each variable takes two rows: the coefficient
# and, beneath it, the p-value in parentheses. The last three rows hold the
# constant, the adjusted R-squared and the model degrees of freedom.
textVariables = ['norm_business', 'norm_social', 'norm_empathic']
# Put the text variables first. Any copies already present are filtered out
# before prepending, so re-running this cell no longer adds them a second time.
independentVariables = textVariables + [name for name in independentVariables if name not in textVariables]

index = []
for variable in independentVariables:
    index.append(variable)
    index.append('')
index += ['constant', 'Adj. R-squared', 'df']
table8 = pd.DataFrame(index=index)

numberFormat = '{:.%if}' % decimals

models = [baseModel, businessModel, socialModel, empathicModel, combinedModel]
for cntr, model in enumerate(models, start=1):
    variables = model.params.index.tolist()

    # Cell values of this model's column. Deliberately not called `data`: that
    # name holds the raw DataFrame and must not be overwritten.
    columnValues = []
    for variable in independentVariables:
        if variable in variables:
            varIndex = variables.index(variable)
            columnValues.append(numberFormat.format(model.params.iloc[varIndex]))
            columnValues.append('(%s)' % cleanPValue(model.pvalues.iloc[varIndex]))
        else:
            # Variable is not part of this model: leave both rows empty.
            columnValues.append('')
            columnValues.append('')

    # The first parameter is reported as the constant.
    columnValues.append(numberFormat.format(model.params.iloc[0]))
    columnValues.append(numberFormat.format(model.rsquared_adj))
    columnValues.append(numberFormat.format(model.df_model))

    table8['Model ' + str(cntr)] = columnValues

table8.to_csv('Tables/OLS models.csv')
table8

Unnamed: 0,Model 1,Model 2,Model 3,Model 4,Model 5
norm_business,,-0.021,,,-0.021
,,(0.019),,,(0.019)
norm_social,,,0.013,,0.003
,,,(0.550),,(0.876)
norm_empathic,,,,0.028,0.027
,,,,(0.008),(0.013)
norm_goal,0.113,0.113,0.113,0.115,0.115
,(< 0.001),(< 0.001),(< 0.001),(< 0.001),(< 0.001)
norm_words,0.042,0.050,0.040,0.034,0.042
,(0.003),(< 0.001),(0.005),(0.019),(0.005)


In [18]:
# Table 6
# Lower-triangular Pearson correlation matrix. Each variable takes two rows: the
# correlation coefficients and, beneath them, the p-values in parentheses.
textVariables = ['norm_business', 'norm_social', 'norm_empathic']
# Build the variable order from names instead of slicing independentVariables by
# position, so the result does not depend on whether (or how often) the text
# variables were already prepended to that list by another cell.
controlVariables = [name for name in independentVariables if name not in textVariables]
variables = ['norm_amount'] + textVariables + ['female_creator'] + controlVariables

index = []
for variable in variables:
    index.append(variable)
    index.append('')
table6 = pd.DataFrame(index=index, columns=variables)

numberFormat = '{:.%if}' % decimals

noVars = len(variables)
for i in range(noVars):
    rValues = []
    pValues = []

    var1Data = normData[variables[i]]
    # Entries left of the diagonal: correlation with every earlier variable.
    for j in range(i):
        rValue, pValue = st.pearsonr(var1Data, normData[variables[j]])
        rValues.append(numberFormat.format(rValue))
        pValues.append('(%s)' % cleanPValue(pValue))

    # Diagonal: a variable correlates perfectly with itself (fixed values, not computed).
    rValues.append(numberFormat.format(1))
    pValues.append('(< 0.001)')

    # Entries right of the diagonal stay empty.
    padding = [''] * (noVars - i - 1)

    table6.iloc[2*i] = rValues + padding
    table6.iloc[2*i + 1] = pValues + padding

table6.to_csv('Tables/correlation matrix.csv')
table6

Unnamed: 0,norm_amount,norm_business,norm_social,norm_empathic,female_creator,norm_goal,norm_words,norm_backers_count,serial_entrepreneur,media,sustainability,category_comics,category_fashion,year_2021
norm_amount,1.000,,,,,,,,,,,,,
,(< 0.001),,,,,,,,,,,,,
norm_business,-0.010,1.000,,,,,,,,,,,,
,(0.339),(< 0.001),,,,,,,,,,,,
norm_social,-0.103,0.136,1.000,,,,,,,,,,,
,(< 0.001),(< 0.001),(< 0.001),,,,,,,,,,,
norm_empathic,0.031,0.047,0.279,1.000,,,,,,,,,,
,(0.004),(< 0.001),(< 0.001),(< 0.001),,,,,,,,,,
female_creator,-0.051,0.111,0.100,0.108,1.000,,,,,,,,,
,(< 0.001),(< 0.001),(< 0.001),(< 0.001),(< 0.001),,,,,,,,,


In [19]:
# Controls + female_creator + business score and their interaction.
# Note: this rebinds businessModel to the interaction model.
formula = (
    "norm_amount ~ norm_goal + norm_words + norm_backers_count"
    " + serial_entrepreneur + media + sustainability"
    " + category_comics + category_fashion + year_2021"
    " + female_creator + norm_business + norm_business:female_creator"
)
businessModel = smf.ols(formula=formula, data=normData).fit()

businessModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,3844.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:31,Log-Likelihood:,-5240.5
No. Observations:,8388,AIC:,10510.0
Df Residuals:,8375,BIC:,10600.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4562,0.039,11.839,0.000,0.381,0.532
norm_goal,0.1165,0.007,16.473,0.000,0.103,0.130
norm_words,0.0522,0.014,3.614,0.000,0.024,0.081
norm_backers_count,1.2394,0.008,163.448,0.000,1.225,1.254
serial_entrepreneur,-0.1078,0.013,-8.555,0.000,-0.133,-0.083
media,0.2681,0.017,15.379,0.000,0.234,0.302
sustainability,0.1230,0.026,4.736,0.000,0.072,0.174
category_comics,-0.1426,0.016,-8.719,0.000,-0.175,-0.111
category_fashion,-0.0548,0.023,-2.423,0.015,-0.099,-0.010

0,1,2,3
Omnibus:,480.113,Durbin-Watson:,1.95
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1284.014
Skew:,-0.313,Prob(JB):,1.51e-279
Kurtosis:,4.812,Cond. No.,40.5


In [20]:
# Controls + female_creator + empathic score and their interaction.
# Note: this rebinds empathicModel to the interaction model.
formula = (
    "norm_amount ~ norm_goal + norm_words + norm_backers_count"
    " + serial_entrepreneur + media + sustainability"
    " + category_comics + category_fashion + year_2021"
    " + female_creator + norm_empathic + norm_empathic:female_creator"
)
empathicModel = smf.ols(formula=formula, data=normData).fit()

empathicModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,3843.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:31,Log-Likelihood:,-5241.8
No. Observations:,8388,AIC:,10510.0
Df Residuals:,8375,BIC:,10600.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4837,0.038,12.697,0.000,0.409,0.558
norm_goal,0.1170,0.007,16.534,0.000,0.103,0.131
norm_words,0.0356,0.014,2.485,0.013,0.008,0.064
norm_backers_count,1.2416,0.008,164.164,0.000,1.227,1.256
serial_entrepreneur,-0.1052,0.013,-8.325,0.000,-0.130,-0.080
media,0.2696,0.017,15.470,0.000,0.235,0.304
sustainability,0.1225,0.026,4.719,0.000,0.072,0.173
category_comics,-0.1379,0.016,-8.475,0.000,-0.170,-0.106
category_fashion,-0.0568,0.023,-2.509,0.012,-0.101,-0.012

0,1,2,3
Omnibus:,476.24,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1287.82
Skew:,-0.306,Prob(JB):,2.26e-280
Kurtosis:,4.82,Cond. No.,40.0


In [21]:
# Controls + female_creator + social score and their interaction.
# Note: this rebinds socialModel to the interaction model.
formula = (
    "norm_amount ~ norm_goal + norm_words + norm_backers_count"
    " + serial_entrepreneur + media + sustainability"
    " + category_comics + category_fashion + year_2021"
    " + female_creator + norm_social + norm_social:female_creator"
)
socialModel = smf.ols(formula=formula, data=normData).fit()

socialModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,3840.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:31,Log-Likelihood:,-5244.1
No. Observations:,8388,AIC:,10510.0
Df Residuals:,8375,BIC:,10610.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4849,0.043,11.188,0.000,0.400,0.570
norm_goal,0.1157,0.007,16.331,0.000,0.102,0.130
norm_words,0.0419,0.014,2.931,0.003,0.014,0.070
norm_backers_count,1.2412,0.008,163.028,0.000,1.226,1.256
serial_entrepreneur,-0.1070,0.013,-8.463,0.000,-0.132,-0.082
media,0.2701,0.018,15.405,0.000,0.236,0.304
sustainability,0.1218,0.026,4.690,0.000,0.071,0.173
category_comics,-0.1375,0.016,-8.420,0.000,-0.169,-0.105
category_fashion,-0.0538,0.023,-2.370,0.018,-0.098,-0.009

0,1,2,3
Omnibus:,481.969,Durbin-Watson:,1.95
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1291.256
Skew:,-0.314,Prob(JB):,4.05e-281
Kurtosis:,4.817,Cond. No.,72.1


In [22]:
# Combined interaction model: controls plus all three language scores
# (business, empathic, social), each with its gender interaction.
formula = (
    "norm_amount ~ norm_goal + norm_words + norm_backers_count + serial_entrepreneur"
    " + media + sustainability + category_comics + category_fashion + year_2021"
    " + female_creator + norm_business + norm_business:female_creator"
    " + norm_empathic + norm_empathic:female_creator"
    " + norm_social + norm_social:female_creator"
)
combinedModel = smf.ols(formula=formula, data=normData).fit()

combinedModel.summary()

0,1,2,3
Dep. Variable:,norm_amount,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,2884.0
Date:,"Sat, 05 Nov 2022",Prob (F-statistic):,0.0
Time:,22:08:31,Log-Likelihood:,-5237.6
No. Observations:,8388,AIC:,10510.0
Df Residuals:,8371,BIC:,10630.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4751,0.045,10.512,0.000,0.387,0.564
norm_goal,0.1173,0.007,16.497,0.000,0.103,0.131
norm_words,0.0459,0.015,3.086,0.002,0.017,0.075
norm_backers_count,1.2397,0.008,162.497,0.000,1.225,1.255
serial_entrepreneur,-0.1057,0.013,-8.352,0.000,-0.131,-0.081
media,0.2684,0.018,15.309,0.000,0.234,0.303
sustainability,0.1236,0.026,4.758,0.000,0.073,0.174
category_comics,-0.1425,0.016,-8.669,0.000,-0.175,-0.110
category_fashion,-0.0553,0.023,-2.431,0.015,-0.100,-0.011

0,1,2,3
Omnibus:,475.311,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1281.796
Skew:,-0.306,Prob(JB):,4.59e-279
Kurtosis:,4.815,Cond. No.,74.7


In [23]:
# Table 9
# Coefficient table for the OLS models with gender interaction terms: every
# regressor gets a coefficient row followed by an unlabeled p-value row, and
# the table ends with the constant, the adjusted R-squared and the model df.
interactionVariables = ['norm_business', 'norm_business:female_creator', 'norm_social', 'norm_social:female_creator', 'norm_empathic', 'norm_empathic:female_creator', 'female_creator']
# Rebuild the row order only if it has not been rebuilt yet. The assignment is
# self-referential (it slices the previous value of independentVariables), so
# without this guard a second run of the cell would slice the already-extended
# list and scramble both this table and anything derived from the list later.
if independentVariables[:len(interactionVariables)] != interactionVariables:
    independentVariables = interactionVariables + independentVariables[3:]

index = []
for variable in independentVariables:
    index += [variable, '']
index += ['constant', 'Adj. R-squared', 'df']
table9 = pd.DataFrame(index=index)

models = [baseModel, businessModel, socialModel, empathicModel, combinedModel]
numberFormat = '{:.%if}' % decimals  # fixed-point format with `decimals` places
for cntr, model in enumerate(models, start=1):
    # Deliberately not called `data`: that name holds the project DataFrame
    # loaded in the first cell and must not be overwritten by a list here.
    columnValues = []
    for variable in independentVariables:
        if variable in model.params.index:
            columnValues.append(numberFormat.format(model.params[variable]))

            pValue = model.pvalues[variable]
            pValue = '< 0.001' if pValue < 0.001 else numberFormat.format(pValue)
            columnValues.append('(%s)' % pValue)
        else:
            # Regressor not part of this model: leave both rows blank.
            columnValues += ['', '']

    # The first fitted parameter is reported in the 'constant' row.
    columnValues.append(numberFormat.format(model.params.iloc[0]))
    columnValues.append(numberFormat.format(model.rsquared_adj))
    columnValues.append(numberFormat.format(model.df_model))

    table9['Model ' + str(cntr)] = columnValues

table9.to_csv('Tables/OLS models (with interaction terms).csv')
table9

Unnamed: 0,Model 1,Model 2,Model 3,Model 4,Model 5
norm_business,,-0.029,,,-0.029
,,(0.006),,,(0.009)
norm_business:female_creator,,0.015,,,0.014
,,(0.435),,,(0.454)
norm_social,,,-0.008,,-0.011
,,,(0.744),,(0.671)
norm_social:female_creator,,,0.043,,0.029
,,,(0.354),,(0.547)
norm_empathic,,,,0.019,0.021
,,,,(0.123),(0.107)


In [24]:
# Robustness
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
def inCovidPeriod(project):
    """Return True when the project's year/month dummies fall in the COVID window.

    The window is March-December when ``year_2020`` is set, and January-July
    when ``year_2020`` is not set but ``year_2021`` is. ``project`` must support
    lookup by the keys ``year_2020``, ``year_2021`` and ``month_<name>``.
    """
    if project['year_2020'] == 1:
        windowMonths = months[2:]
    elif project['year_2021'] == 1:
        windowMonths = months[:7]
    else:
        return False
    return any(project['month_' + name] == 1 for name in windowMonths)


def createModels(data_, subject):
    """Fit the robustness OLS models on ``data_`` and export their coefficient table.

    One ``sm.OLS`` model is fitted per entry of the global ``additionsVars``
    (regressors ``baseVars`` plus that entry, with a constant added), followed
    by one formula model per entry of the global ``additionsFormula``
    (``baseFormula`` plus that entry). For every model the table gets a column
    holding, per name in the global ``independentVariables``, the coefficient
    and the parenthesised p-value, then the constant, the adjusted R-squared
    and the model degrees of freedom, all formatted with ``decimals`` places.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Sample to fit on; must contain ``norm_amount`` and every regressor
        referenced by the globals above.
    subject : str
        Label inserted into the output file name.

    Returns
    -------
    None
        The table is written to ``Tables/OLS models (<subject>).csv``.
    """
    def formatNumber(value):
        # Fixed-point rendering with the notebook-wide number of decimals.
        return ('{:.%if}' % decimals).format(value)

    y_ = data_['norm_amount']
    models_ = []

    for additionVars in additionsVars:
        X_ = sm.add_constant(data_[baseVars + additionVars])
        models_.append(sm.OLS(y_, X_).fit())

    for additionFormula in additionsFormula:
        formula_ = baseFormula + additionFormula
        models_.append(smf.ols(formula_, data=data_).fit())

    index_ = []
    for variable_ in independentVariables:
        index_ += [variable_, '']
    index_ += ['constant', 'Adj. R-squared', 'df']
    table = pd.DataFrame(index=index_)

    for cntr_, model_ in enumerate(models_, start=1):
        params_ = model_.params

        vals = []
        for variable_ in independentVariables:
            if variable_ in params_.index:
                vals.append(formatNumber(params_[variable_]))

                pValue_ = model_.pvalues[variable_]
                pValue_ = '< 0.001' if pValue_ < 0.001 else formatNumber(pValue_)
                vals.append('(%s)' % pValue_)
            else:
                # Regressor not part of this model: leave both rows blank.
                vals += ['', '']

        # Look the constant up by name instead of by position. add_constant
        # defaults to has_constant='skip', so on a subsample where some column
        # is already constant no 'const' column is prepended and the first
        # parameter would be an ordinary regressor, not the constant.
        constantName_ = next(
            (name_ for name_ in ('const', 'Intercept') if name_ in params_.index),
            None,
        )
        vals.append(formatNumber(params_[constantName_]) if constantName_ is not None else '')
        vals.append(formatNumber(model_.rsquared_adj))
        vals.append(formatNumber(model_.df_model))

        table['Model ' + str(cntr_)] = vals

    table.to_csv('Tables/OLS models (%s).csv' % subject)


def compareLists(list1, list2):
    """Combine two equally long lists element by element with ``or``.

    Returns a new list whose i-th entry is ``list1[i] or list2[i]``. When the
    lengths differ, an empty list is returned instead.
    """
    if len(list1) != len(list2):
        return []
    return [left or right for left, right in zip(list1, list2)]


# Shared ingredients read by createModels.
baseVars = independentVariables[7:]  # everything after the seven leading entries
additionsVars = [
    [],
    ['norm_business'],
    ['norm_social'],
    ['norm_empathic'],
    ['norm_business', 'norm_social', 'norm_empathic'],
]
baseFormula = "norm_amount ~ norm_goal + norm_words + norm_backers_count + serial_entrepreneur + media + sustainability + category_comics + category_fashion + year_2021 + female_creator + "
additionsFormula = [
    'norm_business + norm_business:female_creator',
    'norm_social + norm_social:female_creator',
    'norm_empathic + norm_empathic:female_creator',
    'norm_business + norm_business:female_creator + norm_empathic + norm_empathic:female_creator + norm_social + norm_social:female_creator',
]

# Subsample 1: projects flagged by inCovidPeriod. The helper column is only
# used for filtering and is dropped again before modelling.
covidData = (
    normData
    .assign(COVID=lambda frame: frame.apply(inCovidPeriod, axis=1))
    .loc[lambda frame: frame['COVID']]
    .drop(columns=['COVID'])
)
createModels(covidData, 'COVID')

# Subsample 2: projects in the comics or the fashion category.
isComics = (normData['category_comics'] == 1).tolist()
isFashion = (normData['category_fashion'] == 1).tolist()
comicsFashionData = normData[compareLists(isComics, isFashion)]
createModels(comicsFashionData, 'comics & fashion')