In [None]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols as sm_ols
from statsmodels.iolib.summary2 import summary_col

In [None]:
# Read the raw startup table from disk and display it.
startups = pd.read_csv(
    'Startup Data/startup_raw.csv',
)
startups

In [None]:
# Keep only the rows whose survival rate since birth is exactly 100%; per the
# original note, those rows show how many startups were made in that year.
birth_year_rows = startups['Survival Rates Since Birth (%)'] == 100
startups_started = startups[birth_year_rows]

# Give the columns used downstream formula/merge-friendly labels.
# Only the labels change here; no dtype conversion is performed.
startups_started = startups_started.rename(
    columns={
        'Year Ended': 'year',
        'Surviving Establishments': 'Surviving_Establishments',
    }
)

startups_started

In [None]:
# Reduce the startup table to the year and the establishment count, then save
# that two-column frame as a CSV (row index omitted).
selected_columns = ['year', 'Surviving_Establishments']
startups_numbers = startups_started.loc[:, selected_columns]
startups_numbers.to_csv(
    'Startup Data/startup_numbers.csv',
    index=False,
)

In [None]:
# Load the work-from-home dataset and discard its 'License' and 'Citation'
# columns in the same step.
WFH_Data = (
    pd.read_csv('Work From Home Data/WFH 1965-present.csv')
    .drop(columns=['License', 'Citation'])
)

In [None]:
# Parse 'date' into datetimes and derive a 'year' column from it.
# strftime('%Y') yields the year as a four-digit *string*, not an integer.
WFH_Data['date'] = pd.to_datetime(WFH_Data['date'])
year_labels = WFH_Data['date'].dt.strftime('%Y')
WFH_Data['year'] = year_labels
WFH_Data.tail(55)

In [None]:
# Take rows 18 through 22 by position (the end of an iloc slice is exclusive).
# The original note says only years 2015 onward are needed.
# NOTE(review): these positions are hard-coded against the CSV's row order —
# confirm they still line up if the source file changes.
wfh_row_window = slice(18, 23)
filtered_WFH = WFH_Data.iloc[wfh_row_window]
filtered_WFH

In [None]:
# Append one annual-average row per year for 2020-2024.
#
# The previous version wrote the mean as a "%.1f"-formatted *string* and filled
# the remaining cells with the literal text 'NaN', which forces the affected
# columns to object dtype. Here the mean stays a float (rounded to one decimal)
# and every other cell is a real missing value.
#
# The summary rows carry an *integer* year while the source rows keep the
# string year produced by strftime('%Y'); the later `== 2020`-style selections
# depend on that difference to pick out only these rows.
summary_years = [2020, 2021, 2022, 2023, 2024]

# Drop summary rows left by an earlier run so re-executing this cell does not
# stack duplicates. String years such as '2020' are not `in` the integer list.
is_summary_row = WFH_Data['year'].map(lambda yr: yr in summary_years).astype(bool)
WFH_Data = WFH_Data[~is_summary_row]

annual_means = pd.DataFrame(
    [
        {
            'WFH_share': round(
                float(WFH_Data.loc[WFH_Data['year'] == str(yr), 'WFH_share'].mean()),
                1,
            ),
            'year': yr,
        }
        for yr in summary_years
    ]
)

# Columns absent from annual_means are filled with missing values by concat.
WFH_Data = pd.concat([WFH_Data, annual_means], ignore_index=True)
WFH_Data

In [None]:
# Peek at the final ten rows of the work-from-home table.
WFH_Data.tail(n=10)

In [None]:
# Renumber the rows from 0. drop=True discards the old positional labels
# instead of inserting them as an extra 'index' column; that column would
# otherwise flow into the saved CSV and the merged frame, and re-running the
# cell would keep adding columns (and eventually raise).
filtered_WFH = filtered_WFH.reset_index(drop=True)

In [None]:
# Stack the rows whose 'year' equals the integers 2020-2023 underneath the
# earlier rows, one year after another, then remove exact duplicate rows.
# 2024 is left out, as in the original (where that line was commented out).
rows_by_year = [
    WFH_Data.loc[WFH_Data['year'] == summary_year]
    for summary_year in (2020, 2021, 2022, 2023)
]
filtered_WFH = pd.concat([filtered_WFH, *rows_by_year])
filtered_WFH = filtered_WFH.drop_duplicates()
filtered_WFH

In [None]:
# Inspect the dtype of the merge key. The previous line referenced the bound
# method `.astype` without calling it, so it only echoed the method object.
filtered_WFH['year'].dtype

In [None]:
# Persist the cleaned work-from-home rows (row index omitted).
filtered_WFH.to_csv(
    'Work From Home Data/cleaned_WFH.csv',
    index=False,
)

In [None]:
# Merge our Datasets
#
# Left-join the startup counts onto the work-from-home rows by year.
# The WFH 'year' values are strings from strftime('%Y') mixed with integer
# years on the appended annual rows, so both keys are coerced to one integer
# dtype first; otherwise string years cannot match numeric ones and those rows
# come back with missing startup counts.
# NOTE(review): the startup 'year' is presumably a plain year in the CSV —
# to_numeric raises if any value is not, which is preferable to a silent miss.
wfh_keyed = filtered_WFH.assign(
    year=pd.to_numeric(filtered_WFH['year']).astype('int64')
)
startups_keyed = startups_numbers.assign(
    year=pd.to_numeric(startups_numbers['year']).astype('int64')
)

# validate='1:1' makes pandas raise if a year repeats on either side.
wfh_startups = pd.merge(wfh_keyed,
                        startups_keyed,
                        on = ['year'],
                        how = 'left',
                        validate = '1:1')
wfh_startups

In [None]:
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col  # Importing summary_col function
# Regress the number of startups created on the WFH share of the workforce:
# in levels (m1), with log WFH share (m2), and with log startups (m3).
#
# WFH_share is coerced to numeric first: the annual-average rows appended
# upstream may hold the share as formatted text, and a text column would be
# handled as a categorical regressor by the formula parser, not as a number.
# Values that cannot be parsed become NaN.
reg_data = wfh_startups.assign(
    WFH_share=pd.to_numeric(wfh_startups['WFH_share'], errors='coerce')
)

# Run OLS
m1 = sm.OLS.from_formula('Surviving_Establishments ~ WFH_share', data=reg_data).fit()
m2 = sm.OLS.from_formula('Surviving_Establishments ~ np.log(WFH_share)', data=reg_data).fit()
m3 = sm.OLS.from_formula('np.log(Surviving_Establishments) ~ WFH_share', data=reg_data).fit()

# Print out multiple regression results at once
table = summary_col(results=[m1, m2, m3],
                    float_format='%0.2f',
                    stars=True,
                    model_names=['m1', 'm2', 'm3'],
                    info_dict=None)  # no extra summary rows requested

print(table)


In [None]:
# Reverse specification: WFH share as the outcome, startups as the regressor.
#
# The previous formulas used placeholder names ('wfh%', 'startups#',
# 'log_startups#', 'log_wfh%') that are neither valid formula terms nor columns
# of wfh_startups, and passed an `info_dict` that was never defined.
# NOTE(review): the mapping to WFH_share / Surviving_Establishments is inferred
# from the placeholder names — confirm it is the intended specification.
reg_data = wfh_startups.assign(
    WFH_share=pd.to_numeric(wfh_startups['WFH_share'], errors='coerce')
)

# Run OLS
m1 = sm_ols('WFH_share ~ Surviving_Establishments', data=reg_data).fit()
m2 = sm_ols('WFH_share ~ np.log(Surviving_Establishments)', data=reg_data).fit()
m3 = sm_ols('np.log(WFH_share) ~ Surviving_Establishments', data=reg_data).fit()

# Extra row for the results table: number of observations used by each model.
info_dict = {'No. observations': lambda res: '{0:d}'.format(int(res.nobs))}

# Print out multiple regression results at once
table = summary_col(results=[m1,m2,m3],
                    float_format='%0.2f',
                    stars = True,
                    model_names=['m1','m2','m3'],
                    info_dict=info_dict)

print(table)


In [None]:
# Line plot of startups created against the WFH share (both in levels).
#
# Rewritten with pandas' built-in (matplotlib-backed) plotting: the previous
# call used `sns`, which is never imported in this notebook, and referenced
# columns ('wfh%', 'startups', 'high_leverage') that wfh_startups lacks.
# NOTE(review): column mapping inferred from the regression cells — confirm.
plot_data = (
    wfh_startups
    .assign(WFH_share=pd.to_numeric(wfh_startups['WFH_share'], errors='coerce'))
    .sort_values('WFH_share')  # draw the line left to right
)
lineplot1 = plot_data.plot(x='WFH_share', y='Surviving_Establishments',
                           marker='o', legend=False)
lineplot1.set(xlabel='WFH share',
              ylabel='Startups created',
              title='Startups created vs. WFH share');

In [None]:
# Line plot of log(startups created) against the WFH share.
#
# Rewritten with pandas' built-in (matplotlib-backed) plotting: the previous
# call used `sns`, which is never imported in this notebook, and referenced
# columns ('wfh%', 'log_startups', 'high_leverage') that wfh_startups lacks.
# NOTE(review): column mapping inferred from the regression cells — confirm.
plot_data = (
    wfh_startups
    .assign(
        WFH_share=pd.to_numeric(wfh_startups['WFH_share'], errors='coerce'),
        log_startups=np.log(
            pd.to_numeric(wfh_startups['Surviving_Establishments'], errors='coerce')
        ),
    )
    .sort_values('WFH_share')  # draw the line left to right
)
lineplot2 = plot_data.plot(x='WFH_share', y='log_startups',
                           marker='o', legend=False)
lineplot2.set(xlabel='WFH share',
              ylabel='log(startups created)',
              title='log(startups created) vs. WFH share');

In [None]:
# Line plot of startups created against log(WFH share).
#
# Rewritten with pandas' built-in (matplotlib-backed) plotting: the previous
# call used `sns`, which is never imported in this notebook, and referenced
# columns ('log_wfh%', 'startups', 'high_leverage') that wfh_startups lacks.
# NOTE(review): column mapping inferred from the regression cells — confirm.
wfh_share_numeric = pd.to_numeric(wfh_startups['WFH_share'], errors='coerce')
plot_data = (
    wfh_startups
    .assign(log_wfh_share=np.log(wfh_share_numeric))
    .sort_values('log_wfh_share')  # draw the line left to right
)
lineplot3 = plot_data.plot(x='log_wfh_share', y='Surviving_Establishments',
                           marker='o', legend=False)
lineplot3.set(xlabel='log(WFH share)',
              ylabel='Startups created',
              title='Startups created vs. log(WFH share)');