## Cleaning the company DataFrame

In [1]:
import pandas as pd

In [2]:
# Load the raw company export; the first CSV column is used as the row index.
df_company = pd.read_csv(
    filepath_or_buffer='data/1_company_data.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df_company.head(n=5)

Unnamed: 0,vat,name,address,zipcode,city,cityname,protected,phone,email,fax,...,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013
Dynatest A/S,20318872,A/S DYNATEST ENGINEERING,Generatorvej 21,2730,Herlev,,False,,,,...,3581.0,9606.0,4530.0,,,,,,,
"Eriksholm Research Centre, Oticon",50091015,PROPOLIS RESEARCH CENTRE A/S,Lille Veksøvej 11,3670,Veksø Sjælland,,False,,,,...,,1525319.0,165416.0,551594.0,962573.0,1011568.0,743559.0,730954.0,535275.0,798601.0
Formpipe,26366216,FORMPIPE LASERNET A/S,Lautrupvang 1,2750,Ballerup,,False,43660210.0,admin.lasernet@formpipe.com,,...,,36583.0,14719.0,7525.0,10511.0,-748.0,-4924.0,-4145.0,-590.0,-7751.0
Novo Nordisk,24256790,NOVO NORDISK A/S,Novo Alle 1,2880,Bagsværd,,False,44448888.0,,,...,,47523000.0,42159000.0,39252000.0,38530000.0,37873000.0,37791000.0,34623000.0,26413000.0,25190000.0
PFA,31050162,PFA BANK A/S,Sundkrogsgade 4,2100,København Ø,,False,70808500.0,post@pfabank.dk,39172950.0,...,-511000.0,-79000.0,108000.0,10000.0,92000.0,181000.0,136000.0,-587000.0,548000.0,224000.0


In [4]:
# Number of non-NaN entries in each column.
df_company.count(axis=0)

vat                25
name               25
address            25
zipcode            25
city               25
cityname            1
protected          25
phone              16
email              14
fax                 5
startdate          25
enddate             1
employees          23
addressco           4
industrycode       25
industrydesc       25
companycode        25
companydesc        25
creditstartdate     1
creditbankrupt     25
creditstatus        1
owners              4
productionunits    25
t                  25
version            25
2022                8
2021               23
2020               21
2019               20
2018               19
2017               18
2016               17
2015               17
2014               17
2013               16
dtype: int64

In [5]:
# Missing-value count for the financial data only: every column from the
# '2022' label through the last column of the frame.
yearly_figures = df_company.loc[:, '2022':]
yearly_figures.isna().sum()

2022    17
2021     2
2020     4
2019     5
2018     6
2017     7
2016     8
2015     8
2014     8
2013     9
dtype: int64

In [6]:
# Remove the '2022' column in place. It is the sparsest year in the
# missing-value summary above (17 NaN) -- presumably the reason it is dropped.
df_company.drop(columns='2022', inplace=True)

In [7]:
# Keep only the columns that have non-NaN values in at least 10 % of the rows,
# i.e. drop every column where more than 90 % of the data is missing.
no_rows = len(df_company)
df_company.dropna(axis='columns', thresh=no_rows * 0.1, inplace=True)

In [8]:
def all_same(series):
    """Return True when every element of ``series`` equals its first element.

    The first element is fetched by position (``.iloc[0]``), so the result does
    not depend on the index labels: it works for string-labelled rows as well
    as for integer indexes that do not contain the label ``0``.

    Parameters
    ----------
    series : pandas.Series
        The values to inspect.

    Returns
    -------
    bool
        True if all values compare equal to the first one, or if the series
        is empty (there is nothing that could differ). False otherwise.

    Notes
    -----
    NaN never compares equal to anything, so a series whose first value is
    NaN is reported as not constant.
    """
    # An empty series has no first element to compare against.
    if series.empty:
        return True
    # Positional access: ``series[0]`` would be a *label* lookup.
    first_value = series.iloc[0]
    return bool((series == first_value).all())

# Collect the columns in which every row holds the same value; such columns
# carry no information. 'productionunits' is explicitly excluded from the check
# (NOTE(review): the reason is not visible here -- confirm).
cols_with_same_vals = [
    column_name
    for column_name in df_company.columns
    if column_name != 'productionunits' and all_same(df_company[column_name])
]
print(cols_with_same_vals)
# Remove the constant columns from the frame in place.
df_company.drop(columns=cols_with_same_vals, inplace=True)

['t', 'version']


In [9]:
# Columns that are not wanted in the cleaned data set; removed in place.
df_company.drop(
    columns=['protected', 'fax', 'companycode', 'companydesc', 'owners'],
    inplace=True,
)

In [10]:
# Keep only the part of each address before the first comma. This strips the
# floor (or any other suffix after the comma) so the address is formatted for
# the geo-coordinate search. The vectorised .str accessor leaves a missing
# address as NaN instead of raising, which a per-row str.split call would do.
df_company['address'] = df_company['address'].str.split(',', n=1).str[0]

In [11]:
# Persist the cleaned frame (index included) as the input for the next step.
df_company.to_csv(path_or_buf='data/2_cleaned_data.csv')