## Clean up "money_raised" column

In [1]:
# Clean up before getting a company profile for each company from LinkedIn
# step 0: clean up currency --- a few non-dollar values still remain
# step 1: check whether funding_round is NA

In [2]:
import pandas as pd
import re
# Load the article table from CSV, decoding it as ISO-8859-1.
# NOTE(review): titles displayed further down contain sequences such as "â?¬"
# where a euro sign is expected, which suggests the file may actually be
# UTF-8 encoded -- confirm that iso-8859-1 is the intended encoding.
data = pd.read_csv("article_after_processing2.csv", encoding='iso-8859-1')

In [3]:
# (rows, columns) of the freshly loaded table
data.shape

(616, 7)

In [4]:
def convert_pond_to_dollar(money, rate=1.29):
    """Convert a pound-denominated amount string into a dollar amount string.

    A value shaped like "£10 M" (pound sign, number, one whitespace character,
    unit letter) becomes "$12.9 M": the number is multiplied by ``rate``,
    rounded to one decimal place, and the first character of the unit is kept.

    Parameters
    ----------
    money : str
        Raw ``money_raised`` value.
    rate : float, optional
        Pound-to-dollar exchange rate. Defaults to 1.29, the value that was
        previously hard-coded in this function.

    Returns
    -------
    str or the original object
        The converted string, or ``money`` unchanged when it is not a string,
        does not start with a pound amount, or its numeric part cannot be
        parsed as a float.
    """
    # Missing values (None / NaN) cannot be searched with a regex; pass them through.
    if not isinstance(money, str):
        return money

    match = re.search(r'^\£(\w+\.*\w*)\s(\w)', money)
    if not match:
        return money

    try:
        amount = float(match.group(1))
    except ValueError:
        # \w also matches letters, so the captured text is not always a number.
        return money

    return "${} {}".format(str(round(amount * rate, 1)), match.group(2))
    
# Run the pound-to-dollar conversion over every "money_raised" entry.
data["money_raised"] = data["money_raised"].apply(convert_pond_to_dollar)

In [5]:
def convert_euro_to_dollar(row, rate=1.19):
    """Convert a presumably euro-denominated ``money_raised`` value to dollars.

    The value is left alone when it already starts with "$" or contains
    "unknown". Otherwise the row is treated as European when its title matches
    ``Europ|French|German|Finnish|Dutch`` or its excerpt matches
    ``Europ|German|Finnish|Dutch`` ("French" is only checked in the title).
    For such rows the first number found in ``money_raised`` is multiplied by
    ``rate``, rounded to one decimal place and returned as "$<amount> M".
    The unit is always written as "M"; the original unit text is not inspected.

    Parameters
    ----------
    row : mapping
        Anything indexable by "money_raised", "title" and "excerpt"
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    rate : float, optional
        Euro-to-dollar exchange rate. Defaults to 1.19, the value that was
        previously hard-coded in this function.

    Returns
    -------
    str or the original object
        The converted string, or ``row["money_raised"]`` unchanged when no
        conversion applies or no parsable number is present.
    """
    money = row["money_raised"]
    # Non-string values (None / NaN) cannot be searched; pass them through.
    if not isinstance(money, str) or re.search(r'^\$|unknown', money):
        return money

    # A missing title or excerpt is treated as empty text instead of crashing re.search.
    title = row["title"] if isinstance(row["title"], str) else ""
    excerpt = row["excerpt"] if isinstance(row["excerpt"], str) else ""
    looks_european = (
        re.search(r'Europ|French|German|Finnish|Dutch', title)
        or re.search(r'Europ|German|Finnish|Dutch', excerpt)
    )
    if not looks_european:
        return money

    match = re.search(r'\d+\.*\d*', money)
    if match is None:
        # No digits at all: nothing to convert.
        return money
    try:
        amount = float(match.group(0))
    except ValueError:
        # The pattern allows repeated dots (e.g. "1..5"), which float() rejects.
        return money

    return "${} {}".format(str(round(amount * rate, 1)), "M")
            
    
# The euro conversion needs the title and excerpt as well, so it runs row by row.
data["money_raised"] = data.apply(convert_euro_to_dollar, axis=1)

In [6]:
# Inspect the rows whose "money_raised" still neither starts with "$" nor
# contains "unknown" after the currency conversions.
mask = data.money_raised.apply(
    lambda money: re.search(r'^\$|unknown', money) is None)
data[mask]

Unnamed: 0,title,link,excerpt,published_at,funding_round,money_raised,Company
196,TiVo Series 3 Modded to 1TB,https://techcrunch.com/2006/10/17/tivo-series-...,If the 250GB of storage on your Series 3 TiVo ...,2006/10/17,,3 M,TiVo
222,The Facebook Button Is Back On Phones: Nokia P...,https://techcrunch.com/2012/11/26/the-facebook...,"Amid all the Windows Phone, luridly coloured L...",2012/11/26,,2.9 M,Asha 205
258,"BranchOut Hits 25 Million Users, Nabs $25M In ...",https://techcrunch.com/2012/04/19/branchout-25...,BranchOut is officially going for the big time...,2012/04/19,Series C,25 M,"Million Users,"
301,Index Ventures Leads ?10M Series C In Online P...,https://techcrunch.com/2013/12/02/index-ventur...,Index Ventures has led a ?10 million Series C ...,2013/12/02,Series C,?10 M,Index Ventures
457,Index Ventures Leads â?¬10M Series C In Online...,https://techcrunch.com/2013/12/02/index-ventur...,Index Ventures has led aÂ â?¬10 million Series...,2013/12/02,Series C,¬10 M,Index Ventures
488,Social Media Curation For Media Firms Crowdyne...,https://techcrunch.com/2015/01/12/social-media...,"Crowdynews, a social media curation platform a...",2015/01/12,Series A,¬2.5 M,Crowdynews
500,Price f(x) raises â?¬4M Series A for pricing o...,https://techcrunch.com/2016/12/20/price-fx/,"Price f(x), a pricing optimization SaaS, has r...",2016/12/20,Series A,¬4 M,Price f(x)
580,Fuji adds even more colors of the rainbow to t...,https://techcrunch.com/2008/01/24/fuji-adds-ev...,"For the fun, zany, quirky folks comes the Fine...",2008/01/24,,45 M,Fuji


In [7]:
data.loc[580] # drop this because this article is not about a startup

title            Fuji adds even more colors of the rainbow to t...
link             https://techcrunch.com/2008/01/24/fuji-adds-ev...
excerpt          For the fun, zany, quirky folks comes the Fine...
published_at                                            2008/01/24
funding_round                                                  NaN
money_raised                                                  45 M
Company                                                       Fuji
Name: 580, dtype: object

In [8]:
data.loc[457] # this is the same article as loc[301] (same link and date), so drop this one.

title            Index Ventures Leads â?¬10M Series C In Online...
link             https://techcrunch.com/2013/12/02/index-ventur...
excerpt          Index Ventures has led aÂ â?¬10 million Series...
published_at                                            2013/12/02
funding_round                                             Series C
money_raised                                                 ¬10 M
Company                                             Index Ventures
Name: 457, dtype: object

In [9]:
# Inspect row 258: money_raised has no "$" prefix and Company holds "Million Users,"
# rather than a company name.
data.loc[258]

title            BranchOut Hits 25 Million Users, Nabs $25M In ...
link             https://techcrunch.com/2012/04/19/branchout-25...
excerpt          BranchOut is officially going for the big time...
published_at                                            2012/04/19
funding_round                                             Series C
money_raised                                                  25 M
Company                                             Million Users,
Name: 258, dtype: object

In [10]:
# Correct row 258 by hand. The assignment must go through a single
# .loc[row, column] call: data.loc[258] on its own returns a copy of the row,
# so setting attributes on it never reaches the DataFrame.
data.loc[258, "Company"] = "BranchOut"
data.loc[258, "money_raised"] = "$25 M"

In [11]:
# Inspect row 301: the currency symbol in money_raised shows up as "?".
data.loc[301]

title            Index Ventures Leads ?10M Series C In Online P...
link             https://techcrunch.com/2013/12/02/index-ventur...
excerpt          Index Ventures has led a ?10 million Series C ...
published_at                                            2013/12/02
funding_round                                             Series C
money_raised                                                 ?10 M
Company                                             Index Ventures
Name: 301, dtype: object

In [12]:
# Correct row 301 by hand. The assignment must go through a single
# .loc[row, column] call: data.loc[301] on its own returns a copy of the row,
# so setting attributes on it never reaches the DataFrame.
data.loc[301, "Company"] = "Navabi"
data.loc[301, "money_raised"] = "$12 M"

In [13]:
# Drop every remaining row whose "money_raised" neither starts with "$" nor
# contains "unknown".
mask = data.money_raised.apply(
    lambda money: re.search(r'^\$|unknown', money) is None)
data.drop(index=data[mask].index, inplace=True)

In [15]:
# count the rows whose "money_raised" is "unknown" (there are 19)
int((data.money_raised == "unknown").sum())

19

In [16]:
# remove the rows whose "money_raised" is "unknown"
unknown_rows = data.index[data.money_raised == "unknown"]
data.drop(unknown_rows, inplace=True)

In [17]:
# sanity check: no "unknown" rows should be left (expect zero rows)
data.loc[data.money_raised == "unknown"].shape

(0, 7)

In [18]:
def digitize_money_raised(money):
    """Pull the number that follows the first "$" out of an amount string.

    Returns that number as a float. When no "$<number>" is found, the input is
    returned unchanged. Whatever follows the number (for example a unit
    letter) does not affect the result.
    """
    found = re.search(r'\$(\d+\.*\d*)', money)
    return float(found.group(1)) if found else money
    
# Store the numeric part of each amount in a separate column.
data["money_raised_float"] = data["money_raised"].apply(digitize_money_raised)

### Now we have clean data for the column "money_raised"

In [19]:
# how many rows have no funding_round?
data.loc[data["funding_round"].isnull()].shape

(34, 8)

In [20]:
# Write the cleaned table to CSV, omitting the index column.
data.to_csv("article_after_processing3.csv", index = False)