## Clean up "money_raised" column

In [1]:
# Clean up before getting a company profile for each company from LinkedIn
# step 0: clean up currency --- a few non-dollar values still remain
# step 1: check whether funding_round is NA

In [2]:
import pandas as pd
import re
# Load the article table; the CSV is decoded as ISO-8859-1 (Latin-1).
data = pd.read_csv("article_after_processing2.csv", encoding='iso-8859-1')

In [3]:
# (rows, columns) of the frame that was just loaded.
data.shape

(526, 7)

In [4]:
def convert_pond_to_dollar(money, rate=1.29):
    """Convert a pound-denominated amount string into a dollar amount string.

    A value of the form "£<number> <unit>" (for example "£30 M") is multiplied
    by ``rate``, rounded to one decimal place and returned as
    "$<number> <unit>", where <unit> is the single word character that follows
    the whitespace. Any other value is returned unchanged.

    Args:
        money: Amount string taken from the "money_raised" column.
        rate: Pound-to-dollar exchange rate. Defaults to 1.29, the rate this
            notebook has always applied.

    Returns:
        The converted dollar string, or ``money`` itself when it does not
        start with a pound amount.
    """
    # Accept digits with at most one decimal point as the amount. The earlier
    # \w-based pattern also matched letters and repeated dots, so a value such
    # as "£abc M" reached float() and raised ValueError.
    match = re.search(r'^£(\d+\.?\d*)\s(\w)', money)
    if match is None:
        return money
    calc = float(match.group(1)) * rate
    return "${} {}".format(round(calc, 1), match.group(2))
    
# Rewrite every pound amount in the column as a dollar amount.
data["money_raised"] = data["money_raised"].apply(convert_pond_to_dollar)

In [5]:
def convert_euro_to_dollar(row, rate=1.19):
    """Convert a presumed-euro amount in ``row`` into a dollar amount string.

    Amounts that already start with "$" or contain "unknown" are returned
    unchanged. Otherwise the row is treated as euro-denominated when its title
    matches Europ|French|German|Finnish|Dutch or its excerpt matches
    Europ|German|Finnish|Dutch; the first number found in the amount is then
    multiplied by ``rate``, rounded to one decimal place and returned as
    "$<number> M". Rows that match neither pattern are returned unchanged.

    NOTE(review): "French" is only looked for in the title, not in the
    excerpt, and the unit is always written as "M" -- confirm both are
    intended.

    Args:
        row: Mapping-like record with "money_raised", "title" and "excerpt"
            string entries (a DataFrame row when used with ``apply(axis=1)``).
        rate: Euro-to-dollar exchange rate. Defaults to 1.19, the rate this
            notebook has always applied.

    Returns:
        The converted dollar string, or the original "money_raised" value when
        no conversion applies or no number can be found in it.
    """
    money = row["money_raised"]
    if re.search(r'^\$|unknown', money):
        return money

    # The excerpt is only consulted when the title gives no hint.
    looks_european = (
        re.search(r'Europ|French|German|Finnish|Dutch', row["title"])
        or re.search(r'Europ|German|Finnish|Dutch', row["excerpt"])
    )
    if not looks_european:
        return money

    # Digits with at most one decimal point, so float() cannot fail on the
    # match. Without a number there is nothing to convert; previously this
    # case raised AttributeError on ``None.group``.
    match = re.search(r'\d+\.?\d*', money)
    if match is None:
        return money

    calc = float(match.group(0)) * rate
    return "${} {}".format(round(calc, 1), "M")
            
    
# Row-wise apply: the conversion reads title and excerpt as well as the amount.
data["money_raised"] = data.apply(convert_euro_to_dollar, axis=1)

In [6]:
# confirm the change: preview the first rows after the currency conversions
data.head()

Unnamed: 0,title,link,excerpt,published_at,funding_round,money_raised,Company
0,Indian online lending platform Capital Float r...,https://techcrunch.com/2017/08/21/indian-onlin...,"Capital Float, which claims to be the largest ...",2017/08/21,Series C,$45 M,Capital Float
1,"Carwow, a UK startup that helps you buy a new ...",https://techcrunch.com/2017/07/30/carwow-serie...,"Carwow, a platform that helps you buy a new ca...",2017/07/30,Series C,$39 M,Carwow
2,Stash raises $40 million Series C to make inve...,https://techcrunch.com/2017/07/13/stash-raises...,Micro-investing app Stash has raised an additi...,2017/07/13,Series C,$40 M,Stash
3,"Blowing up the re-location industry, Move Guid...",https://techcrunch.com/2017/07/11/blowing-up-t...,Whatever any country leader says about banning...,2017/07/11,Series C,$48 M,Move Guides
4,Cohesity raises $90M+ Series C round for its h...,https://techcrunch.com/2017/04/04/cohesity-rai...,"Cohesity, a company that helps enterprises to ...",2017/04/04,Series C,$90 M,Cohesity


In [7]:
# Inspect the rows whose amount neither starts with "$" nor contains
# "unknown", i.e. the values the currency conversions did not normalise.
mask = data.money_raised.apply(
    lambda money: re.search(r'^\$|unknown', money) is None)
data.loc[mask, ["title", "money_raised", "Company"]]

Unnamed: 0,title,money_raised,Company
123,ANA?s new C-3PO jet is fluent in over 6 millio...,6 M,ANA?s new C-3PO jet
189,Leica's C-LUX 2: 7.2 Megapixels in Your Pocket,7.2 M,Leica's C-7.2 Megapixels in Your Pocket
202,TiVo Series 3 Modded to 1TB,3 M,TiVo
229,The Facebook Button Is Back On Phones: Nokia P...,2.9 M,205 Dual-SIM Qwerty Handsets
267,"BranchOut Hits 25 Million Users, Nabs $25M In ...",25 M,"BranchOut Hits 25 Million Users,"
275,Kids? Game Moshi Monsters Set To Leap Onto The...,x11 M,Kids? Game Moshi Monsters Set To Leap Onto The...
310,Index Ventures Leads ?10M Series C In Online P...,?10 M,Navabi
361,ANAâ??s new C-3PO jet is fluent in over 6 mill...,6 M,ANAâ??s new C-3PO jet
376,Kidsâ?? Game Moshi Monsters Set To Leap Onto T...,x11 M,Kidsâ?? Game Moshi Monsters Set To Leap Onto T...
379,Index Ventures Leads â?¬10M Series C In Online...,¬10 M,Navabi


In [8]:
# Manually correct the amounts (and one company name) for rows 267, 275 and 310.
# DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
# .at is the label-based scalar setter that replaces it.
data.at[267, "Company"] = "BranchOut"
data.at[267, "money_raised"] = "$25 M"
data.at[275, "money_raised"] = "$11 M"
data.at[310, "money_raised"] = "$10 M"
# set_value returned the frame, which the cell displayed; keep a preview.
data.head(10)

Unnamed: 0,title,link,excerpt,published_at,funding_round,money_raised,Company
0,Indian online lending platform Capital Float r...,https://techcrunch.com/2017/08/21/indian-onlin...,"Capital Float, which claims to be the largest ...",2017/08/21,Series C,$45 M,Capital Float
1,"Carwow, a UK startup that helps you buy a new ...",https://techcrunch.com/2017/07/30/carwow-serie...,"Carwow, a platform that helps you buy a new ca...",2017/07/30,Series C,$39 M,Carwow
2,Stash raises $40 million Series C to make inve...,https://techcrunch.com/2017/07/13/stash-raises...,Micro-investing app Stash has raised an additi...,2017/07/13,Series C,$40 M,Stash
3,"Blowing up the re-location industry, Move Guid...",https://techcrunch.com/2017/07/11/blowing-up-t...,Whatever any country leader says about banning...,2017/07/11,Series C,$48 M,Move Guides
4,Cohesity raises $90M+ Series C round for its h...,https://techcrunch.com/2017/04/04/cohesity-rai...,"Cohesity, a company that helps enterprises to ...",2017/04/04,Series C,$90 M,Cohesity
5,Digg raises Series C led by USA Today owner Ga...,https://techcrunch.com/2016/09/13/digg-raises-...,As publishers of dead-tree newspapers continue...,2016/09/13,Series C,unknown,Digg
6,"After bump in the road, Movinga raises ?17M Se...",https://techcrunch.com/2016/12/08/on-the-movin...,Berlin's startup scene is probably the most go...,2016/12/08,Series C,$20.2 M,Movinga
7,Leanplum raises $29M Series C round for its mo...,https://techcrunch.com/2016/10/19/leanplum-rai...,Leanplum started out as a mobile A/B testing p...,2016/10/19,Series C,$29 M,Leanplum
8,"Forter raises $32M Series C, continues battle ...",https://techcrunch.com/2016/04/21/forter-serie...,Fraud prevention company Forter today announce...,2016/04/21,Series C,$32 M,Forter
9,MemSQL raises $36M Series C round for its in-m...,https://techcrunch.com/2016/04/21/memsql-raise...,In-memory database platform MemSQL today annou...,2016/04/21,Series C,$36 M,MemSQL


In [9]:
# Drop every remaining row whose amount neither starts with "$" nor contains
# "unknown".
mask = data.money_raised.apply(
    lambda money: re.search(r'^\$|unknown', money) is None)
data.drop(data.index[mask], inplace=True)

In [10]:
# confirm the change: rebuild the mask from the frame as it is now. A mask
# computed before the drop only flags rows that no longer exist, so selecting
# with it is empty no matter what the frame contains.
mask = data.money_raised.apply(
    lambda money: re.search(r'^\$|unknown', money) is None)
data.loc[mask][["title", "money_raised", "Company"]]

Unnamed: 0,title,money_raised,Company


In [11]:
# Number of rows whose "money_raised" is exactly "unknown" (24 at this point).
int((data.money_raised == "unknown").sum())

24

In [12]:
# Remove the rows whose amount is "unknown".
unknown_rows = data.index[data.money_raised == "unknown"]
data.drop(unknown_rows, inplace=True)

In [13]:
# Confirm that no "unknown" amounts are left (expect zero rows).
data.loc[data.money_raised == "unknown"].shape

(0, 7)

In [14]:
def digitize_money_raised(money):
    """Extract the number that follows "$" in an amount string as a float.

    For example "$45 M" gives 45.0 and "$20.2 M" gives 20.2.

    NOTE(review): the unit letter after the number is ignored, so a value such
    as "$1.2 B" would be read as 1.2 -- confirm the column never holds units
    other than "M".

    Args:
        money: Amount string taken from the "money_raised" column.

    Returns:
        The parsed number as a float, or NaN when no "$<number>" is present.
        Returning NaN rather than the original string keeps the resulting
        column numeric, so comparisons such as ``< 10`` cannot fail on a
        stray string.
    """
    # Digits with at most one decimal point, so float() cannot fail on the match.
    match = re.search(r'\$(\d+\.?\d*)', money)
    if match is None:
        return float("nan")
    return float(match.group(1))
    
# Add a numeric companion column parsed from the cleaned amount strings.
data["money_raised_float"] = data["money_raised"].apply(digitize_money_raised)

### Now we have clean data for the column "money_raised"

In [15]:
# Count the rows with a missing funding_round; they are kept for now.
data.loc[data["funding_round"].isnull()].shape

(31, 8)

### Before moving on, drop companies in a Series A round and companies that raised less than \$10M

In [16]:
# Flag Series A rounds and amounts below 10, then remove those rows in place.
is_series_a = data.funding_round.eq("Series A")
is_small_raise = data.money_raised_float.lt(10)
mask = is_series_a | is_small_raise
data.drop(data.index[mask], inplace=True)

In [20]:
# Confirm the drop: recompute the condition on the current frame and count
# the rows that still satisfy it (expect 0).
mask = data.funding_round.eq("Series A") | data.money_raised_float.lt(10)
int(mask.sum())

0

In [17]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
data.to_csv("article_after_processing3.csv", index = False)