## Extract company name

In [1]:
import pandas as pd
import re
# Load the article table from CSV, decoding the bytes as ISO-8859-1.
# NOTE(review): titles displayed later in this notebook contain sequences such as
# "FoodChÃ©ri" and "Harryâ??s", which is what UTF-8 text looks like when it is decoded
# as ISO-8859-1 — confirm the file's real encoding before trusting extracted names.
data = pd.read_csv("article_after_processing1.csv", encoding='iso-8859-1')

In [2]:
# (rows, columns) of the table as loaded
data.shape

(646, 6)

In [3]:
# get a company name
# Extraction patterns, compiled once and tried in order. Each entry is
# (compiled pattern, index of the capture group that holds the company name).
_COMPANY_PATTERNS = (
    # 1. The one or two words right before a funding verb, optionally separated from
    #    the verb by a ", ... ," aside. Verbs are matched as prefixes (no trailing
    #    word boundary), so "raise" also covers "raises" and "raised".
    (re.compile(r"(\w*\s*\w+.{0,1})(,.+,)*\s(raise|land|grab|step|receive|get|collect|pull|close|secure|take|tap|score|snare|snag|nab|win)", re.IGNORECASE), 1),
    # 2. The one or two words right after a generic descriptor such as "startup".
    (re.compile(r"(marketplace|firm|startup|company|competitor|platform|network|service|leader|house|developer|retailer)\s(\w+\s*\w*)", re.IGNORECASE), 2),
    # 3. The first one or two words of the title when followed by a word ending in "s".
    (re.compile(r"^(\w+\s\w+|\w+)\s\w+s\s", re.IGNORECASE), 1),
    # 4. The one or two words directly in front of a comma.
    (re.compile(r"(\w+|\w+\s\w+),\s", re.IGNORECASE), 1),
)


def get_company(item):
    """Guess the company name from an article title.

    The patterns in ``_COMPANY_PATTERNS`` are searched in order and the first
    one that matches supplies the result.

    Parameters
    ----------
    item : str
        Article title. Any non-string value (for example the float NaN pandas
        uses for a missing cell) is treated as "no name found".

    Returns
    -------
    str
        The captured text, or the sentinel string "NA" when nothing matches.
    """
    # A missing title would otherwise make re raise TypeError in the middle of .apply().
    if not isinstance(item, str):
        return "NA"
    for pattern, group_index in _COMPANY_PATTERNS:
        matched = pattern.search(item)
        if matched:
            return matched.group(group_index)
    return "NA"

# first-pass company guess for every title
data["Temp_company"] = data["title"].apply(get_company)

In [4]:
# get the ".com" company name (overrides the regex-based guess when present)
def get_dot_com(item):
    """Return the first "<name>.com" token found in a title.

    Parameters
    ----------
    item : str
        Article title. Non-string values (e.g. NaN for a missing cell) yield None.

    Returns
    -------
    str or None
        The matched token such as "Box.com", or None when the title has none.
    """
    if not isinstance(item, str):
        return None
    # The trailing \b keeps ".com" from matching as a prefix of a longer word
    # (e.g. "our.community" must not produce "our.com").
    matched = re.search(r'\w+\.com\b', item)
    return matched.group(0) if matched else None
# ".com" name per title; stays empty (None) where the title has no such token
data["Dot.company"] = data["title"].apply(get_dot_com)

In [5]:
# Build the "Company" column: take the "Dot.company" value where one exists and
# fall back to "Temp_company" for the rows where "Dot.company" is missing.
data["Company"] = data["Dot.company"].fillna(data["Temp_company"])

## Clean up "Company" column

In [6]:
# step 1: remove the leading "s " in names such as "s Company Name"
# step 2: validate companies with very short names

In [7]:
# number of rows for which a company name was extracted
int((data["Company"] != "NA").sum())

621

In [8]:
# keep only rows where a company name was found (i.e. drop rows with Company == "NA")
data = data.loc[data["Company"].ne("NA")]

In [9]:
# the two helper columns were already combined into "Company"; discard them
data = data.drop(columns=['Temp_company', 'Dot.company'])

In [10]:
# Strip generic descriptor words, together with the whitespace on either side of them.
# The pattern has no word boundaries, so these words are also removed when they occur
# inside a longer word.
regex = re.compile(r"\s*(marketplace|firm|startup|company|service|maker|network|leader|solution|competitor|platform|developer|retailer)\s*", re.IGNORECASE)
data["Company"] = data["Company"].apply(lambda name: regex.sub('', name))

In [11]:
# drop a stray leading "s" + whitespace, or a single leading whitespace character
data["Company"] = data["Company"].apply(lambda name: re.sub(r'^s\s|^\s', '', name))

In [12]:
# flag company names shorter than 3 characters — these are likely extraction errors
mask = data["Company"].map(len) < 3
data.loc[mask].shape

(17, 7)

In [13]:
# inspect the rows flagged as having a very short company name
data.loc[mask]

Unnamed: 0,title,link,excerpt,published_at,funding_round,money_raised,Company
83,"Harry?s Raises $75.6 Million In Series C, Valu...",https://techcrunch.com/2015/07/07/harrys-raise...,"Harry's, the direct-to-consumer mens' razor co...",2015/07/07,Series C,$75.6 M,s
107,GE spin-out SmartAssist.io raises $5M Series A...,https://techcrunch.com/2017/08/03/ge-spin-out-...,"Last November, GE acquired the AI-centric star...",2017/08/03,Series A,$5 M,io
119,Care/of raises $12 million in Series A for its...,https://techcrunch.com/2017/07/12/careof-raise...,"Care/of, a year-old, New York-based company th...",2017/07/12,Series A,$12 M,of
191,"Don?t Launch A Company, Launch A Fund (Or The ...",https://techcrunch.com/2011/11/04/dont-launch-...,Editor?s note: This guest post is authored by ...,2011/11/04,Series A,unknown,A
276,Callstats.io raises $3M Series A round for its...,https://techcrunch.com/2016/09/06/callstats-io...,"WebRTC, a relatively new web standard that all...",2016/09/06,Series A,$3 M,io
290,Frame.io raises a $10M Series A with Jared Let...,https://techcrunch.com/2016/09/14/frame-io-rai...,Cloud-based collaboration tool Frame.io has ra...,2016/09/14,Series A,$10 M,io
307,Panoply.io raises $7M Series A for its data an...,https://techcrunch.com/2016/08/03/panoply-io-r...,"Panoply.io, a startup that wants to make setti...",2016/08/03,Series A,$7 M,io
359,Iron.io Raises $8M Series A Round For Its Ente...,https://techcrunch.com/2015/09/16/iron-io-rais...,Iron.io today announced that it has raised a $...,2015/09/16,Series A,$8 M,io
392,"Harryâ??s Raises $75.6 Million In Series C, Va...",https://techcrunch.com/2015/07/07/harrys-raise...,"Harry's, the direct-to-consumer mens' razor co...",2015/07/07,Series C,$75.6 M,s
437,"Donâ??t Launch A Company, Launch A Fund (Or Th...",https://techcrunch.com/2011/11/04/dont-launch-...,Editorâ??s note: This guest post is authored b...,2011/11/04,Series A,unknown,A


In [14]:
def get_company_name_for_io(row):
    """Repair a too-short company name using a "<name>.io" token from the title.

    `row` must support lookup of 'Company' and 'title'. Names of three or more
    characters are returned unchanged; shorter names are replaced by the first
    "<name>.io" match in the title, or kept as-is when the title has none.
    """
    current = row['Company']
    if len(current) >= 3:
        return current
    io_match = re.search(r'\w+\.io', row['title'])
    return io_match.group(0) if io_match else current
    
# row-wise: the fix needs both the current name and the title
data["Company"] = data.apply(get_company_name_for_io, axis=1)

In [15]:
# Renumber the rows 0..n-1 (the old index is discarded). The manual fixes and drops
# further down address rows by these new labels.
data = data.reset_index(drop=True)

In [16]:
# re-check: names still shorter than 3 characters after the ".io" repair
mask = data["Company"].map(len) < 3
data.loc[mask].shape

(9, 7)

In [17]:
# inspect the remaining short names; these are handled by hand below
data.loc[mask]

Unnamed: 0,title,link,excerpt,published_at,funding_round,money_raised,Company
83,"Harry?s Raises $75.6 Million In Series C, Valu...",https://techcrunch.com/2015/07/07/harrys-raise...,"Harry's, the direct-to-consumer mens' razor co...",2015/07/07,Series C,$75.6 M,s
118,Care/of raises $12 million in Series A for its...,https://techcrunch.com/2017/07/12/careof-raise...,"Care/of, a year-old, New York-based company th...",2017/07/12,Series A,$12 M,of
185,"Don?t Launch A Company, Launch A Fund (Or The ...",https://techcrunch.com/2011/11/04/dont-launch-...,Editor?s note: This guest post is authored by ...,2011/11/04,Series A,unknown,A
379,"Harryâ??s Raises $75.6 Million In Series C, Va...",https://techcrunch.com/2015/07/07/harrys-raise...,"Harry's, the direct-to-consumer mens' razor co...",2015/07/07,Series C,$75.6 M,s
418,"Donâ??t Launch A Company, Launch A Fund (Or Th...",https://techcrunch.com/2011/11/04/dont-launch-...,Editorâ??s note: This guest post is authored b...,2011/11/04,Series A,unknown,A
458,French online restaurant FoodChÃ©ri raises â?¬...,https://techcrunch.com/2016/09/19/french-onlin...,"French food delivery startup FoodChÃ©ri, which...",2016/09/19,Series A,¬6 M,ri
486,Virtual nurse app Sense.ly raises $8 million f...,https://techcrunch.com/2017/02/14/virtual-nurs...,San Francisco startup Sense.ly has raised $8 m...,2017/02/14,Series B,$8 M,ly
505,Price f(x) raises â?¬4M Series A for pricing o...,https://techcrunch.com/2016/12/20/price-fx/,"Price f(x), a pricing optimization SaaS, has r...",2016/12/20,Series A,¬4 M,x)
534,Upload VR startup raised $4.5 million Series A...,https://techcrunch.com/2017/05/16/upload-vr-st...,"Upload, which shiftedÂ the ""UploadVR"" brandÂ t...",2017/05/16,Series A,$4.5 M,VR


In [18]:
# Hand-correct names the regexes truncated. Labels refer to the index as renumbered
# by reset_index above.
# Use a single .loc[row, column] assignment: `data.loc[row].Company = ...` is chained
# assignment, which writes to a temporary row Series and is not guaranteed to reach
# `data` (under pandas Copy-on-Write it never does).
# NOTE(review): row 486 is renamed here but dropped by the next cell — confirm which
# of the two is intended.
data.loc[83, "Company"] = "Harry's"
data.loc[118, "Company"] = "Care/of"
data.loc[486, "Company"] = "Sense.ly"
data.loc[505, "Company"] = "Price f(x)"
data.loc[534, "Company"] = "Upload"

In [19]:
# Remove rows by position (positions equal labels because the index was reset above):
#   185, 418 – "Don't Launch A Company ..." titles, extracted name "A"
#   379      – second copy of the Harry's title already present at row 83
#   458      – FoodChéri title, extracted name "ri"
#   486      – Sense.ly title
# NOTE(review): row 486 was given the name "Sense.ly" in the preceding cell and is
# nevertheless dropped here — confirm this is intended.
data = data.drop(index=data.index[[185, 379, 418, 458, 486]])

In [20]:
# final check: no company name shorter than 3 characters should remain
mask = data["Company"].map(len) < 3
data.loc[mask].shape

(0, 7)

In [21]:
# write the cleaned table to CSV without the row index
data.to_csv("article_after_processing2.csv", index = False)

In [22]:
# (rows, columns) of the cleaned table
data.shape

(616, 7)