In [1]:
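# Consolidate the fine-grained Industry labels into a smaller set of broader groups.
# Create binary keyword features (spc_* columns) from each company's Specialties and Description.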

In [2]:
import re
import pandas as pd
# Load the article table (presumably the output of the previous processing step).
# NOTE(review): the table displayed near the end of this notebook shows mojibake
# in some descriptions (e.g. the Finanzcheck row), which suggests part of the text
# is UTF-8 being decoded as Latin-1 -- confirm 'iso-8859-1' is the right encoding.
data = pd.read_csv(
    "article_after_processing8.csv",
    encoding='iso-8859-1',
)

In [3]:
# Distinct raw industry labels present in the data.
data["Industry"].unique()

array(['Financial Services', 'Information Technology and Services',
       'Human Resources', 'Computer Software',
       'Logistics and Supply Chain', 'Internet',
       'Computer & Network Security', 'Food & Beverages',
       'Marketing and Advertising', 'Medical Devices', 'E-Learning',
       'Consumer Services', 'Sports', 'Consumer Electronics',
       'Computer Hardware', 'Education Management', 'Apparel & Fashion',
       'Entertainment', 'Consumer Goods', 'Biotechnology',
       'Management Consulting', 'Real Estate', 'Fund-Raising',
       'Commercial Real Estate', 'Food Production', 'Online Media',
       'Mechanical or Industrial Engineering', 'Renewables & Environment',
       'Farming', 'Electrical/Electronic Manufacturing',
       'Leisure, Travel & Tourism', 'Sporting Goods', 'Retail',
       'Semiconductors', 'Cosmetics', 'Staffing and Recruiting',
       'Insurance', 'Telecommunications', 'Health, Wellness and Fitness',
       'Textiles', 'Nanotechnology', 'Luxury Good

In [4]:
# Number of non-null titles per raw industry label.
data.groupby("Industry")["title"].count()

Industry
Apparel & Fashion                        2
Biotechnology                            3
Commercial Real Estate                   1
Computer & Network Security              4
Computer Hardware                        2
Computer Software                       48
Consumer Electronics                     4
Consumer Goods                           2
Consumer Services                        2
Cosmetics                                1
E-Learning                               3
Education Management                     1
Electrical/Electronic Manufacturing      1
Entertainment                            4
Farming                                  1
Financial Services                      13
Food & Beverages                         2
Food Production                          2
Fund-Raising                             1
Health, Wellness and Fitness             3
Human Resources                          2
Information Technology and Services     25
Insurance                                1
In

In [5]:
def rewrite_industry(industry):
    """Map a raw industry label onto its consolidated group.

    Parameters
    ----------
    industry : object
        The raw industry label (normally a string).

    Returns
    -------
    object
        The consolidated group label when ``industry`` belongs to one of the
        groups below; otherwise ``industry`` itself, unchanged.
    """
    # (consolidated label, raw labels folded into it); checked in this order.
    consolidation_rules = (
        ('Consumers Goods & Services', (
            'Apparel & Fashion',
            'Consumer Goods',
            'Consumer Services',
            'Cosmetics',
            'Luxury Goods & Jewelry',
            'Retail',
            'Leisure, Travel & Tourism',
            'Sporting Goods',
            'Textiles',
        )),
        ('Computer & Network Security & Hardware', (
            'Computer Hardware',
            'Computer & Network Security',
        )),
        ('Education', (
            'E-Learning',
            'Education Management',
        )),
        ('Food Business', (
            'Farming',
            'Food & Beverages',
            'Food Production',
            'Restaurants',
        )),
        ('Financial Services', (
            'Insurance',
            'Fund-Raising',
            'Financial Services',
        )),
        ('Internet', (
            'Internet',
            'Online Media',
        )),
        ('Real Estate', (
            'Commercial Real Estate',
            'Real Estate',
        )),
        ('Healthcare_health', (
            'Health, Wellness and Fitness',
            'Medical Devices',
            'Sports',
        )),
        ('Human Resources', (
            'Human Resources',
            'Staffing and Recruiting',
        )),
        ("Infrastructure", (
            'Telecommunications',
            'Renewables & Environment',
            'Logistics and Supply Chain',
        )),
        ('Niche', (
            'Semiconductors',
            'Nanotechnology',
            'Biotechnology',
            'Management Consulting',
            'Electrical/Electronic Manufacturing',
            'Mechanical or Industrial Engineering',
        )),
    )

    for consolidated_label, raw_labels in consolidation_rules:
        if industry in raw_labels:
            return consolidated_label

    # Labels that belong to no group are passed through untouched.
    return industry
    
# Store the consolidated industry label next to the original Industry column.
data["Industry_consolidated"] = data["Industry"].apply(rewrite_industry)

In [6]:
# Number of non-null titles per consolidated industry group.
data.groupby("Industry_consolidated")["title"].count()

Industry_consolidated
Computer & Network Security & Hardware     6
Computer Software                         48
Consumer Electronics                       4
Consumers Goods & Services                14
Education                                  4
Entertainment                              4
Financial Services                        15
Food Business                              5
Healthcare_health                          6
Human Resources                            4
Information Technology and Services       25
Infrastructure                            10
Internet                                  68
Marketing and Advertising                  6
Niche                                      8
Real Estate                                5
Name: title, dtype: int64

In [7]:
def check_keywords(keyword_list, row):
    """Flag whether any keyword pattern occurs in a row's text fields.

    The keywords are joined with ``|`` into one regular expression and matched
    case-insensitively. Keywords are therefore regex fragments, not literal
    strings, and are deliberately not escaped.

    Parameters
    ----------
    keyword_list : iterable of str
        Regex fragments to look for. Empty strings are ignored.
    row : mapping
        Anything indexable by ``"Specialties"`` and ``"Description"``
        (e.g. a DataFrame row).

    Returns
    -------
    int
        1 if a keyword matches in ``Specialties`` or, failing that, in
        ``Description``; 0 otherwise. Also 0 when there are no keywords.
    """
    # Drop empty fragments: an empty alternative (or an empty pattern overall)
    # matches every string and would flag every row.
    keywords = [keyword for keyword in keyword_list if keyword]
    if not keywords:
        return 0

    regex = re.compile("|".join(keywords), flags=re.I)

    # Specialties is consulted first, Description only as a fallback.
    for column in ("Specialties", "Description"):
        value = row[column]
        # Missing cells arrive as float NaN (or None); skip them instead of
        # searching the literal text "nan" / "None".
        if value is None or isinstance(value, float):
            continue
        if regex.search(str(value)):
            return 1
    return 0


In [8]:
def create_columns(dct, dataframe):
    """Add one ``spc_<key>`` indicator column to ``dataframe`` per entry of ``dct``.

    Parameters
    ----------
    dct : dict
        Maps a feature name to the keyword list passed to ``check_keywords``.
    dataframe : pandas.DataFrame
        Frame to extend; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the new columns added.
    """
    for feature_name, patterns in dct.items():

        def flag_row(record, patterns=patterns):
            # Evaluate this feature's keywords against a single row.
            return check_keywords(keyword_list=patterns, row=record)

        dataframe["spc_{}".format(feature_name)] = dataframe.apply(flag_row, axis=1)

    return dataframe

In [9]:
# Feature name -> keyword patterns. Each key becomes a "spc_<key>" column
# (see create_columns). The patterns of one entry are joined with "|" and
# matched case-insensitively as a regular expression (see check_keywords), so
# entries are regex fragments and match anywhere inside a word.
# The insertion order matters: later cells slice the frame from
# "spc_Food Business" to "spc_robotics_manufacturing".
key_words_dict = {
    # Features that mirror consolidated industry groups.
    "Food Business": ["restaurant", "farm", "greenhouse", "Gastronomie"],
    "Education": ["Online Learning", "Education", "Tutor"],
    "Financial Services": ["payment", "loan", "financ", "fundraising",
                           "investing", "lending"],
    "Healthcare_health": ["healthcare", "medical", "genetic", "therapy", "disease",
                          "fitness", "wellness", "welfare", "wearable", "gym"],
    "Human Resources": ["recruit", "workforce", "Human Resource"],
    "Logistics and Supply Chain": ["delivery", "drone",
                                   "transportation", "supply chain"],
    "Entertainment": ["entertainment", "game"],
    "Computer & Network Security & Hardware": ["storage", "backup", "recovery",
                                               "privacy"],
    "Real_estate": ["Real Estate"],
    "Marketing and Advertising": ["marketing", "advertising", "advertisement"],

    # Cross-industry topic features.
    "commerce": ["eCommerce", "Commerce", "Retail"],
    "mobile": ["mobile"],
    # Raw string: "\s" is not a valid escape in a normal string literal.
    # NOTE(review): r"app\s" needs whitespace right after "app", so "app,"
    # or a trailing "app" does not match -- confirm this is intended.
    "app": ["mobile app", r"app\s"],
    "analysis": ["analytics", "analysis"],
    "developer": ["developer"],
    "security": ["fraud", "detection", "protection"],
    "social": ["Social Media"],
    "ds": ["artificial intelligence", "machine learning",
           "deep learning", "big data"],
    "travel": ["Travel"],
    "booking_ticketing": ["booking", "ticket"],
    "Apparel": ["fashion", "clothing", "shoes", "Sporting Goods"],
    "cloud": ["cloud"],
    # NOTE(review): matched case-insensitively without word boundaries, so
    # "API" also hits words such as "capital" or "rapid" -- confirm.
    "API": ["API"],
    "device": ["device"],
    "design": ["design"],
    "enterprise": ["enterprise", "productivity", "collaboration"],
    "robotics_manufacturing": ["Manufact", "robotics", "3d"]
}

In [10]:
# Add one spc_* indicator column per entry of key_words_dict.
data = create_columns(dct=key_words_dict, dataframe=data)

In [11]:
# Number of rows flagged for each keyword feature (one total per spc_* column).
# The label slice is inclusive on both ends and relies on the spc_* columns
# being contiguous, in key_words_dict order.
data.loc[:, "spc_Food Business":"spc_robotics_manufacturing"].sum(axis=0)

spc_Food Business                              8
spc_Education                                 10
spc_Financial Services                        36
spc_Healthcare_health                         24
spc_Human Resources                           10
spc_Logistics and Supply Chain                24
spc_Entertainment                             12
spc_Computer & Network Security & Hardware    10
spc_Real_estate                                7
spc_Marketing and Advertising                 34
spc_commerce                                  40
spc_mobile                                    63
spc_app                                       33
spc_analysis                                  51
spc_developer                                 15
spc_security                                  16
spc_social                                    14
spc_ds                                        39
spc_travel                                     6
spc_booking_ticketing                         11
spc_Apparel         

In [12]:
# Per-row count of keyword features that fired; 0 means no keyword matched.
# The label slice is inclusive on both ends and relies on the spc_* columns
# being contiguous, in key_words_dict order.
data["spc_total"] = data.loc[
    :, "spc_Food Business":"spc_robotics_manufacturing"
].sum(axis=1)

In [13]:
# Rows for which none of the keyword features fired.
mask = data["spc_total"].eq(0)
data.loc[mask, ["Company", "Specialties", "Description"]]

Unnamed: 0,Company,Specialties,Description
27,Magic Leap,,You already own the worldâ??s most amazing com...
38,Spiceworks,"IT, software, freeware, B2B, SMB, free busines...","We provide millions of IT pros with the tools,..."
51,Snapchat,,Experience a unique way to share life with fri...
95,Managed By Q,"Office Cleaning, Office Maintenance, Office Su...",Managed by Q is the platform for office manage...
117,Finanzcheck,"Kreditvergleich, Finanzdienstleistung","FINANZCHECK.de ist das fÃ¼hrende, unabhÃ¤ngige..."
159,Giphy,,"Animated GIFs have existed for decades, but th..."


In [14]:
# Write the frame, now carrying Industry_consolidated, the spc_* features and
# spc_total, to a new CSV file.
# NOTE(review): the input was read with encoding='iso-8859-1', but this writes
# with pandas' default encoding (utf-8) -- confirm the next step reads it as utf-8.
data.to_csv(
    "article_after_processing9.csv",
    index=False,
)