# Industry sector prediction - A basic NLP example

Goal : Given the official name of a company, predict the sector of the company

This is a multiclass classification, that is based on a very short amount of text for each company

In [129]:
import pandas as pd
import numpy as np
import random

In [130]:
from nltk.corpus import stopwords
import nltk
import re

## Imbalance issues - Resampling

Each of the industry labels are very imbalanced, which will lead to imbalance prediction accuracies. We choose to deal with this issue by resampling so that each label gets represented 100 times

In [131]:
df = pd.read_csv("private us companies.csv")

In [132]:
df = df[["Company","Industry"]]

In [133]:
df.head()

Unnamed: 0,Company,Industry
0,A - E Employees Credit Union,Diversified Financial Services
1,"A & A Contract Services, Inc.",Professional Services
2,"A & A Express, Inc.",Road and Rail
3,"A & A Fertilizer, Ltd.",Chemicals
4,A & A Food Service. Inc.,Distributors


In [134]:
count = df.groupby('Industry').count()
under = count[count.Company>=100]
over = count[count.Company< 100]

under1 = df.loc[df.Industry.isin(under.index)]
over1 = df.loc[df.Industry.isin(over.index)]

In [135]:
len(under1.Industry.value_counts()),len(over1.Industry.value_counts())

(86, 13)

In [136]:
under2 = under1.groupby('Industry', as_index=False).apply(lambda obj: obj.loc[np.random.choice(obj.index, 100, replace=False),:])

In [137]:
over2 = over1.groupby('Industry', as_index=False).apply(lambda obj: obj.loc[np.random.choice(obj.index, 100, replace=True),:])

In [138]:
df = pd.concat([under2,over2])

## Text processing

Company names require attentive preprocessing because there might be words that do not exist, special characters etc. The function below does this preprocessing, and is then applied to the whole text column. The most important preprocessing done here is the stemming, that allows us to extract the root of each word.

In [139]:
def genCorpus(theText):
    #set dictionaries
    stopWords = set(stopwords.words('english'))
    theStemmer = nltk.stem.porter.PorterStemmer() #Martin Porters celebrated stemming algorithm
    
    #pre-processing
    theText = theText.split()
    tokens = [token.lower() for token in theText] #ensure everything is lower case
    tokens = [re.sub(r'[^a-zA-Z0-9]+', ' ',token) for token in tokens] #remove special characters but leave word in tact
    tokens = [token for token in tokens if token.lower().isalpha()] #ensure everything is a letter
    #tokens = [word for word in tokens if word not in stopWords] #rid of stop words
    tokens = [theStemmer.stem(word) for word in tokens] #stem words uing porter stemming algorithm
    tokens = " ".join(tokens) #need to pass string seperated by spaces       

    return tokens

In [140]:
df.Company = df.Company.apply(genCorpus)

In [141]:
df.head(10)

Unnamed: 0,Unnamed: 1,Company,Industry
0,21184,arcadia aerospac llc,Aerospace and Defense
0,164026,lake region tubular,Aerospace and Defense
0,247256,rotorcraft leas,Aerospace and Defense
0,119663,global analyt inform technolog,Aerospace and Defense
0,254335,seakr incorpor,Aerospace and Defense
0,136611,honeycomb compani of,Aerospace and Defense
0,211155,ontic engin and,Aerospace and Defense
0,246314,roger helicopt,Aerospace and Defense
0,267095,spawar system center atlant,Aerospace and Defense
0,8918,aircraft dynam corpor,Aerospace and Defense


In [142]:
names = list(df.Company)

## Vectorizing the names

First we vectorize the bag of words and apply the tfidf methodology to it. Then, we reduce dimensionality with a PCA

In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [144]:
vectorizer = TfidfVectorizer(max_features=1000,ngram_range=(1,1))
tdm = pd.DataFrame(vectorizer.fit_transform(names).toarray())
tdm.columns = vectorizer.get_feature_names()

In [145]:
pca = decomposition.PCA(n_components=.95)
pca.fit(tdm)
reducedTDM = pd.DataFrame(pca.transform(tdm)) #reduced tdm distance matrix

## Fitting a classifier

With these features, we can then train a random forest classifier to predict the industry labels

In [146]:
model = RandomForestClassifier()

In [147]:
model.fit(reducedTDM,df.Industry)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [148]:
y_pred = model.predict(reducedTDM)

In [149]:
f1_score(df.Industry,y_pred, average='weighted')

0.60484897220237965

Our classification yields a f1 score of **0.60**, which is actually quite satisfactory given how little data we are using for the prediction

In [150]:
entry = 'General Electrics'

In [151]:
model.predict(pd.DataFrame(pca.transform(pd.DataFrame(vectorizer.transform([genCorpus(entry)]).toarray()))))[0]

'Electric Utilities'