In [1]:
import pandas as pd
df = pd.read_pickle("./preprocessed_data.pkl")

In [2]:
df.head()

Unnamed: 0,industry_id,len_description,len_html2text,description,html2text
0,0,598,3254,webhostcom offers budget and unlimited web hos...,reliable web hosting services from webhosth fo...
1,1,323,18637,we are a direct cash advance provider with fun...,abc merchant funding advanced business capital...
2,2,681,11663,able investigation enforcements are an establi...,able investigations bristol based enforcement ...
3,3,1125,1067,for over two decades abm has been known for it...,abm group of companyhome site map client login...
4,0,160,12,additionally lets you easily create the best p...,additionally


In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=32)
for train_index, test_index in sss.split(df, df["industry_id"]):
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = df["industry_id"][train_index], df["industry_id"][test_index]

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
import Stemmer
english_stemmer = Stemmer.Stemmer('en')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

model_description = Pipeline([
    ('vect', StemmedTfidfVectorizer(stop_words='english', ngram_range=(1,1))),
    ('clf', LinearSVC(C=2))
])

In [8]:
model_description.fit(X_train['description'], y_train)

Pipeline(memory=None,
     steps=[('vect', StemmedTfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), norm='l2', preprocessor=None,
...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [9]:
model_html2text = Pipeline([
    ('vect', StemmedTfidfVectorizer(stop_words='english', ngram_range=(1,1))),
    ('clf', LinearSVC(C=2))
])

In [10]:
model_html2text.fit(X_train['html2text'], y_train)

Pipeline(memory=None,
     steps=[('vect', StemmedTfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), norm='l2', preprocessor=None,
...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [11]:
X_train_stacked = pd.DataFrame()
X_train_stacked['len_description'] = X_train['len_description']
X_train_stacked['len_html2text'] = X_train['len_html2text']
X_train_stacked['pred_description'] = model_description.predict(X_train['description'])
X_train_stacked['pred_html2text'] = model_html2text.predict(X_train['html2text'])

In [12]:
from xgboost import XGBClassifier
model_stacked = XGBClassifier(max_depth=3, n_estimators=50, learning_rate=0.1)

In [13]:
model_stacked.fit(X_train_stacked, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
pred_train = model_stacked.predict(X_train_stacked)

  if diff:


In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred_train, y_train)

array([[1165,    0,    0, ...,    0,    0,    0],
       [   0,  844,    0, ...,    0,    0,    0],
       [   0,    0,  295, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,  195,    0,    0],
       [   0,    0,    0, ...,    0,  178,    0],
       [   0,    0,    0, ...,    0,    0,  159]], dtype=int64)

In [16]:
accuracy_score(y_train, pred_train)

0.9977331934905437

In [17]:
from sklearn import metrics
print(metrics.classification_report(y_train, pred_train))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1165
          1       1.00      1.00      1.00       844
          2       0.99      1.00      1.00       295
          3       1.00      0.99      1.00      2329
          4       1.00      1.00      1.00       424
          5       1.00      1.00      1.00       395
          6       1.00      1.00      1.00       441
          7       1.00      1.00      1.00       298
          8       1.00      1.00      1.00       461
          9       1.00      1.00      1.00       262
         10       1.00      1.00      1.00       205
         11       1.00      1.00      1.00      1098
         12       1.00      1.00      1.00       464
         13       1.00      1.00      1.00       442
         14       1.00      1.00      1.00       586
         15       1.00      1.00      1.00       756
         16       1.00      0.99      0.99      1932
         17       1.00      1.00      1.00   

In [18]:
X_test_stacked = pd.DataFrame()
X_test_stacked['len_description'] = X_test['len_description']
X_test_stacked['len_html2text'] = X_test['len_html2text']
X_test_stacked['pred_description'] = model_description.predict(X_test['description'])
X_test_stacked['pred_html2text'] = model_html2text.predict(X_test['html2text'])
pred_test = model_stacked.predict(X_test_stacked)

  if diff:


In [19]:
confusion_matrix(pred_test, y_test)

array([[ 91,   3,   1, ...,   0,   0,   1],
       [  5, 150,   2, ...,   0,   1,   0],
       [  0,   0,  20, ...,   0,   0,   0],
       ...,
       [  0,   1,   0, ...,  20,   1,   0],
       [  0,   0,   1, ...,   0,  30,   0],
       [  0,   0,   0, ...,   0,   0,  19]], dtype=int64)

In [20]:
accuracy_score(y_test, pred_test)

0.5259810554803789

In [21]:
print(metrics.classification_report(y_test, pred_test))

             precision    recall  f1-score   support

          0       0.28      0.31      0.30       292
          1       0.64      0.71      0.67       211
          2       0.50      0.27      0.35        74
          3       0.46      0.57      0.51       582
          4       0.54      0.49      0.51       106
          5       0.58      0.65      0.61        99
          6       0.46      0.42      0.44       110
          7       0.60      0.50      0.54        74
          8       0.73      0.67      0.70       115
          9       0.71      0.75      0.73        65
         10       0.52      0.29      0.38        51
         11       0.56      0.63      0.59       275
         12       0.69      0.81      0.75       116
         13       0.58      0.59      0.58       110
         14       0.43      0.47      0.45       146
         15       0.34      0.26      0.29       189
         16       0.52      0.59      0.55       483
         17       0.52      0.47      0.50   

In [22]:
print(metrics.classification_report(y_train, X_train_stacked['pred_description']))
print(metrics.classification_report(y_train, X_train_stacked['pred_html2text']))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1165
          1       1.00      0.99      0.99       844
          2       0.99      1.00      0.99       295
          3       0.96      0.99      0.97      2329
          4       1.00      1.00      1.00       424
          5       0.99      0.99      0.99       395
          6       1.00      0.99      1.00       441
          7       1.00      1.00      1.00       298
          8       1.00      0.99      0.99       461
          9       0.98      0.99      0.99       262
         10       1.00      1.00      1.00       205
         11       0.98      0.99      0.99      1098
         12       1.00      0.99      1.00       464
         13       1.00      1.00      1.00       442
         14       1.00      0.99      1.00       586
         15       1.00      0.99      1.00       756
         16       0.99      0.99      0.99      1932
         17       1.00      0.99      0.99   

In [23]:
print(metrics.classification_report(y_test, X_test_stacked['pred_description']))
print(metrics.classification_report(y_test, X_test_stacked['pred_html2text']))

             precision    recall  f1-score   support

          0       0.29      0.28      0.29       292
          1       0.64      0.71      0.68       211
          2       0.50      0.27      0.35        74
          3       0.44      0.59      0.50       582
          4       0.54      0.49      0.51       106
          5       0.57      0.65      0.61        99
          6       0.47      0.42      0.44       110
          7       0.60      0.50      0.54        74
          8       0.72      0.66      0.69       115
          9       0.71      0.78      0.74        65
         10       0.52      0.29      0.38        51
         11       0.55      0.66      0.60       275
         12       0.69      0.79      0.74       116
         13       0.58      0.59      0.58       110
         14       0.44      0.47      0.45       146
         15       0.34      0.26      0.29       189
         16       0.52      0.63      0.57       483
         17       0.51      0.46      0.49   