In [1]:
import pandas as pd
df = pd.read_pickle("./preprocessed_data.pkl")

In [2]:
df['text'] = df['description'].astype(str) + ' ' + df['html2text']
df.head()

Unnamed: 0,industry_id,len_description,len_html2text,description,html2text,text
0,0,598,3254,webhostcom offers budget and unlimited web hos...,reliable web hosting services from webhosth fo...,webhostcom offers budget and unlimited web hos...
1,1,323,18637,we are a direct cash advance provider with fun...,abc merchant funding advanced business capital...,we are a direct cash advance provider with fun...
2,2,681,11663,able investigation enforcements are an establi...,able investigations bristol based enforcement ...,able investigation enforcements are an establi...
3,3,1125,1067,for over two decades abm has been known for it...,abm group of companyhome site map client login...,for over two decades abm has been known for it...
4,0,160,12,additionally lets you easily create the best p...,additionally,additionally lets you easily create the best p...


In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=32)
for train_index, test_index in sss.split(df['text'], df["industry_id"]):
    X_train, X_test = df['text'][train_index], df['text'][test_index]
    y_train, y_test = df["industry_id"][train_index], df["industry_id"][test_index]

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
import Stemmer
english_stemmer = Stemmer.Stemmer('en')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

model_description = Pipeline([
    ('vect', StemmedTfidfVectorizer(stop_words='english', ngram_range=(1,1))),
    ('clf', LinearSVC(C=0.5))
])

In [6]:
model_description.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', StemmedTfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), norm='l2', preprocessor=None,
...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [7]:
pred_train = model_description.predict(X_train)
pred_test = model_description.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred_train, y_train)

array([[1119,    0,    0, ...,    0,    0,    1],
       [   2,  819,    0, ...,    1,    0,    0],
       [   0,    0,  274, ...,    0,    0,    0],
       ...,
       [   0,    0,    1, ...,  191,    0,    0],
       [   0,    0,    1, ...,    0,  176,    0],
       [   0,    0,    0, ...,    0,    0,  151]], dtype=int64)

In [9]:
accuracy_score(y_train, pred_train)

0.972256994958893

In [10]:
confusion_matrix(pred_test, y_test)

array([[ 81,   3,   0, ...,   0,   0,   1],
       [  6, 154,   2, ...,   0,   1,   0],
       [  0,   0,  19, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,  18,   0,   0],
       [  1,   0,   1, ...,   0,  26,   0],
       [  0,   0,   0, ...,   0,   0,  15]], dtype=int64)

In [11]:
accuracy_score(y_test, pred_test)

0.506765899864682

In [12]:
from sklearn import metrics
print(metrics.classification_report(y_train, pred_train))
print(metrics.classification_report(y_test, pred_test))

             precision    recall  f1-score   support

          0       0.98      0.96      0.97      1165
          1       0.97      0.97      0.97       844
          2       1.00      0.93      0.96       295
          3       0.94      0.97      0.96      2329
          4       1.00      0.97      0.98       424
          5       0.98      0.98      0.98       395
          6       0.99      0.96      0.98       441
          7       0.82      0.97      0.89       298
          8       0.98      0.96      0.97       461
          9       0.98      0.98      0.98       262
         10       0.99      0.97      0.98       205
         11       0.95      0.99      0.97      1098
         12       0.95      0.99      0.97       464
         13       0.98      0.98      0.98       442
         14       0.99      0.98      0.99       586
         15       0.98      0.94      0.96       756
         16       0.93      0.98      0.96      1932
         17       0.99      0.96      0.98   