In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load the raw export and blank out missing values in one pass:
# NaNs become empty strings, and cells whose whole value is '?' are blanked too
# (DataFrame.replace with a scalar matches entire cell values, not substrings).
input_df = (
    pd.read_csv('combined_data.csv', encoding='unicode_escape')
    .fillna('')
    .replace('?', '')
)

In [3]:
# The three label columns followed by the free-text column used as model input.
FEATURE_COLUMNS = ['Type', 'Product', 'industry', 'product_info']
feature_df = input_df[FEATURE_COLUMNS]

In [4]:
# Hold out 10% of the rows for validation; random_state pins the shuffle.
# Despite the x_ prefix, both frames still carry the label columns at this point.
x_train, x_test = train_test_split(
    feature_df,
    test_size=0.1,
    random_state=50,
)

In [5]:
from bs4 import BeautifulSoup
import regex
# Column(s) holding the free text to model, and the label columns to predict.
data_columns = ['product_info']
Y_columns = ['Type','Product','industry']


def preprocess_dataframe(input_df,data_columns,Y_columns):
    """Build a cleaned modelling frame: the label columns plus a single 'text' column.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing both the text columns and the label columns.
    data_columns : list of str
        Columns whose values are stringified and joined with single spaces.
    Y_columns : list of str
        Label columns copied unchanged into the result.

    Returns
    -------
    pandas.DataFrame
        ``Y_columns`` plus ``'text'``. The text has markup removed with
        BeautifulSoup's ``html.parser`` and every run of non-word characters,
        digits and underscores collapsed to one space. ``input_df`` is not
        modified.
    """
    # Explicit copy: adding 'text' below must never write through to (or raise a
    # SettingWithCopyWarning about) the caller's frame.
    df = input_df.loc[:, Y_columns].copy()

    # One document per row: stringify each text column and join with spaces.
    joined = input_df[data_columns].apply(lambda row: ' '.join(row.map(str)), axis=1)
    # Strip HTML tags / entities, keeping only the visible text.
    stripped = joined.apply(lambda doc: BeautifulSoup(str(doc), 'html.parser').get_text())

    # Raw string literal: '\W' and '\d' inside a plain literal are invalid escape
    # sequences (a warning today, an error in a future Python).
    pattern = regex.compile(r'[\W\d_]+', regex.UNICODE)
    df['text'] = stripped.apply(lambda doc: pattern.sub(' ', str(doc)))
    return df

In [6]:
# Clean the training split: label columns + one normalised 'text' column.
df_train = preprocess_dataframe(
    input_df=x_train,
    data_columns=data_columns,
    Y_columns=Y_columns,
)

In [7]:
# Apply the identical cleaning to the held-out split.
df_valid = preprocess_dataframe(
    input_df=x_test,
    data_columns=data_columns,
    Y_columns=Y_columns,
)

In [8]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# NOTE(review): the stop-word list is loaded but never passed to the vectorizer
# below; it is kept unchanged so the features (and the reported scores) stay the same.
language_stop_words = stopwords.words('english')


def _as_unicode(text_series):
    """Coerce every document to np.str_ before it reaches the vectorizer."""
    #https://stackoverflow.com/a/55742601/4634344
    return text_series.apply(lambda x: np.str_(x))


vectorizer = TfidfVectorizer(min_df=2) #ngram_range=(1,2)

# fit_transform learns the vocabulary / IDF weights and vectorises the training
# text in one pass; it is equivalent to fit() followed by transform() but does
# not tokenise the training corpus twice.
X_train = vectorizer.fit_transform(_as_unicode(df_train['text']))
# The validation text is only transformed, using the vocabulary learned above.
X_valid = vectorizer.transform(_as_unicode(df_valid['text']))

# The downstream estimators need numeric class labels. OrdinalEncoder maps each
# label column to ordinal codes (float64 by default). df[Y_columns].values is
# already 2-D with one column per label, so no hard-coded reshape is needed.
oe = OrdinalEncoder()
Y_train = oe.fit_transform(df_train[Y_columns].values)
# The encoder is fitted on the training labels only.
Y_valid = oe.transform(df_valid[Y_columns].values)

print('X training shape', X_train.shape, X_train.dtype)
print('Y training shape', Y_train.shape, Y_train.dtype)
print('X validation shape', X_valid.shape, X_valid.dtype)
print('Y validation shape', Y_valid.shape, Y_valid.dtype)

X training shape (21518, 24840) float64
Y training shape (21518, 3) float64
X validation shape (2391, 24840) float64
Y validation shape (2391, 3) float64


In [9]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import SGDClassifier

# Linear SGD model with balanced class weights; one copy is fitted per label
# column, each link also receiving the earlier label columns as extra features.
base_estimator = SGDClassifier(class_weight='balanced', random_state=0, n_jobs=-1)
clf = ClassifierChain(base_estimator)

In [10]:
from sklearn.metrics import jaccard_score, f1_score, make_scorer

def concat_categories(Y):
  """Join the first two columns of each row of Y into one 'a-b' string label.

  Parameters
  ----------
  Y : array-like of shape (n_rows, >= 2)
      Label matrix; only columns 0 and 1 are used.

  Returns
  -------
  numpy.ndarray of str, shape (n_rows,)
      One combined label per row, e.g. ``'1.0-2.0'``.
  """
  Y = np.asarray(Y)
  # Build the strings in plain Python first. np.apply_along_axis allocates its
  # output using the dtype of the FIRST row's result, so any later, longer label
  # was silently truncated (and could collide with a different class).
  combined = [str(row[0]) + '-' + str(row[1]) for row in Y]
  # dtype=str sizes the array to the longest label and keeps an empty input a
  # string array as well.
  return np.array(combined, dtype=str)

def _micro_on_column(metric, y, y_pred, column):
  """Evaluate `metric` with average='micro' on a single label column."""
  return metric(y[:, column], y_pred[:, column], average='micro')

# Micro-averaged Jaccard score, one function per label column (0, 1, 2).
def js_0(y,y_pred, **kwargs):
  """Jaccard score (micro) on label column 0; extra keyword arguments are ignored."""
  return _micro_on_column(jaccard_score, y, y_pred, 0)
def js_1(y,y_pred, **kwargs):
  """Jaccard score (micro) on label column 1; extra keyword arguments are ignored."""
  return _micro_on_column(jaccard_score, y, y_pred, 1)
def js_2(y,y_pred, **kwargs):
  """Jaccard score (micro) on label column 2; extra keyword arguments are ignored."""
  return _micro_on_column(jaccard_score, y, y_pred, 2)

# Micro-averaged F1 score, one function per label column (0, 1, 2).
def f1_0(y,y_pred, **kwargs):
  """F1 score (micro) on label column 0; extra keyword arguments are ignored."""
  return _micro_on_column(f1_score, y, y_pred, 0)
def f1_1(y,y_pred, **kwargs):
  """F1 score (micro) on label column 1; extra keyword arguments are ignored."""
  return _micro_on_column(f1_score, y, y_pred, 1)
def f1_2(y,y_pred, **kwargs):
  """F1 score (micro) on label column 2; extra keyword arguments are ignored."""
  return _micro_on_column(f1_score, y, y_pred, 2)

# Joint scores over label columns 0 and 1: each row's two labels are first
# merged into a single 'col0-col1' string by concat_categories.
def js_01(y,y_pred, **kwargs):
  """Jaccard score (micro) on the combined column-0/column-1 label; kwargs are ignored."""
  true_pairs = concat_categories(y)
  predicted_pairs = concat_categories(y_pred)
  return jaccard_score(true_pairs, predicted_pairs, average='micro')
def f1_01(y,y_pred, **kwargs):
  """F1 score (micro) on the combined column-0/column-1 label; kwargs are ignored."""
  true_pairs = concat_categories(y)
  predicted_pairs = concat_categories(y_pred)
  return f1_score(true_pairs, predicted_pairs, average='micro')

# Scorer objects wrapping the per-column metrics defined above.
# Only greater_is_better is passed: the former needs_proba=False /
# needs_threshold=False arguments merely restated the defaults and are no longer
# accepted by recent scikit-learn releases.
js_0_scorer = make_scorer(js_0, greater_is_better=True)
js_1_scorer = make_scorer(js_1, greater_is_better=True)
js_2_scorer = make_scorer(js_2, greater_is_better=True)
#js_01_scorer = make_scorer(js_01, greater_is_better=True)
f1_0_scorer = make_scorer(f1_0, greater_is_better=True)
f1_1_scorer = make_scorer(f1_1, greater_is_better=True)
# Fixed copy-paste slip: this scorer previously wrapped f1_1, so it scored label
# column 1 instead of column 2.
f1_2_scorer = make_scorer(f1_2, greater_is_better=True)

# Joint scorer over the combined column-0/column-1 label.
f1_01_scorer = make_scorer(f1_01, greater_is_better=True)


In [11]:
def error_metric(y,y_pred, **kwargs):
    """Blend the per-column weighted F1 scores (as percentages) into one number.

    Column 2 is weighted 0.5, columns 0 and 1 are weighted 0.3 each. Extra keyword
    arguments are accepted and ignored.

    NOTE(review): the weights add up to 1.1, so a perfect prediction scores 110
    rather than 100 - confirm this matches the intended metric definition.
    """
    def _weighted_f1_pct(column):
        # Weighted-average F1 for one label column, scaled to 0-100 and floored at 0.
        return max(0, 100*f1_score(y[:,column], y_pred[:,column], average='weighted'))

    col2_pct = _weighted_f1_pct(2)
    col0_pct = _weighted_f1_pct(0)
    col1_pct = _weighted_f1_pct(1)
    return 0.5*col2_pct + 0.3*col0_pct + 0.3*col1_pct

In [12]:
# Train the chain on the TF-IDF features and the ordinal-encoded label matrix.
clf.fit(X=X_train, Y=Y_train)

ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False,
                                             class_weight='balanced',
                                             early_stopping=False, epsilon=0.1,
                                             eta0=0.0, fit_intercept=True,
                                             l1_ratio=0.15,
                                             learning_rate='optimal',
                                             loss='hinge', max_iter=1000,
                                             n_iter_no_change=5, n_jobs=-1,
                                             penalty='l2', power_t=0.5,
                                             random_state=0, shuffle=True,
                                             tol=0.001, validation_fraction=0.1,
                                             verbose=0, warm_start=False),
                cv=None, order=None, random_state=None)

In [13]:
# Predict all label columns for the held-out rows.
Y_pred = clf.predict(X=X_valid)

In [14]:
# Blended weighted-F1 score on the validation split.
error_metric(y=Y_valid, y_pred=Y_pred)

91.07660169040503

In [15]:
# Micro-F1 on the combined column-0/column-1 label.
joint_f1 = f1_01(Y_valid, Y_pred).round(3)
print('For both Level 1 and Level 2  concatenated:\n\tF1 micro (=accuracy): {}'.format(joint_f1))

For both Level 1 and Level 2  concatenated:
	F1 micro (=accuracy): 0.65


In [16]:
# Micro-F1 on label column 0 alone.
level_1_f1 = f1_0(Y_valid, Y_pred).round(3)
print('Just the Level 1:\n\tF1 micro (=accuracy): {}'.format(level_1_f1))

Just the Level 1:
	F1 micro (=accuracy): 0.666


In [17]:
# Micro-F1 on label column 1 alone.
level_2_f1 = f1_1(Y_valid, Y_pred).round(3)
print('Just the Level 2:\n\tF1 micro (=accuracy): {}'.format(level_2_f1))

Just the Level 2:
	F1 micro (=accuracy): 0.969


In [18]:
# Micro-F1 on label column 2 alone.
level_3_f1 = f1_2(Y_valid, Y_pred).round(3)
print('Just the Level 3:\n\tF1 micro (=accuracy): {}'.format(level_3_f1))

Just the Level 3:
	F1 micro (=accuracy): 0.846
