In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv('data/DataSet_postocr_train.csv')
df_train.columns

Index(['datasheet_link', 'target_col', 'raw_text', 'page_count'], dtype='object')

In [3]:
df_test = pd.read_csv('data/DataSet_postocr_test.csv')
df_test.columns

Index(['datasheet_link', 'target_col', 'raw_text', 'page_count'], dtype='object')

In [4]:
df = pd.concat([df_train,df_test])
df.shape

(2970, 4)

In [5]:
print(df.shape)
df.head()

(2970, 4)


Unnamed: 0,datasheet_link,target_col,raw_text,page_count
0,https://lfillumination.com/files/specsheets/EF...,lighting,"EF400 System # EF408B\nILLUMINATION""\nDIE CAST...",1.0
1,https://lfillumination.com/files/specsheets/EF...,lighting,,
2,https://lfillumination.com/files/specsheets/EF...,lighting,"EF400 System # EF407B\nILLUMINATION""\nDIE CAST...",1.0
3,https://www.waclighting.com/storage/waclightin...,lighting,ADJUSTABLE BEAM WALL WASH 12V\nWAC\n5221\nLAND...,4.0
4,https://www.acuitybrands.com/api/products/geta...,lighting,HEALTHCARE\nLIGHTING®\nHDMC\nSPECIFICATIONS\nT...,11.0


In [6]:
# checking null values
df.isna().sum()

datasheet_link       0
target_col           0
raw_text          1616
page_count        1467
dtype: int64

In [7]:
# class distribution
df.target_col.value_counts()

target_col
cable       1025
others       695
fuses        650
lighting     600
Name: count, dtype: int64

In [8]:
# drop rows where raw_text is null
df_cleaned = df.dropna(subset=['raw_text'])
df_cleaned.shape

(1354, 4)

The number of rows dropped from 2970 to 1354 (55% dataset values removed). This is due to following reasons:
1. Invalid URLs - Multiple urls where files were not downloaded
2. PDF to Image conversion error - Very few cases where pdf was not convertable to images (this can be fixed using other packages/open cv)

In [9]:
# class distribution after removing null values
df_cleaned.target_col.value_counts()

target_col
fuses       441
cable       375
lighting    332
others      206
Name: count, dtype: int64

In [10]:
df_cleaned.columns

Index(['datasheet_link', 'target_col', 'raw_text', 'page_count'], dtype='object')

In [11]:
df_cleaned = df_cleaned[['raw_text','target_col']]

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

In [13]:
le = LabelEncoder()
df_cleaned['target_col_encoded'] = le.fit_transform(df_cleaned['target_col'])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

In [14]:
X = df_cleaned['raw_text']
y = df_cleaned['target_col_encoded']

In [15]:
sss = StratifiedShuffleSplit(test_size=0.3, random_state=42)
for train,test in sss.split(X,y):     
    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
print(y_train.value_counts().sum())  
print(y_test.value_counts().sum())

947
407


# SGD

In [16]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('sgd', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])

sgd.fit(X_train, y_train)

In [17]:
y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.995085995085995
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.99      1.00      1.00       132
           2       0.99      1.00      1.00       100
           3       1.00      0.97      0.98        62

    accuracy                           1.00       407
   macro avg       1.00      0.99      0.99       407
weighted avg       1.00      1.00      1.00       407



# Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

rfc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('rfc', RandomForestClassifier()),
               ])

rfc.fit(X_train, y_train)

In [19]:
y_pred = rfc.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.9975429975429976
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.99      1.00      1.00       132
           2       1.00      1.00      1.00       100
           3       1.00      0.98      0.99        62

    accuracy                           1.00       407
   macro avg       1.00      1.00      1.00       407
weighted avg       1.00      1.00      1.00       407



# Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

lr = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression(n_jobs=1, C=1e5)),
               ])

lr.fit(X_train, y_train)

In [21]:
y_pred = lr.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.995085995085995
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.99      1.00      1.00       132
           2       0.99      1.00      1.00       100
           3       1.00      0.97      0.98        62

    accuracy                           1.00       407
   macro avg       1.00      0.99      0.99       407
weighted avg       1.00      1.00      1.00       407



# SVC

In [22]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

svc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('lr', LinearSVC()),
               ])

svc.fit(X_train, y_train)



In [23]:
y_pred = svc.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.9926289926289926
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.99      1.00      1.00       132
           2       0.98      1.00      0.99       100
           3       1.00      0.95      0.98        62

    accuracy                           0.99       407
   macro avg       0.99      0.99      0.99       407
weighted avg       0.99      0.99      0.99       407



In [24]:
import pickle

In [25]:
with open('lr_classifier_v1.pkl', "wb") as f:
    pickle.dump(lr, f)

All models have accuracy above 99%. It is highly likely that the model has overfit due to limited data and unbalanced class distribution.
Basis my experiecne, I have observed Logistic regression to be quiet accuracte in scenarios when we have limited data and all model accuracy lies in same range.

Hence final model = Logistic regression

# Inference

In [26]:
with open('lr_classifier_v1.pkl', 'rb') as doc_cat_file:
    model = pickle.load(doc_cat_file)

In [27]:
text = '''
LumenArt CYP Series LIGHTING SOLUTIONS CYP8x24 CYP8X36 CYP8X48 SPECIFICATIONS Construction Mounting Electrical Fabric laminated to self extin- Mounts to octagon junction ETL listed dry location only. guishing vinyl. Transparent fire box or 4x4 junction box with Specify 120 or 277v. Lamps retardant applied after fabrica- round plaster adapter. by others. tion. 40 standard fabric choices. Suspension Modifications Lumenart's standard material 72"cord.Longer lengths available Contact local representatives selection is subject to change 48stem in 24",12",8"and4' or factory for size, lamping. based on supplier availability. It segments. Longer lengths available. lens or finish modifications. is common fo material manufac Bottom Shielding turers to be out of stock for long Matte white acrylic. periods of time, change or Finishes discontinue materials without Metal hardware is available notice based on market trends in satin nickel, polished and/or material availability. nickel, bronze and black LumenArt 3333 W.47th St.,Chicago,IL 60632 Phone 773-254-0744 www.lumenart.com 
LumenArt LIGHTING SOLUTIONS ORDERING LOGIC FamilyModel Lamping Color Volt Finish Suspension Fabric Fabric Fabric Fabric Dimming CYP 8dia x12"high LED8w 2700k 120 SN Satin Nickel BC Black Cord Linen Sik Raw Silk TriLam** Cylinder 8"dia x 24"highLED 15w 3000k PN Pol.Nickel MB Metal Braid Cord White Platinum Pendant 8"dia x 36"highLED 20w 3500k ST Stem Cream Lt.Bronze Triac 8dia x48"high LED30w 4000k DLDIO LED New Copper Med.Bronz Antique Tan Orange Gold Brown 80+CRI Black(opaque) Ginge Kona Green Red Moonlight Canary Grape Coffee Rrey reen Apple Honey Stone Orange Chocolate Sunflower Lapis Black Red Amber Spice Med.Grey Parsley CYP Linen Silk RS *Washable 48in 36in 24 in 8in 8in- 8in 8in- LumenArt 3333 W.47th St.Chicago,IL 60632 Phone 773-254-0744www.lumenart.com
'''
model.predict([text])[0]

2

In [28]:
x = {
    0: 'cable',
    1: 'fuses',
    2: 'lighting',
    3: 'others'
}

In [29]:
predicted_probabilities = lr.predict_proba([text])[0]
predicted_category_index = predicted_probabilities.argmax()
predicted_category = x[predicted_category_index]
confidence_score = predicted_probabilities[predicted_category_index]
predicted_category,str(round(confidence_score, 2))

('lighting', '1.0')