### Importing libraries

In [1]:
import re
import nltk
import string
import pickle
import numpy as np
import pandas as pd
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download("punkt_tab")
nltk.download('stopwords')
from nltk.corpus import stopwords
from xgboost import XGBClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textacy.preprocessing import replace
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score,recall_score, accuracy_score, f1_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\harma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Reading the data

In [2]:
train = pd.read_excel("train.xlsx")
test = pd.read_excel("test_data.xlsx")

### Preprocessing the data

In [3]:
train.head()

Unnamed: 0,datasheet_link,target_col,text
0,https://lfillumination.com/files/specsheets/EF...,lighting,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,https://lfillumination.com/files/specsheets/EF...,lighting,Error:HTTP Error 404: Not Found
2,https://lfillumination.com/files/specsheets/EF...,lighting,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,https://www.acuitybrands.com/api/products/geta...,lighting,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
4,https://www.acuitybrands.com/api/products/geta...,lighting,Error:HTTP Error 404: Not Found


In [4]:
def error_preprocessing(text):
    '''Remove the Errors'''
    if str(text).startswith("Error"):
        return None
    return text
    
train['text'] = train['text'].apply(error_preprocessing)
test['text'] = test['text'].apply(error_preprocessing)

In [5]:
train.head()

Unnamed: 0,datasheet_link,target_col,text
0,https://lfillumination.com/files/specsheets/EF...,lighting,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,https://lfillumination.com/files/specsheets/EF...,lighting,
2,https://lfillumination.com/files/specsheets/EF...,lighting,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,https://www.acuitybrands.com/api/products/geta...,lighting,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
4,https://www.acuitybrands.com/api/products/geta...,lighting,


### Missing values

In [6]:
train.isnull().sum()

datasheet_link      0
target_col          0
text              528
dtype: int64

In [7]:
train.dropna(inplace=True)
train.reset_index(drop=True, inplace=True)
test.dropna(inplace=True)
test.reset_index(drop=True, inplace=True)

In [8]:
train.drop(['datasheet_link'],axis=1,inplace=True)
test.drop(['datasheet_link'],axis=1,inplace=True)

In [9]:
train.isnull().sum()

target_col    0
text          0
dtype: int64

In [10]:
train.shape

(1367, 2)

### Distribution of the target column

In [11]:
train.target_col.value_counts()

target_col
fuses       465
lighting    446
cable       341
others      115
Name: count, dtype: int64

### Text Preprocessing

In [12]:
def preprocessing(text):
    '''Function for performing the preprocessing of the text'''
     # Convert text to lowercase to ensure uniformity
    text = text.lower()
    # remobing urls
    text = replace.urls(text,'')
    # Remove tabs, digits, newlines, and specific punctuation marks like quotes and dashes
    text = re.sub(r"[\t\d\n'“”„-]+", "", text)
    # Remove all non-alphabetic characters except for spaces
    text = re.sub(r"[^a-z\s]",'',text)
    # Remove any remaining punctuation using str.translate and string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Strip leading and trailing whitespace from the text
    text = text.strip()
    # Initialize the set of stopwords in English
    stop_words = set(stopwords.words('english'))
     # Initialize the WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Tokenize the text into words
    words = word_tokenize(text)
    # Filter out stopwords and apply lemmatization
    text = ' '.join([lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words])
    return text

train['text'] = train['text'].apply(preprocessing)
test['text'] = test['text'].apply(preprocessing)

In [13]:
train.head()

Unnamed: 0,target_col,text
0,lighting,ef system efbdie cast cylindrical light unit s...
1,lighting,ef system efbdie cast cylindrical light unit h...
2,lighting,typeprojecthdmcsurface mount faruvc filtered n...
3,lighting,extruded aluminumhousingsatin acrylicdiffuser ...
4,lighting,die cast aluminumhousing adjustablepower suppl...


### Checking for Missing values

In [14]:
train.isnull().sum()

target_col    0
text          0
dtype: int64

In [15]:
train.head()

Unnamed: 0,target_col,text
0,lighting,ef system efbdie cast cylindrical light unit s...
1,lighting,ef system efbdie cast cylindrical light unit h...
2,lighting,typeprojecthdmcsurface mount faruvc filtered n...
3,lighting,extruded aluminumhousingsatin acrylicdiffuser ...
4,lighting,die cast aluminumhousing adjustablepower suppl...


### Splitting the data into train and test dataset

In [17]:
X_train = train['text']
X_test = test['text']
y_train = train['target_col']
y_test = test['target_col']
del train, test

### Converting the text data into numeric format

In [19]:
vectorizer = CountVectorizer(ngram_range=(1,2), max_features=1000)
X_train= vectorizer.fit_transform(X_train.ravel()).toarray()
X_test = vectorizer.transform(X_test.ravel()).toarray()

### Converting target variable into numeric format

In [21]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

### Hyperparameter tunning

In [28]:
xgb = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=7,
 min_child_weight=8,
 gamma=0.0,
 subsample=0.9,
 colsample_bytree=0.6,
 objective= 'binary:logistic',
 nthread=4,
 device = 'cuda',
 seed=27)

param_test = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}

clf_xgb = GridSearchCV(xgb, param_test, verbose = 1, cv=3, scoring="accuracy")
clf_xgb.fit(X_train,y_train)
print("Tuned Hyperparameters :", clf_xgb.best_params_)
print("Accuracy :",clf_xgb.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Tuned Hyperparameters : {'colsample_bytree': 0.8, 'subsample': 0.85}
Accuracy : 0.9824256153203522


### Results

In [29]:
y_pred = clf_xgb.predict(X_test)
confusion_matrix(y_test, y_test)

array([[ 97,   0,   0,   0],
       [  0,  49,   0,   0],
       [  0,   0,  83,   0],
       [  0,   0,   0, 105]], dtype=int64)

In [33]:
print(precision_score(y_test,y_pred,average='micro'))
print(recall_score(y_test,y_pred,average='micro'))
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average='micro'))


0.8023952095808383
0.8023952095808383
0.8023952095808383
0.8023952095808383


### Saving the executables

In [36]:
with open("model.h5",'wb') as model:
    pickle.dump(clf_xgb,model,protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
with open("vectorizer.pkl",'wb') as data_vectorize:
    pickle.dump(vectorizer,data_vectorize,protocol=pickle.HIGHEST_PROTOCOL)
with open("labelencoder.pkl",'wb') as label:
    pickle.dump(le,label,protocol=pickle.HIGHEST_PROTOCOL)