Multi-Label-Classifier is an assignment I completed for the Text Mining subject of my Master's degree.
The task was to develop a model that predicts the labels of articles from the Reuters dataset and to evaluate the performance of that model.

In [10]:
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
stop_words = stopwords.words("english") # NLTK English stop-word list; passed to the TfidfVectorizer below

# Make sure numpy is importable and bound to `np` (np.array is used in the
# prediction cell further down).
try:
    import numpy as np
except ModuleNotFoundError:
    # Install with the interpreter that runs this kernel so the package lands
    # in the right environment, then retry the import: without the retry `np`
    # would still be undefined after a successful install.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy"])
    import numpy as np


In [11]:
# Collect the ids of the train and test document files (ids starting with
# "train" / "test"), then use those ids to load the raw text of each document.

documents = reuters.fileids()
train_document_idNum = [doc_id for doc_id in documents if doc_id.startswith("train")]
test_document_idNum = [doc_id for doc_id in documents if doc_id.startswith("test")]
train_documents = list(map(reuters.raw, train_document_idNum))
test_documents = list(map(reuters.raw, test_document_idNum))

In [12]:
# define the "Tokenize" function.

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
 
cachedStopWords = stopwords.words("english")

# Helpers built once when the cell runs, instead of once per token / per call.
# NOTE: _STOP_WORD_SET is a snapshot; re-run this cell if cachedStopWords changes.
_STOP_WORD_SET = frozenset(cachedStopWords)    # constant-time membership tests
_STEMMER = PorterStemmer()                     # one shared stemmer for every token
_STARTS_WITH_LETTER = re.compile('[a-zA-Z]+')  # used with .match(), i.e. anchored at the start only
_MIN_TOKEN_LENGTH = 3


def tokenize(text):
    """Turn raw text into a list of stemmed tokens.

    Steps, in order:
      1. split ``text`` with NLTK's ``word_tokenize`` and lower-case each word;
      2. drop words found in ``cachedStopWords``;
      3. stem the remaining words with the Porter stemmer;
      4. keep only stems that begin with an ASCII letter and are at least
         ``_MIN_TOKEN_LENGTH`` (3) characters long.

    The letter check uses ``re.match``, so it only constrains the beginning
    of the stem; later characters may be digits or punctuation.

    Args:
        text: the document text to tokenize.

    Returns:
        list of str: the filtered stems, in document order.
    """
    words = (word.lower() for word in word_tokenize(text))
    stems = (_STEMMER.stem(word) for word in words if word not in _STOP_WORD_SET)
    return [
        stem
        for stem in stems
        if _STARTS_WITH_LETTER.match(stem) and len(stem) >= _MIN_TOKEN_LENGTH
    ]

In [13]:
# Tokenize the documents and build the TF-IDF matrices for the train and test data.
# The vectorizer is fitted on the TRAINING documents only (fit_transform); the
# test documents are only transformed with that already-fitted vectorizer.
# NOTE(review): stop words are handled twice - inside `tokenize` and again via
# `stop_words=` here. The truncated warning printed after this cell looks like
# scikit-learn's stop-word consistency warning - TODO confirm.

vector = TfidfVectorizer(stop_words=stop_words,tokenizer=tokenize)
vectored_train_docs = vector.fit_transform(train_documents)
vectored_test_docs = vector.transform(test_documents) 

  'stop_words.' % sorted(inconsistent))


In [14]:
# Build the binary label matrices for the train and test data.
# The binarizer is fitted on the training labels; the test labels are encoded
# with that same fitted binarizer.

train_label_sets = [reuters.categories(doc_id) for doc_id in train_document_idNum]
test_label_sets = [reuters.categories(doc_id) for doc_id in test_document_idNum]

mlb = MultiLabelBinarizer()
train_docs_labels = mlb.fit_transform(train_label_sets)
test_docs_labels = mlb.transform(test_label_sets)

In [15]:
# Fit a one-vs-rest classifier built on a linear SVM (fixed random_state) using
# the training features, then predict the labels of the test documents.

base_estimator = LinearSVC(random_state=42)
classifier = OneVsRestClassifier(base_estimator).fit(vectored_train_docs, train_docs_labels)
prediction_value = classifier.predict(vectored_test_docs)

In [16]:
# Report precision, recall and F1 on the test set, first micro-averaged and
# then macro-averaged.

from sklearn.metrics import f1_score, precision_score, recall_score

# (averaging mode, heading, report template). The two templates differ by one
# trailing space after the precision value and are kept exactly as printed before.
for average, heading, template in (
    ("micro", "Micro-average Evaluation:",
     "\nPrecision: {:.3f}\nRecall: {:.3f} \nF1-measure: {:.3f}\n"),
    ("macro", "Macro-average Evaluation:",
     "\nPrecision: {:.3f} \nRecall: {:.3f} \nF1-measure: {:.3f}\n"),
):
    precision = precision_score(test_docs_labels, prediction_value, average=average)
    recall = recall_score(test_docs_labels, prediction_value, average=average)
    f1 = f1_score(test_docs_labels, prediction_value, average=average)
    print(heading)
    print(template.format(precision, recall, f1))

Micro-average Evaluation:

Precision: 0.945
Recall: 0.801 
F1-measure: 0.867

Macro-average Evaluation:

Precision: 0.649 
Recall: 0.395 
F1-measure: 0.466



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Report of Performance -
The evaluation shows a high micro-averaged F1 score of 86.7%, but the macro-averaged F1 score is only 46.6%, far lower than the micro average. Because the macro average weights every class equally, this gap means the classifier is good at classifying the data overall but performs poorly on certain classes.

In [17]:
# Pipeline accepting text as input and predicting the class.
# The text goes through the objects fitted in the earlier cells: `vector` turns
# it into TF-IDF features, `classifier` predicts the label indicators, and `mlb`
# maps those indicators back to category names.

ip_text = """ Trading nickel was one of the important step of industrialization. Trade increased with 
                the gloabalization. """

# The single text is wrapped in a one-element list before being transformed.
vectored_ip_text = vector.transform([ip_text])
predict_vectored_ip_text = classifier.predict(vectored_ip_text)
# inverse_transform turns the predicted indicator row(s) back into label tuples.
label_ip_text = mlb.inverse_transform(np.array(predict_vectored_ip_text))
print(label_ip_text)

[('nickel', 'trade')]
