# Multi-label text classification: Tag prediction

### Importing required libraries

In [114]:
import pandas as pd
import numpy as np
import ast

# TF-IDF stands for Term Frequency - Inverse Document Frequency.
# TF-IDF transforms text into a meaningful numerical representation that can be used to fit an ML algorithm for prediction.
# CountVectorizer, by contrast, only gives the raw frequency of each word.
# TF-IDF is a numerical statistic that reflects the importance of a word in a document. TfidfVectorizer is a tool that converts a collection of text documents into a matrix of TF-IDF features.
# It computes the TF-IDF statistic for each word in each document and returns a sparse matrix of the TF-IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier 
# Stochastic Gradient Descent Classifier :
'''
SGD is an optimization algorithm commonly used for training linear classifiers. SGDClassifier is a tool that implements SGD for classification tasks.
It iteratively updates the model's parameters to minimize the loss function, which measures the difference between the predicted and actual labels.
This process continues until a stopping criterion is met (the maximum number of iterations is reached or the loss stops improving). SGDClassifier is well suited to large-scale machine learning tasks.
'''
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

'''
OneVsRestClassifier is a strategy for multi-class and multi-label classification problems where each class is treated as a separate binary classification problem.
In this approach, a separate binary classifier is trained for each class. For multi-class problems, the model returns the class whose binary classifier is the most confident; for multi-label problems (as in this notebook), it returns every class whose binary classifier predicts positive.
OneVsRestClassifier is the scikit-learn tool that implements this strategy.
'''
from sklearn.multiclass import OneVsRestClassifier

### Importing the dataset from csv to dataframe using pandas

In [115]:
data = pd.read_csv('./stackoverflow.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Text,Tags
0,2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
1,4,adding scripting functionality to net applicat...,"['c#', '.net']"
2,5,should i use nested classes in this case i am ...,['c++']
3,6,homegrown consumption of web services i have b...,['.net']
4,8,automatically update version number i would li...,['c#']


In [116]:
data['Tags'].iloc[0]
# As can be seen, the list is a string, so we use ast library for literal conversion

"['sql', 'asp.net']"

In [117]:
# Parse each stringified list (e.g. "['sql', 'asp.net']") into a real Python list.
# ast.literal_eval is passed directly; no lambda wrapper is needed.
parsed_tags = data['Tags'].apply(ast.literal_eval)
data['Tags'] = parsed_tags
# The first entry is now a list rather than a string.
print(data['Tags'].iloc[0])

['sql', 'asp.net']


In [118]:
y = data['Tags']
y

0        [sql, asp.net]
1            [c#, .net]
2                 [c++]
3                [.net]
4                  [c#]
              ...      
48971             [c++]
48972             [c++]
48973          [python]
48974          [python]
48975             [c++]
Name: Tags, Length: 48976, dtype: object

In [119]:
mlb = MultiLabelBinarizer()

In [120]:
# Encode the tag lists as a binary indicator (multi-hot) matrix:
# one column per distinct tag, 1 where the row carries that tag.
# A row may contain several 1s because a question can have several tags.
y = mlb.fit_transform(data['Tags'])
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [121]:
mlb.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [122]:
pd.DataFrame(y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [123]:
pd.DataFrame(y,columns = mlb.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [124]:
# TFIDF Vectorizer :
'''
Setting analyzer to 'word' makes the vectorizer build features word by word.
Setting it to 'char' makes it build features character by character.
max_features caps the vocabulary size (10000 terms in the vectorizer below).
'''

# max_features = 10000: keep at most 10000 terms in the vocabulary
# ngram_range = (1,2): use both unigrams and bigrams
# stop_words = 'english': drop common English stop words
tfidf = TfidfVectorizer(analyzer = 'word' , max_features = 10000, ngram_range = (1,2), stop_words = 'english')
# Learn the vocabulary from the question texts and convert them to a sparse TF-IDF matrix.
X = tfidf.fit_transform(data['Text'])

In [125]:
X

<48976x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1996262 stored elements in Compressed Sparse Row format>

In [126]:
# Mapping from each term to its column index in X; limited to 10000 entries by max_features.
tfidf.vocabulary_

{'aspnet': 863,
 'site': 7836,
 'maps': 5292,
 'got': 3751,
 'experience': 3135,
 'creating': 2135,
 'default': 2373,
 'xml': 9932,
 'file': 3302,
 'working': 9845,
 'properly': 6717,
 'menu': 5390,
 'controls': 2027,
 'need': 5688,
 'way': 9645,
 'users': 9282,
 'create': 2114,
 'modify': 5528,
 'pages': 6213,
 'tie': 8734,
 'page': 6199,
 'viewing': 9508,
 'permissions': 6338,
 'standard': 8029,
 'membership': 5372,
 'xml file': 9934,
 'need way': 5716,
 'adding': 449,
 'scripting': 7527,
 'functionality': 3611,
 'net': 5728,
 'applications': 760,
 'little': 5079,
 'game': 3632,
 'written': 9899,
 'uses': 9285,
 'database': 2275,
 'backend': 991,
 'card': 1341,
 'wanted': 9630,
 'implement': 4172,
 'function': 3580,
 'cards': 1342,
 'mean': 5347,
 'essentially': 3009,
 'interface': 4445,
 'class': 1511,
 'implements': 4178,
 'public': 6757,
 'contains': 1989,
 'called': 1310,
 'make': 5244,
 'thing': 8604,
 'like': 4953,
 'source': 7931,
 'code': 1662,
 'compile': 1822,
 'use': 9198,

In [127]:
# X holds the TF-IDF weights of the (at most) 10000 vocabulary terms for each text.
# y holds the target labels as a binary indicator (multi-hot) matrix, one column per tag.
X.shape , y.shape

((48976, 10000), (48976, 20))

In [128]:
# Hold out 20% of the rows for testing; random_state = 0 makes the shuffle reproducible.
# NOTE: no `stratify` argument is passed, so label proportions are not explicitly
# preserved between the train and test sets.
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

### Building our model

In [129]:
sgd = SGDClassifier()
lr = LogisticRegression()
svc = LinearSVC()

In [130]:
# Function for Jaccard score
def j_score(y_true, y_pred, clf, zero_division=0.0):
    """Print and return the sample-averaged Jaccard score as a percentage.

    For every row the score is ``|true AND pred| / |true OR pred|``, computed
    on binary indicator rows with an elementwise minimum (intersection) and
    maximum (union).

    Args:
        y_true: 2-D binary indicator array of true labels (rows are samples).
        y_pred: 2-D binary indicator array of predicted labels, same shape.
        clf: Estimator whose class name is printed alongside the score.
        zero_division: Score given to a row whose union is empty, i.e. the
            row has neither true nor predicted labels. Without this guard
            such a row evaluates 0/0 and turns the whole mean into NaN.

    Returns:
        The mean per-sample Jaccard score multiplied by 100.
    """
    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is
    # non-empty, so rows with no labels at all never produce NaN.
    jaccard = np.full(union.shape, zero_division, dtype=float)
    np.divide(intersection, union, out=jaccard, where=union != 0)

    score = jaccard.mean() * 100
    print("Clf : ", clf.__class__.__name__)
    print("Jaccard Output : ", jaccard)
    print("Jaccard Score : " , score)
    print("----")
    return score

In [131]:
# Train and evaluate each base classifier wrapped in a one-vs-rest scheme.
# NOTE: `clf` is rebound on every iteration, so after the loop it refers to the
# last wrapper fitted (the one around `svc`); any later `clf.predict` call uses that model.
for classifier in [sgd, lr, svc]:
    # The target has 20 label columns: one binary classifier is fitted per label,
    # treating that label as the positive class and the other 19 as the "rest".
    clf = OneVsRestClassifier(estimator = classifier)
    clf.fit(X_train, Y_train)
    # Report the sample-averaged Jaccard score on the held-out test set.
    j_score(Y_test, clf.predict(X_test),classifier)

Clf :  SGDClassifier
Jaccard Output :  [1.  0.  1.  ... 0.5 1.  0.5]
Jaccard Score :  52.27252620117054
----
Clf :  LogisticRegression
Jaccard Output :  [1.  0.  1.  ... 0.5 1.  0.5]
Jaccard Score :  50.86072546617667
----
Clf :  LinearSVC
Jaccard Output :  [1.  0.  1.  ... 0.5 1.  0.5]
Jaccard Score :  62.510038110793516
----


### Model validation with real data

In [132]:
x = ["How to write ML code in python and Java, I have no idea on how to do it"]

In [133]:
xt = tfidf.transform(x)

In [134]:
xt

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [135]:
pred = clf.predict(xt)
pred

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [136]:
mlb.inverse_transform(pred)

[('java', 'python')]