# Loading and Preprocessing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the question/tag dataset straight from GitHub; the first CSV column
# is used as the DataFrame index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv',
    index_col=0,
)

In [3]:
# Preview the first rows to confirm the Text and Tags columns loaded as expected.
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [4]:
# Convert the Tags column from its string form (e.g. "['sql', 'asp.net']")
# into real Python lists.
import ast

# Only strings are parsed: values that are already lists are passed through,
# so re-running this cell on the same kernel does not raise.
df['Tags'] = df['Tags'].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)

In [5]:
# Spot-check the first row: Tags should now be a Python list, not a string.
df.iloc[0]['Tags']

['sql', 'asp.net']

In [6]:
# Multi-hot encode the tag lists: each distinct tag becomes one binary column
# of the label matrix `y`.
from sklearn.preprocessing import MultiLabelBinarizer

multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Tags'])

In [7]:
# Tag names learned by the binarizer; used below as column headers for `y`.
classes=multilabel.classes_

In [8]:
# Display the distinct tag labels found in the dataset (20 tags here).
classes

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [9]:
# View the multi-hot label matrix with the tag names as column headers.
pd.DataFrame(y,columns=classes)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [10]:
# Configure the TF-IDF vectorizer that turns question text into numeric features.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    analyzer='word',        # features are built from word tokens
    ngram_range=(1, 3),     # unigrams, bigrams and trigrams
    stop_words='english',   # drop the built-in English stop words
    max_features=1000,      # cap the vocabulary at 1000 terms
)

In [11]:
# Learn the vocabulary / IDF weights and vectorize every question in one step.
# NOTE(review): the vectorizer is fit on the full corpus before the train/test
# split, so its statistics include the test documents — consider fitting on
# the training portion only.
X=tfidf.fit_transform(df['Text'])

In [12]:
# Inspect the resulting TF-IDF feature matrix.
X

<48976x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1239765 stored elements in Compressed Sparse Row format>

In [13]:
# Sanity check: features and labels must have the same number of rows.
X.shape, y.shape

((48976, 1000), (48976, 20))

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [15]:
# Confirm the sizes of the train and test partitions.
X_train.shape, X_test.shape

((39180, 1000), (9796, 1000))

# Evaluation Metric

In [16]:
def j_score(y_true, y_pred, zero_division=1.0):
    """Mean per-sample Jaccard similarity of two label matrices, in percent.

    For every row the score is ``|true AND pred| / |true OR pred|``, computed
    as the sum of element-wise minima over the sum of element-wise maxima.
    The row scores are averaged and multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Binary indicator matrices of the true and predicted labels.
    zero_division : float, default 1.0
        Score assigned to a row whose union is empty (no true and no
        predicted label). Without this guard such a row evaluates to 0/0 and
        turns the whole mean into NaN.

    Returns
    -------
    float
        Mean Jaccard score scaled to the range 0-100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is
    # non-empty, so empty rows never trigger a 0/0.
    jaccard = np.full(union.shape, float(zero_division))
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard.mean() * 100

# Model Building

1. Logistic Regression

In [17]:
# Base binary classifier; it is wrapped in a one-vs-rest scheme before fitting.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs')

In [18]:
from sklearn.multiclass import OneVsRestClassifier

# Train one independent logistic-regression model per tag, then predict the
# tag indicator matrix for the held-out questions.
clf = OneVsRestClassifier(lr).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [19]:
# Mean Jaccard score (percent) of the logistic-regression predictions on the test set.
j_score(y_test,y_pred)

49.12668436096366

2. SVM

In [20]:
from sklearn.svm import LinearSVC

# L1-penalised linear SVM (solved in the primal, dual=False), wrapped so that
# one binary model is trained per tag.
svm = LinearSVC(
    penalty='l1',
    dual=False,
    C=1.5,
)
clf = OneVsRestClassifier(svm)

In [21]:
# Fit the one-vs-rest SVM on the training split and predict the test labels.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [22]:
# Mean Jaccard score (percent) of the linear-SVM predictions on the test set.
j_score(y_test,y_pred)

53.3787600381108

3. Decision Trees

In [23]:
from sklearn import tree

# The tree is fit directly on the multi-label indicator matrix, without a
# one-vs-rest wrapper. random_state is pinned (matching the seed used for the
# train/test split) so the fitted tree, its score and the pickled model are
# reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=0)

In [24]:
# Fit the decision tree on the training split and predict the test labels.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [25]:
# Mean Jaccard score (percent) of the decision-tree predictions on the test set.
j_score(y_test,y_pred)

55.02671158295903

# Model Testing

In [26]:
# A single hand-written question used to try out the trained model.
x=['can we write sql codes in pandas, python']

In [27]:
# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit).
xt=tfidf.transform(x)

In [28]:
# Predicted multi-hot label vector for the sample question
# (`clf` is the most recently fitted model, i.e. the decision tree).
clf.predict(xt)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]])

In [29]:
# Map the predicted indicator vector back to human-readable tag names.
multilabel.inverse_transform(clf.predict(xt))

[('python', 'sql')]

# Saving The Model

In [35]:
import pickle

# Persist the two artifacts needed to score new text: the trained classifier
# and the fitted TF-IDF vectorizer. `with` blocks ensure each file is flushed
# and closed.
with open('Decision_Tree_multi_label.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# The vectorizer file must contain `tfidf`, not a second copy of the classifier.
with open('tfidf_multilabel.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# NOTE(review): `multilabel` is also required to turn predictions back into
# tag names and is not saved here.