Libraries & Packages Setup

In [125]:
import pandas as pd
import numpy as np

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

Read Dataset

In [127]:
# Load the Stack Overflow questions dataset; the first CSV column becomes the index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv',
    index_col=0,
)

In [128]:
# Preview the first rows to confirm the Text and Tags columns loaded as expected.
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [129]:
# Inspect one raw label: the Tags column holds the *string* form of a list, not a list.
df['Tags'].iloc[0]

"['sql', 'asp.net']"

Literal evaluation (string-to-list conversion)

In [130]:
import ast

In [131]:
# Try the conversion on a single entry: literal_eval parses the string into a real list.
ast.literal_eval(df['Tags'].iloc[0])

['sql', 'asp.net']

In [132]:
# Convert every Tags entry from its string form (e.g. "['sql', 'asp.net']") to a list.
# Entries that are not strings are left untouched, so re-running this cell after the
# conversion is a no-op instead of raising ValueError from ast.literal_eval.
df['Tags'] = df['Tags'].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)

In [133]:
# Re-check the preview: Tags entries now display as lists rather than quoted strings.
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


Multi-label Binarization (multi-hot encoding of the tags)

In [134]:
# Raw label column: one list of tags per question.
# NOTE(review): `y` is a Series here but is rebound to the binarized array by the
# later `multiLB.fit_transform` cell — a distinct name would avoid the reuse.
y = df['Tags']
y

2          [sql, asp.net]
4              [c#, .net]
5                   [c++]
6                  [.net]
8                    [c#]
                ...      
1262668             [c++]
1262834             [c++]
1262915          [python]
1263065          [python]
1263454             [c++]
Name: Tags, Length: 48976, dtype: object

Top 20 classes of the dataset 

In [135]:
# Binarizer used to turn each question's tag list into a row of 0/1 indicators.
multiLB = MultiLabelBinarizer()

In [137]:
# Learn the set of tags and encode every question as a row of 0/1 tag indicators.
y = multiLB.fit_transform(df['Tags'])

In [138]:
# Display the encoded target matrix (one row per question, one column per tag).
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [139]:
# Tag names in the same order as the columns of `y`.
multiLB.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [141]:
# Label the indicator matrix with the tag names so each column is readable.
pd.DataFrame(data=y, columns=multiLB.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [150]:
# Turn each question's text into TF-IDF features over word unigrams and bigrams,
# capped at 5000 features and with English stop words removed.
# NOTE(review): the vectorizer is fit on the full corpus before the train/test
# split, so its statistics also see the held-out documents — consider fitting on
# the training split only.
tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)
x = tfidf.fit_transform(df['Text'])

In [151]:
# Mapping from each retained term (unigram or bigram) to its feature index.
# NOTE(review): this prints the entire mapping (up to 5000 entries); showing a small
# slice or just its length would keep the notebook output readable.
tfidf.vocabulary_

{'aspnet': 448,
 'site': 3956,
 'maps': 2665,
 'got': 1902,
 'experience': 1591,
 'creating': 1111,
 'default': 1215,
 'xml': 4966,
 'file': 1680,
 'working': 4925,
 'properly': 3373,
 'menu': 2724,
 'controls': 1049,
 'need': 2852,
 'way': 4808,
 'users': 4652,
 'create': 1102,
 'modify': 2782,
 'pages': 3080,
 'page': 3077,
 'permissions': 3150,
 'standard': 4056,
 'membership': 2715,
 'xml file': 4967,
 'need way': 2864,
 'adding': 231,
 'scripting': 3805,
 'functionality': 1825,
 'net': 2872,
 'applications': 391,
 'little': 2549,
 'game': 1834,
 'written': 4953,
 'uses': 4653,
 'database': 1166,
 'backend': 523,
 'card': 696,
 'wanted': 4798,
 'implement': 2117,
 'function': 1814,
 'cards': 697,
 'mean': 2702,
 'essentially': 1533,
 'interface': 2253,
 'class': 794,
 'implements': 2122,
 'public': 3399,
 'contains': 1029,
 'called': 682,
 'make': 2641,
 'thing': 4334,
 'like': 2489,
 'source': 4005,
 'code': 854,
 'compile': 926,
 'use': 4628,
 'just': 2373,
 'add': 227,
 'tell': 

In [152]:
# Display the feature matrix summary (stored as a sparse matrix).
x

<48976x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1794988 stored elements in Compressed Sparse Row format>

In [153]:
# Sanity check: features and targets must have the same number of rows.
x.shape, y.shape

((48976, 5000), (48976, 20))

In [154]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Model

In [155]:
# Candidate base estimators; each one is wrapped in OneVsRestClassifier when fitted.
# SGDClassifier shuffles the training data between epochs, so it gets a fixed seed
# (the same value used for the train/test split) to make its score repeatable.
sgd = SGDClassifier(random_state=0)
lr = LogisticRegression(solver='lbfgs')
# The L1 penalty is only supported by the primal solver, hence dual=False.
svc = LinearSVC(C=1.5, penalty='l1', dual=False)

In [156]:
def jaccord_score(y_true, y_pred, zero_division=1.0):
  """Return the mean per-sample Jaccard similarity, as a percentage.

  For every row the score is |true AND pred| / |true OR pred| over the 0/1
  label indicators; the row scores are then averaged and multiplied by 100.

  Args:
    y_true: 2-D array-like of 0/1 ground-truth indicators (rows are samples).
    y_pred: 2-D array-like of 0/1 predicted indicators, same shape as y_true.
    zero_division: Score given to a row whose true and predicted label sets are
      both empty (union of size 0). Defaults to 1.0, i.e. "predicted nothing,
      nothing was expected" counts as a perfect match. Pass 0.0 to score such
      rows as a miss instead.

  Returns:
    The average row score scaled to the range 0-100.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  intersection = np.minimum(y_true, y_pred).sum(axis=1)
  union = np.maximum(y_true, y_pred).sum(axis=1)
  # Pre-fill with the fallback and divide only where the union is non-empty;
  # a plain division would yield NaN for empty rows and poison the mean.
  scores = np.full(union.shape, float(zero_division))
  np.divide(intersection, union, out=scores, where=union != 0)
  return scores.mean()*100

def print_score(y_pred, classifier, y_true=None):
  """Print a classifier's name and the Jaccard score of its predictions.

  Args:
    y_pred: Predicted 0/1 label matrix.
    classifier: Estimator that produced y_pred; only its class name is printed.
    y_true: Ground-truth label matrix to score against. When omitted, the
      notebook-level `y_test` is used, which is what the two-argument call
      has always done.
  """
  if y_true is None:
    y_true = y_test  # fall back to the global held-out labels
  print("Classifier: ", classifier.__class__.__name__)
  print('Jacard score: {}'.format(jaccord_score(y_true, y_pred)))
  print('----')

In [157]:
# Fit a one-vs-rest wrapper around each base estimator and report its test-set score.
for base_estimator in (sgd, lr, svc):
  clf = OneVsRestClassifier(base_estimator).fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  print_score(y_pred, base_estimator)

Classifier:  SGDClassifier
Jacard score: 52.486899414727105
----
Classifier:  LogisticRegression
Jacard score: 51.8368041377433
----
Classifier:  LinearSVC
Jacard score: 62.199877501020815
----
