In [1]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import string
import re
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from lxml import html
import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score,jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
np.random.seed(42)

In [3]:
data_cleaned = pd.read_csv('data_cleaned.csv')
data_cleaned = data_cleaned.dropna(subset=['desc'])
data_cleaned.head()

Unnamed: 0,Id,desc,preprocessedTags,Tag1,Tag2,Tag3,Tag4,Tag5
0,48320518,connect two differ databas one applic asp net ...,"['mysql', '.net', 'sql-server']",mysql,.net,sql-server,,
1,48320543,bootstrap 4 navbar disappear resiz screen boot...,"['html', 'angular', 'bootstrap-4']",html,angular,bootstrap-4,,
2,48320558,xml transform xslt namespac xml transform xslt...,['xml'],xml,,,,
3,48320572,convert timestamp date various format swift co...,"['ios', 'json', 'date', 'datetime']",ios,json,date,datetime,
4,44247,best practic requir time develop best practic ...,['project-management'],project-management,,,,


In [4]:
sample = data_cleaned.sample(frac =.10)
sample.shape[0]

3284

# Train/Test split

In [5]:
X = sample['desc']
Y = sample[['Tag1', 'Tag2', 'Tag3']].replace(np.nan, '', regex=True).astype(str).values.tolist()
mb = MultiLabelBinarizer()
Y_encoded = mb.fit_transform(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y_encoded, test_size=0.2, random_state=42)

In [6]:
print("Number of data points in training data :", X_train.shape[0])
print("Number of data points in test data :", X_test.shape[0])

Number of data points in training data : 2627
Number of data points in test data : 657


In [7]:
Y

[['android', 'shell', ''],
 ['c++', 'visual-c++', 'static'],
 ['java', 'recursion', ''],
 ['javascript', 'async-await', ''],
 ['html', 'css', ''],
 ['javascript', 'floating-point', ''],
 ['asp.net', 'css', ''],
 ['c++', 'c++11', ''],
 ['coding-style', 'boolean', ''],
 ['bash', 'printf', ''],
 ['string', 'rust', ''],
 ['git', 'github', ''],
 ['php', 'html', 'mysql'],
 ['python', 'numpy', 'math'],
 ['javascript', 'jquery', ''],
 ['java', 'spring', 'spring-security'],
 ['javascript', '', ''],
 ['r', 'dplyr', 'aggregate'],
 ['r', 'string', ''],
 ['c++', 'opengl', 'matrix'],
 ['php', '', ''],
 ['', '', ''],
 ['android', '', ''],
 ['r', '', ''],
 ['python', 'api', 'docker'],
 ['java', 'arrays', 'object'],
 ['bash', 'ubuntu', ''],
 ['python', '', ''],
 ['mongodb', '', ''],
 ['ios', 'swift', 'firebase'],
 ['r', 'matrix', 'dataframe'],
 ['android', 'android-layout', ''],
 ['java', 'datetime', 'jsp'],
 ['javascript', '', ''],
 ['c++', 'pointers', 'templates'],
 ['java', 'android', ''],
 ['c#', '

# TF-IDF

In [8]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=5000)
X_train_multilabel = vectorizer.fit_transform(X_train)
X_test_multilabel = vectorizer.transform(X_test)

In [9]:
X_train_multilabel.shape

(2627, 3796)

# Modeling

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
clf = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l2', n_jobs=-1))
clf.fit(X_train_multilabel, 
        y_train)
y_pred = clf.predict(X_test_multilabel)

In [12]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred, average = 'weighted'))

Accuracy : 0.136986301369863
Weighted f1 score : 0.5055419863793399
Micro f1 score : 0.5711222301644031
Hamming loss : 0.005435964340073929
Jaccard weighted score : 0.3885729378497085


In [13]:
clf2 = OneVsRestClassifier(LogisticRegression(penalty='l2', n_jobs=-1))
clf2.fit(X_train_multilabel, y_train)
y_pred2 = clf2.predict(X_test_multilabel)

In [14]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred2))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred2, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred2, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred2))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred2, average = 'weighted'))

Accuracy : 0.0837138508371385
Weighted f1 score : 0.37247622154564947
Micro f1 score : 0.4908310573546625
Hamming loss : 0.005911611219830398
Jaccard weighted score : 0.28848948505213


In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [16]:
"""svc = OneVsRestClassifier(SVC())

parameters = {
    "estimator__C": [1,2],
    "estimator__kernel": ["rbf"],
    "estimator__degree":[1, 2],
}

model_tunning = GridSearchCV(svc, param_grid=parameters,
                             scoring='jaccard_micro', n_jobs=-1)

model_tunning.fit(X_train_multilabel, y_train)

print("Best score:", model_tunning.best_score_)
print("Param:", model_tunning.best_params_)"""

'svc = OneVsRestClassifier(SVC())\n\nparameters = {\n    "estimator__C": [1,2],\n    "estimator__kernel": ["rbf"],\n    "estimator__degree":[1, 2],\n}\n\nmodel_tunning = GridSearchCV(svc, param_grid=parameters,\n                             scoring=\'jaccard_micro\', n_jobs=-1)\n\nmodel_tunning.fit(X_train_multilabel, y_train)\n\nprint("Best score:", model_tunning.best_score_)\nprint("Param:", model_tunning.best_params_)'

In [17]:
svc = OneVsRestClassifier(SVC(C=2,
                              kernel='rbf',
                              degree=1))
svc.fit(X_train_multilabel, y_train)
y_pred_svc = svc.predict(X_test_multilabel)

In [18]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred_svc))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred_svc, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred_svc, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred_svc))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred_svc, average = 'weighted'))

Accuracy : 0.1308980213089802
Weighted f1 score : 0.4641425936894655
Micro f1 score : 0.5567928730512249
Hamming loss : 0.005408784518373559
Jaccard weighted score : 0.36169478042555525


In [19]:
rf = OneVsRestClassifier(RandomForestClassifier(
                         n_estimators=200,
                         max_depth=10,
                         n_jobs=-1))
rf.fit(X_train_multilabel, y_train)
y_pred_rf = rf.predict(X_test_multilabel)

In [20]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred_rf))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred_rf, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred_rf, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred_rf))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred_rf, average = 'weighted'))

Accuracy : 0.0517503805175038
Weighted f1 score : 0.2866405164014469
Micro f1 score : 0.43208383716243454
Hamming loss : 0.006382728129303471
Jaccard weighted score : 0.22828252479140299


# Saving Model

In [21]:
pickle.dump(vectorizer, open('Flask/tfidf', 'wb'))
pickle.dump(clf, open('Flask/model', 'wb'))
pickle.dump(mb, open('Flask/mb', 'wb'))

In [22]:
classes_convertor = mb.classes_
classes_convertor = pd.DataFrame(classes_convertor)
classes_convertor.to_csv('Flask/classes_convertor.csv')

In [23]:
# -------------------------------------------------------------

In [24]:
clf = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l2', n_jobs=-1))
clf.fit(X_train_multilabel, 
        y_train)
y_pred = clf.predict(X_test_multilabel)

In [25]:
mb.inverse_transform(y_pred)

[('', 'linux'),
 ('', 'sql', 'sql-server'),
 ('html', 'javascript', 'jquery'),
 ('', 'javascript'),
 ('php',),
 ('',),
 ('', 'java'),
 ('', 'ios', 'swift'),
 ('', 'php'),
 ('', 'javascript'),
 (),
 ('apache-spark', 'pyspark', 'python'),
 ('java',),
 ('', 'python'),
 ('arrays', 'java'),
 ('',),
 ('', 'css', 'html'),
 ('', 'date'),
 ('',),
 ('', 'python'),
 ('',),
 ('', 'r'),
 ('',),
 ('android', 'java'),
 ('', 'php'),
 ('html',),
 ('',),
 (),
 ('', 'javascript'),
 ('', 'python'),
 ('',),
 ('',),
 ('java', 'java-stream'),
 ('', 'javascript'),
 ('',),
 ('',),
 ('', 'java'),
 ('', 'c'),
 ('pandas',),
 ('', 'c#'),
 ('',),
 ('javascript',),
 ('', 'javascript'),
 (),
 (),
 (),
 (),
 ('', 'r'),
 ('',),
 ('', 'java'),
 ('', 'vim'),
 ('', 'python'),
 ('',),
 ('',),
 ('css', 'html', 'javascript'),
 ('php',),
 ('',),
 (),
 ('dictionary', 'python'),
 ('', 'android', 'java'),
 ('', 'css'),
 ('', 'java'),
 ('', 'python'),
 ('', 'java'),
 ('', 'python'),
 (),
 ('', 'rust'),
 ('', 'firebase'),
 ('', 'p