In [24]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import string
import re
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from lxml import html
import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score,jaccard_score

In [2]:
np.random.seed(42)

In [3]:
data_cleaned = pd.read_csv('data_cleaned.csv')
data_cleaned = data_cleaned.dropna(subset=['desc'])
data_cleaned.head()

Unnamed: 0,Id,desc,Tags_
0,48320518,connect two differ databas one applic asp net ...,mysql .net sql-server
1,48320543,bootstrap navbar disappear resiz screen bootst...,html angular bootstrap-4
2,48320558,xml transform xslt namespac xml transform xslt...,xml xslt
3,48320572,convert timestamp date various format swift co...,ios json date datetime swift3
4,44247,best practic requir time develop best practic ...,project-management time-management


In [4]:
sample = data_cleaned.sample(frac =.10)
sample.shape[0]

3284

# Train/Test split

In [5]:
X = sample[['desc']]
Y = sample[['Tags_']]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [6]:
print("Number of data points in training data :", X_train.shape[0])
print("Number of data points in test data :", X_test.shape[0])

Number of data points in training data : 2627
Number of data points in test data : 657


# TF-IDF

In [7]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=5000)
X_train_multilabel = vectorizer.fit_transform(X_train['desc'])
X_test_multilabel = vectorizer.transform(X_test['desc'])

In [8]:
print("Training data shape X : ",X_train_multilabel.shape, "Y :",y_train.shape)
print("Test data shape X : ",X_test_multilabel.shape,"Y:",y_test.shape)

Training data shape X :  (2627, 3792) Y : (2627, 1)
Test data shape X :  (657, 3792) Y: (657, 1)


# Modeling

In [9]:
clf = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l2', n_jobs=-1))
clf.fit(X_train_multilabel, y_train)
y_pred = clf.predict(X_test_multilabel)

In [29]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred))
print("Macro f1 score :",metrics.f1_score(y_test, y_pred, average = 'macro'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred, average = 'weighted'))

Accuracy : 0.0898021308980213
Macro f1 score : 0.018161660418686183
Micro f1 score : 0.0898021308980213
Hamming loss : 0.9101978691019786
Hamming loss : 0.04701195219123506


In [11]:
clf2 = OneVsRestClassifier(LogisticRegression(penalty='l2', n_jobs=-1))
clf2.fit(X_train_multilabel, y_train)
y_pred2 = clf2.predict(X_test_multilabel)

In [32]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred2))
print("Macro f1 score :",metrics.f1_score(y_test, y_pred2, average = 'macro'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred2, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred2))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred2, average = 'weighted'))

Accuracy : 0.0882800608828006
Macro f1 score : 0.00451581306073291
Micro f1 score : 0.0882800608828006
Hamming loss : 0.9117199391171994
Jaccard micro score : 0.04617834394904458


In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [21]:
svc = OneVsRestClassifier(SVC())

parameters = {
    "estimator__C": [1,2],
    "estimator__kernel": ["rbf"],
    "estimator__degree":[1, 2],
}

model_tunning = GridSearchCV(svc, param_grid=parameters,
                             scoring='jaccard_micro', n_jobs=-1)

model_tunning.fit(X_train_multilabel, y_train)

print("Best score:", model_tunning.best_score_)
print("Param:", model_tunning.best_params_)



Best score: 0.01732369761464761
Param: {'estimator__C': 2, 'estimator__degree': 1, 'estimator__kernel': 'rbf'}


In [22]:
svc = OneVsRestClassifier(SVC(C=2,
                              kernel='rbf',
                              degree=1))
svc.fit(X_train_multilabel, y_train)
y_pred_svc = svc.predict(X_test_multilabel)

In [34]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred_svc))
print("Macro f1 score :",metrics.f1_score(y_test, y_pred_svc, average = 'macro'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred_svc, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred_svc))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred_svc, average = 'weighted'))

Accuracy : 0.0837138508371385
Macro f1 score : 0.015315249748180814
Micro f1 score : 0.0837138508371385
Hamming loss : 0.9162861491628614
Jaccard weighted score : 0.04114551934858139


In [None]:
rf = OneVsRestClassifier(RandomForestClassifier(
                         n_estimators=200,
                         max_depth=10,
                         n_jobs=-1))
rf.fit(X_train_multilabel, y_train)
y_pred_rf = rf.predict(X_test_multilabel)

In [31]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred_rf))
print("Macro f1 score :",metrics.f1_score(y_test, y_pred_rf, average = 'macro'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred_rf, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred_rf))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred_rf, average = 'weighted'))

NameError: name 'y_pred_rf' is not defined

# Saving Model

In [35]:
pickle.dump(vectorizer, open('tfidf', 'wb'))
pickle.dump(clf2, open('model', 'wb'))