In [1]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import string
import re
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from lxml import html
import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score,jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import gensim
from gensim.models import Word2Vec

In [2]:
np.random.seed(42)

In [3]:
data_cleaned = pd.read_csv('data_cleaned.csv',
                           converters={"preprocessedTags": lambda x: x.strip("[]").replace("'","").split(", ")})
data_cleaned = data_cleaned.dropna(subset=['desc'])
data_cleaned.head()

Unnamed: 0,Id,desc,preprocessedTags,Tag1,Tag2,Tag3,Tag4,Tag5
0,48320518,connect two differ databas one applic asp net ...,"[mysql, .net, sql-server]",mysql,.net,sql-server,,
1,48320543,bootstrap 4 navbar disappear resiz screen boot...,"[html, angular, bootstrap-4]",html,angular,bootstrap-4,,
2,48320558,xml transform xslt namespac xml transform xslt...,[xml],xml,,,,
3,48320572,convert timestamp date various format swift co...,"[ios, json, date, datetime]",ios,json,date,datetime,
4,44247,best practic requir time develop best practic ...,[project-management],project-management,,,,


In [4]:
sample = data_cleaned.sample(frac =.30)
sample.shape[0]

9852

# Train/Test split

In [5]:
X = sample['desc']
#Y = sample[['Tag1', 'Tag2', 'Tag3']].astype(str).values.tolist()
Y = sample['preprocessedTags']
mb = MultiLabelBinarizer()
Y_encoded = mb.fit_transform(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y_encoded, test_size=0.2, random_state=42)

In [6]:
print("Number of data points in training data :", X_train.shape[0])
print("Number of data points in test data :", X_test.shape[0])

Number of data points in training data : 7881
Number of data points in test data : 1971


# TF-IDF

In [7]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=5000)
X_train_multilabel = vectorizer.fit_transform(X_train)
X_test_multilabel = vectorizer.transform(X_test)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=5000)
tf = tf_vectorizer.fit_transform(X_train)
tf_test = tf_vectorizer.fit_transform(X_test)

In [8]:
X_train_multilabel.shape

(7881, 3968)

# Word2Vec

https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/

In [9]:
w2v = Word2Vec(list(X_train.str.split(' ')), size=250, window=5, min_count=20, workers=10, iter=20)

In [10]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in w2v.wv.vocab]
    return np.mean(w2v[doc], axis=0)

In [11]:
w2v_train = pd.DataFrame(X_train)
w2v_test = pd.DataFrame(X_test)

In [12]:
w2v_train['doc_vector'] = X_train.apply(document_vector)
w2v_test['doc_vector'] = X_test.apply(document_vector)

  return np.mean(w2v[doc], axis=0)


In [13]:
w2v_train = list(w2v_train['doc_vector'])
w2v_test = list(w2v_test['doc_vector'])

# Modeling

## With TF-IDF

https://www.quora.com/Whats-the-difference-between-gradient-descent-and-stochastic-gradient-descent

In [14]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
clf = OneVsRestClassifier(SGDClassifier())

parameters = {
    "estimator__loss": ['log','perceptron'],
    "estimator__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    "estimator__penalty":['l2', 'l1', 'elasticnet'],
}

model_tunning = GridSearchCV(clf, param_grid=parameters,
                             scoring='jaccard_micro', n_jobs=-1)

model_tunning.fit(X_train_multilabel, y_train)

print("Best score:", model_tunning.best_score_)
print("Param:", model_tunning.best_params_)

Best score: 0.3405155535407983
Param: {'estimator__alpha': 0.0001, 'estimator__loss': 'perceptron', 'estimator__penalty': 'elasticnet'}


In [16]:
clf = OneVsRestClassifier(SGDClassifier(loss='perceptron', alpha=0.0001, penalty='l1', n_jobs=11))
clf.fit(X_train_multilabel,
        y_train)
y_pred = clf.predict(X_test_multilabel)

In [17]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred, average = 'weighted'))

Accuracy : 0.1476407914764079
Weighted f1 score : 0.4530709141786762
Micro f1 score : 0.49860491071428575
Hamming loss : 0.005331695550392979
Jaccard weighted score : 0.3364332524505588


In [18]:
clf2 = OneVsRestClassifier(LogisticRegression(penalty='l2', n_jobs=-1))
clf2.fit(X_train_multilabel, y_train)
y_pred2 = clf2.predict(X_test_multilabel)

In [19]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred2))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred2, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred2, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred2))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred2, average = 'weighted'))

Accuracy : 0.09944190766108574
Weighted f1 score : 0.2903801207794255
Micro f1 score : 0.34595456357114385
Hamming loss : 0.004868843849858029
Jaccard weighted score : 0.2052330834339875


In [20]:
svc = OneVsRestClassifier(SVC(C=2,
                              kernel='rbf',
                              degree=1))
svc.fit(X_train_multilabel, y_train)
y_pred_svc = svc.predict(X_test_multilabel)

In [21]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred_svc))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred_svc, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred_svc, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred_svc))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred_svc, average = 'weighted'))

Accuracy : 0.17097919837645864
Weighted f1 score : 0.43476645096646427
Micro f1 score : 0.49585025604803107
Hamming loss : 0.004235389759702826
Jaccard weighted score : 0.32641824844154615


## With Word2Vec

In [22]:
clf = OneVsRestClassifier(SGDClassifier())

parameters = {
    "estimator__loss": ['log','perceptron'],
    "estimator__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    "estimator__penalty":['l2', 'l1', 'elasticnet'],
}

model_tunning = GridSearchCV(clf, param_grid=parameters,
                             scoring='jaccard_micro', n_jobs=11)

model_tunning.fit(np.array(w2v_train), y_train)

print("Best score:", model_tunning.best_score_)
print("Param:", model_tunning.best_params_)

Best score: 0.057735097744180355
Param: {'estimator__alpha': 0.0001, 'estimator__loss': 'perceptron', 'estimator__penalty': 'elasticnet'}


In [23]:
clf = OneVsRestClassifier(SGDClassifier(loss='perceptron', alpha=0.0001, penalty='l1', n_jobs=11))
clf.fit(w2v_train,
        y_train)
y_pred = clf.predict(np.array(w2v_test))

In [24]:
print("Accuracy :",metrics.accuracy_score(y_test,y_pred))
print("Weighted f1 score :",metrics.f1_score(y_test, y_pred, average = 'weighted'))
print("Micro f1 score :",metrics.f1_score(y_test, y_pred, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(y_test,y_pred))
print("Jaccard weighted score :",metrics.jaccard_score(y_test,y_pred, average = 'weighted'))

Accuracy : 0.015728056823947234
Weighted f1 score : 0.04785225199096601
Micro f1 score : 0.08958837772397095
Hamming loss : 0.007809138947487101
Jaccard weighted score : 0.02749999585953673


In [25]:
z

NameError: name 'z' is not defined

# Saving Model

In [None]:
pickle.dump(vectorizer, open('Flask/tfidf', 'wb'))
pickle.dump(clf, open('Flask/model', 'wb'))
pickle.dump(mb, open('Flask/mb', 'wb'))

In [None]:
classes_convertor = mb.classes_
classes_convertor = pd.DataFrame(classes_convertor)
classes_convertor.to_csv('Flask/classes_convertor.csv')

In [None]:
# -------------------------------------------------------------

In [None]:
clf = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l2', n_jobs=11))
clf.fit(X_train_multilabel, 
        y_train)
y_pred = clf.predict(X_test_multilabel)

In [None]:
mb.inverse_transform(y_pred)

# Unsupervized approach

In [26]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [27]:
n_components = 50
n_top_words = 20

In [28]:
nmf = NMF(n_components=n_components, random_state=42,
          alpha=.1, l1_ratio=.5)

nmf.fit(X_train_multilabel)

NMF(alpha=0.1, l1_ratio=0.5, n_components=50, random_state=42)

In [29]:
tfidf_feature_names = vectorizer.get_feature_names()

In [30]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=42)
lda.fit(tf)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=50, random_state=42)

In [31]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

As we actually have the different tags of each post, we'll try to analyze which topic goes with which tag