In [1]:
import time
from enum import Enum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans, DBSCAN
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA, LatentDirichletAllocation, NMF
from yellowbrick.cluster import KElbowVisualizer
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

In [2]:
dataset = pd.read_csv('./cleaned_dataset.csv', converters={'Tags': lambda x: eval(x)})
dataset

Unnamed: 0,Body,Tags
0,uiimageview later uiimageview uiimageview cell...,[iphone]
1,requirement againstabout people traditional al...,"[c#, winforms]"
2,gem rake parse content dump character cause crash,"[ruby, xml]"
3,includes nested primary link sub straight exac...,[php]
4,searching django stuff django struggling searc...,"[python, sql, django]"
...,...,...
75358,rid vector ptr vector removed deleted obvious ...,[c++]
75359,singleton factory domain factory,[java]
75360,resharper removing directive perhaps configura...,"[c#, visual-studio-2008]"
75361,started document developing apps iphone wanted...,[iphone]


# Vectorizing data

Plusieurs méthodes nous permettent de transformer ce bag-of-words en données numérique. Nous allons créer une fonction de preprocessing permettant de la choisir, afin de les tester de manières indépendantes.

In [4]:
class Algorithm(Enum):
    KMeans = 'kmeans'
    MiniBatchKmeans = 'minikmeans'
    DBSCAN = 'dbscan'


class Vectorizer(Enum):
    Count = 'count'
    TfIdf = 'tf-idf'


def preprocessing(dataset: pd.DataFrame, vectorizer: Vectorizer):
    """ Add features engineering to the dataset """
    max_features = 3000

    # CountVectorizer
    if vectorizer == Vectorizer.Count:
        count_vectorizer = CountVectorizer(lowercase=False, max_features=max_features)
        count_matrix = count_vectorizer.fit_transform(dataset['Body'])
        df = pd.DataFrame(count_matrix.toarray(), index=dataset.index,
                          columns=count_vectorizer.get_feature_names_out())
        df.reset_index(inplace=True, drop=True)

    # Tf-Idf
    elif vectorizer == Vectorizer.TfIdf:
        tf_vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features, ngram_range=(1,2))
        tf_matrix = tf_vectorizer.fit_transform(dataset['Body'])
        df = pd.DataFrame(tf_matrix.toarray(), index=dataset.index,
                          columns=tf_vectorizer.get_feature_names_out())
        df.reset_index(inplace=True, drop=True)

    n_components = 1000
    pca = PCA(n_components=n_components)
    df_projected = pca.fit_transform(df)
    return (df_projected, df)


In [5]:
vectors = {name: preprocessing(dataset, vectorizer) for name, vectorizer in Vectorizer.__members__.items()}

# Unsupervized clustering

## KMeans
Pour ce test, nous utiliserons la version `MiniBatchKMeans` qui donne des résultats similaires mais permet une exécution beaucoup plus rapide. Nous passerons sur KMeans pour la production.

In [None]:
def kmeans_clusters(X):
    kelbow_viz = KElbowVisualizer(MiniBatchKMeans(random_state=5), k=(16, 24))
    kelbow_viz.fit(X)
    kelbow_viz.show()
    print(f'kelbow: {kelbow_viz.elbow_value_}')

    kmeans = MiniBatchKMeans(kelbow_viz.elbow_value_, random_state=5)
    kmeans.fit(X)
    labels = pd.Series(kmeans.labels_, name='cluster-label')
    value_dict = dict(labels.value_counts())
    value_counts = {str(k): int(v) for k, v in value_dict.items()}
    display(pd.Series(value_counts, name='clusters-size'))
    return labels

In [None]:
kmeans_labels = {}
for name, vector in vectors.items():
    print(f'KMeans with {name}')
    start = time.time()
    kmeans_labels[name] = kmeans_clusters(vector[0])
    print(f'Execution time: {time.time() - start}s')


### Visualisation

In [None]:
import nltk

def get_top_tags(df, label, nb_tags=5):
    tagsFreq = df[df['cluster_label'] == label]['tags'].to_frame().apply(lambda x : pd.Series([x['tags'], nltk.FreqDist(x['tags'])], index=['tokens', 'frequency']), axis=1, result_type='expand')
    total_freq = {}
    for dictionnary in tagsFreq['frequency']:
        for k,v in dictionnary.items():
            if k in total_freq:
                total_freq[k] += v
            else:
                total_freq[k] = v

    freq_list = sorted(total_freq.items(), key=lambda x: x[1], reverse=True)
    return freq_list[:nb_tags]

In [None]:
from wordcloud import WordCloud
n_cols = 2
n_rows = max([serie.max() for serie in kmeans_labels.values()]) + 1
fig = plt.figure(constrained_layout=True, figsize=(n_cols * 10 , n_rows * 5))
subfigures = fig.subfigures(1, len(kmeans_labels))
for vect_idx, (name, label) in enumerate(kmeans_labels.items()):
    dataset_labels = vectors[name][1].assign(cluster_label=label,
                           tags=dataset['Tags'])
    cluster_sum = dataset_labels.groupby('cluster_label').sum()
    subfigures[vect_idx].suptitle(name, fontsize='xx-large')
    for label_idx, label in enumerate(cluster_sum.index):
        top10_topics = cluster_sum.loc[label]\
                .sort_values(ascending=False)\
                .head(10).index.to_list()
        ax = subfigures[vect_idx].add_subplot(n_rows, 1, label_idx + 1)
        ax.set_title(f'Topic {label}', fontsize='xx-large')
        cloud = WordCloud().generate(' '.join(top10_topics))
        ax.imshow(cloud, interpolation='bilinear')
        ax.grid(False)
        ax.tick_params(axis='both', left=False, bottom=False, labelbottom=False, labelleft=False)
        ax.set_xlabel(' '.join([f'{tag} ({count})' for (tag, count) in get_top_tags(dataset_labels, label)]), fontsize='x-large')

plt.show()


### Scoring

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

for (name, vector), (_, labels) in zip(vectors.items(), kmeans_labels.items()):
    print(f'{name} Silhouette score : {silhouette_score(vector[0], labels)}')
    print(f'{name} Calinski-Harabasz score : {calinski_harabasz_score(vector[0], labels)}')
    print(f'{name} Davies-Bouldin score : {davies_bouldin_score(vector[0], labels)}')
    

## DBSCAN

In [None]:
def make_dbscan_clusters(X):
    # Need to sample to avoid too much resources comsuption
    X_sample = X[0:30000]
    nb_neighbors = 10
    nearest_neighbors = NearestNeighbors(n_neighbors=nb_neighbors)
    nearest_neighbors.fit(X_sample)
    distances, _ = nearest_neighbors.kneighbors(X_sample)

    # Get max distance between neighbors
    max_distances = np.sort(distances[:, nb_neighbors - 1])

    # Find an elbow
    index = np.arange(len(max_distances))
    knee = KneeLocator(index, max_distances, curve='convex',
                       direction='increasing', interp_method='polynomial')
    knee.plot_knee(figsize=(10, 10))
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.show()

    dbscan = DBSCAN(min_samples=100, eps=knee.elbow_y)
    dbscan.fit(X_sample)
    print(f'Nombre de clusters : {len(dbscan.labels_)}')
    labels = pd.Series(dbscan.labels_, name='cluster-label')
    value_dict = dict(labels.value_counts())
    value_counts = {str(k): int(v) for k, v in value_dict.items()}
    display(pd.Series(value_counts, name='clusters-size'))
    return labels

In [None]:
dbscan_labels = {}
for name, vector in vectors.items():
    print(f'DBSCAN with {name}')
    dbscan_labels[name] = make_dbscan_clusters(vector[0])

DBSCAN ne semble pas à l'aise avec ces données et nécessite une puissance bien plus grande que KMeans. Nous allons donc le laisser de côté.

## Latent Dirichlet Allocation (LDA)

In [None]:
%%time
# Fix the number of topics near the KMeans elbow
no_topics = 17
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                         learning_offset=50., random_state=0).fit(vectors['Count'][1])

### Visualization

In [None]:
no_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    cloud = WordCloud().generate(" ".join([vectors['Count'][1].columns[i]
                    for i in topic.argsort()[:-no_top_words - 1:-1]]))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {topic_idx}')
    plt.show()


## Scoring

In [None]:
print(f'Log-likelihood: {lda.score(vectors["Count"][1])}')
print(f'Perplexity: {lda.perplexity(vectors["Count"][1])}')

# Supervized clustering

In [None]:
sample = 30000
X_train, X_test, y_train, y_test = train_test_split(vectors['Count'][0][:sample], dataset['Tags'].iloc[:sample], test_size=0.2, random_state=34)

mlb = MultiLabelBinarizer().fit(dataset['Tags'].iloc[:sample].to_list())
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)

In [None]:
%%time
ovr = OneVsRestClassifier(LogisticRegression(n_jobs=-1, max_iter=500), n_jobs=-1)
ovr.fit(X_train, y_train)

In [None]:
%%time
ovr_score = ovr.score(X_test, y_test)
ovr_score

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
ovr = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1), n_jobs=-1)
ovr.fit(X_train, y_train)

In [None]:
%%time
ovr_score = ovr.score(X_test, y_test)
ovr_score

In [None]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
ovr = OneVsRestClassifier(estimator=GradientBoostingClassifier(), n_jobs=-1)
ovr.fit(X_train, y_train)

In [None]:
%%time
ovr_score = ovr.score(X_test, y_test)
ovr_score

In [None]:
%%time
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
multi = MultiOutputClassifier(KNeighborsClassifier(n_jobs=-1), n_jobs=-1).fit(X_train, y_train)

In [None]:
%%time
multi.score(X_test, y_test)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(vectors['TfIdf'][1], dataset['Tags'], test_size=0.2, random_state=34)

mlb = MultiLabelBinarizer().fit(dataset['Tags'].to_list())
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)

In [7]:
from sklearn.naive_bayes import MultinomialNB
ovr = OneVsRestClassifier(MultinomialNB())
print(ovr)
ovr.fit(X_train, y_train)
score = ovr.score(X_test, y_test)
print(score)

OneVsRestClassifier(estimator=MultinomialNB())
0.08744111988323493


In [8]:
%%time
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(X_train, y_train)

CPU times: user 28min 21s, sys: 4.92 s, total: 28min 26s
Wall time: 1min 51s


RandomForestClassifier(n_jobs=-1)

In [9]:
rfc.score(X_test, y_test)

0.19392290851190871

In [10]:
y_pred = rfc.predict(X_test)

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [12]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='micro')

0.3660966742964858

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=1e-5, penalty='l1'), n_jobs=-1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
f1_score(y_test, y_pred, average='micro')

0.3962196745802345

In [13]:
mlb.classes_

array(['.net', 'ajax', 'algorithm', 'android', 'arrays', 'asp.net',
       'asp.net-mvc', 'c', 'c#', 'c++', 'css', 'database', 'django',
       'eclipse', 'excel', 'html', 'ios', 'iphone', 'java', 'javascript',
       'jquery', 'linq', 'linux', 'macos', 'multithreading', 'mysql',
       'objective-c', 'oracle', 'performance', 'php', 'python', 'regex',
       'ruby', 'ruby-on-rails', 'security', 'sql', 'sql-server', 'string',
       'svn', 'unit-testing', 'user-interface', 'vb.net', 'visual-studio',
       'visual-studio-2008', 'web-services', 'winapi', 'windows',
       'winforms', 'wpf', 'xml'], dtype=object)

In [15]:
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

                    precision    recall  f1-score   support

              .net       0.46      0.08      0.14      1338
              ajax       0.42      0.10      0.17       174
         algorithm       0.67      0.26      0.37       171
           android       0.85      0.54      0.66       472
            arrays       0.71      0.07      0.14       201
           asp.net       0.78      0.27      0.40       997
       asp.net-mvc       0.67      0.27      0.39       269
                 c       0.61      0.26      0.36       379
                c#       0.65      0.24      0.35      2282
               c++       0.76      0.33      0.46       921
               css       0.77      0.36      0.49       449
          database       0.36      0.05      0.09       340
            django       0.90      0.67      0.77       167
           eclipse       0.52      0.08      0.13       155
             excel       0.83      0.60      0.70       126
              html       0.66      0.13

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
model = Pipeline([
    # ADD lemmisation
    ('td-idf', TfidfVectorizer(lowercase=False, max_features=3000, ngram_range=(1,2))),
    ('pca', TruncatedSVD(n_components=500, random_state=12)), # supports sparse matrix
    ('svc', OneVsRestClassifier(SGDClassifier(loss='log', alpha=1e-5, penalty='l1', random_state=9), n_jobs=-1))
])
X_train, X_test, y_train, y_test = train_test_split(dataset['Body'], dataset['Tags'], test_size=0.2, random_state=5)
mlb = MultiLabelBinarizer().fit(dataset['Tags'].to_list())
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [16]:
print(f1_score(y_test, y_pred, average='micro'))
print(classification_report(y_test, y_pred))

0.31795919805503686
              precision    recall  f1-score   support

           0       0.43      0.04      0.07      1286
           1       0.50      0.04      0.07       198
           2       0.68      0.28      0.40       175
           3       0.85      0.39      0.54       473
           4       0.67      0.09      0.16       201
           5       0.80      0.19      0.31       949
           6       0.65      0.10      0.17       245
           7       0.53      0.21      0.30       409
           8       0.68      0.15      0.25      2189
           9       0.78      0.26      0.39       966
          10       0.77      0.27      0.40       448
          11       0.45      0.03      0.05       333
          12       0.89      0.60      0.72       167
          13       0.25      0.02      0.04       158
          14       0.78      0.55      0.65       148
          15       0.58      0.09      0.16       636
          16       0.32      0.17      0.22       142
       

  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
for index in range(0,10):
    print(f'Post: {X_test.iloc[index]}')
    print(f'Prediction: {mlb.inverse_transform(np.array([y_pred[index]]))}')
    print(f'True Labels: {mlb.inverse_transform(np.array([y_test[index]]))}')

Post: programmer year total professionally little company developer began day benefit databinding query directly querying manually adding approach generally considered cleaner simpler term deploying case manually adding grained asking assigned bug deal populate love clean unnecessary logic push library
Prediction: [()]
True Labels: [('.net', 'c#', 'sql-server')]
Post: implementing nhibernate existing process bulk inserting updating nhibernate aware occurring backend initiated nhibernate nhibernate mentioned storing httpcontext callcontext duration lifecycle implemented afraid cost initializing nhibernate significant performance hit approach initializing sense sessionfactory httpcontext callcontext mapping regenerated
Prediction: [()]
True Labels: [('asp.net',)]
Post: sketch nx sketch sketch geometric shape customer sketch supposed serve cross section sketch supposed extrude written purpose gap believe sketch yet gap kindly tell written gap sketch thank strict offimports systemimports n

In [18]:
from mlflow.sklearn import save_model
from mlflow.models.signature import infer_signature
signature = infer_signature(X_test, y_pred)
save_model(model, "my_model", signature=signature)

MlflowException: Path 'my_model' already exists

## TODO
* Joblib in streamlit

## SEE
* Data augmentation