In [5]:
import os
import re

import gensim
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import unidecode
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from tqdm import tqdm

In [6]:
def jaccard_score(A, B, n=0):
    """calcul jaccard_score between A and B dataframe regarding to a selected column and n neighbors

    Args:
        A (pd.DataFrame): first dataframe with neighbors words
        B (pd.DataFrame): second dataframe with neighbors words
        n (int, optional): number of neighbors to calc Jaccard score. Defaults to 0.

    Returns:
        float: return jaccard score
    """
    if n == 0: n = min(len(A), len(B))
    a = A[:n]
    b = B[:n]
    union = len(set(a + b))
    inter = len([e for e in a if e in b])
    return inter/union

def concat_similarity_df(df):
    '''
    concat columns of similarity df to consider only on "keywords group"
    '''
    com_key = []
    for i in df:
        com_key += df[i].tolist()
        
    return list(filter(lambda x: x!='__nokey__', com_key))

def calc_aj(models, colname='similarity_df'):
    """calculate average jaccard for multiple keywords and plot matrix with plotly

    Args:
        models (dict): models dict from train_word2vec
        keywords (list): list of keywords (refered to columns in df) to compute aj on
    """
    aj_matrix = []
    for community in tqdm(models):
        ref_keys = concat_similarity_df(df=models.get(community).get(colname))
        ajs = []
        for to_compare in models:
            comp_keys = concat_similarity_df(df=models.get(to_compare).get(colname))
            
            if community == to_compare: ajs.append(1); continue
            
            ajs.append(jaccard_score(ref_keys, comp_keys, n=len(ref_keys)))
            # ajs.append(np.mean([jaccard_score(ref_keys, comp_keys, n=n) for n in range(1,max(len(ref_keys),len(comp_keys)))]))
            
        aj_matrix.append(ajs)
        # fig = go.Figure(data=go.Heatmap(
        #     z=aj_matrix,
        #     # x=list(models.keys()),
        #     # y=list(models.keys()),
        #     colorscale='Viridis')
        # )
        # fig.show()
    return aj_matrix

def light_prepro(mot):
    """clean string of accent and useless space

    Args:
        mot (str): string

    Returns:
        str: cleaned string
    """
    return unidecode.unidecode(mot.lower().strip())


tsne hyperparam optimization

In [8]:
models = {}
for file in os.listdir('/home/matthieu-inspiron/tmp/data/'):
    name = file.split('.')[0].split('_')
    if name[2] not in models: models[name[2]] = {}
    if name[1] == 'hashtag':
        models[name[2]]['hashtag_sim'] = pd.read_csv(f'/home/matthieu-inspiron/tmp/data/{file}')
    else:
        models[name[2]]['similarity_df'] = pd.read_csv(f'/home/matthieu-inspiron/tmp/data/{file}')
        

In [9]:
models.get('66').get('similarity_df').columns
## on va se concentrer sur une ensemble réduit dekeyword pour le moment
keywords = ['climat', 'environnement', 'nucléaire', 'éolien', 'recyclage', 'carbone', 'nature', 'pollution']
for v in models.values():
    v['similarity_df'] = v.get('similarity_df')[keywords]

In [10]:
aj_matrix = calc_aj(models=models)


100%|██████████| 46/46 [00:00<00:00, 50.34it/s]


In [17]:
# inversion des distances de la matrice
true_aj_matrix = list(map(lambda x: [1-i for i in x], aj_matrix))

In [18]:
fig = go.Figure(data=go.Heatmap(
                z=true_aj_matrix,
                x=list(models.keys()),
                y=list(models.keys()),
                colorscale='Viridis'))
fig.show()

# TSNE


In [12]:
import plotly.express as px
from plotly.subplots import make_subplots


In [20]:
n_components = 2
fig = make_subplots(rows=1, cols=5, subplot_titles=([f'perplexity: {i}' for i in [2,5,30,50,100]]))

for i, p in enumerate([2,5,30,50,100]):
    tsne = TSNE(n_components, perplexity=p, learning_rate=max(len(true_aj_matrix)/12/4, 50), metric='precomputed', n_iter=5000)
    tsne_result = tsne.fit_transform(true_aj_matrix)
    tsne_result_df = pd.DataFrame({'tsne_1': tsne_result[:,0], 'tsne_2': tsne_result[:,1], 'community': list(models.keys())})
    fig.add_trace(
        go.Scatter(x=tsne_result_df["tsne_1"], y=tsne_result_df["tsne_2"], mode="markers+text",text=tsne_result_df['community'], textposition="bottom center"),
        row=1,col=i+1
    )

    # fig = px.scatter(tsne_result_df, x="tsne_1", y="tsne_2", 
    #              text='community')
    # fig.update_traces(textposition="bottom right")
fig.show()













In [14]:
tsne = TSNE(n_components, perplexity=50, learning_rate=100, metric='precomputed', n_iter=5000)
tsne_result = tsne.fit(aj_matrix)





In [15]:
tsne_result.n_iter_

649