# Data clean provisoire

Cette partie consiste à faire un pré-traitement rapide des données 
afin de vérifier l'implémentation du modèle : LATENT DIRICHLET ALLOCATION

LDA est une des techniques de modélisation de Topics.

Notre but est d'obtenir des thèmes issus des titres des publications.

In [3]:
import os
import pandas as pd

filesnames=os.listdir(os.getcwd() )
filesnames = [f for f in filesnames if (f.lower().endswith(".csv"))]

print(filesnames)

['publication_year.csv', 'author.csv', 'year.csv', 'publication.csv', 'publication_venue.csv', 'keyword.csv', 'venue.csv', 'publication_author.csv', 'Publication_keywords.csv']


In [5]:
data_Publications = pd.read_csv('publication.csv',sep=',',encoding='cp1252')

df_Publications = pd.DataFrame(data_Publications)


In [11]:
#Pour tester le LDA, je sélectionne les 100 first publications
df_Publications = df_Publications.head(100)
#del df_Publications['categorie'] .... Cette ligne m'a permis de supprimer les colonnes non désirées

#Transformation de la colonne id_publication en index du dataframe
df_Publications = df_Publications.set_index('id_publication')

df_Publications

Unnamed: 0_level_0,article_title
id_publication,Unnamed: 1_level_1
conf/3dim/AbarghoueiB19,To Complete or to Estimate. That is the Questi...
conf/3dim/AbbeloosCCTD17,3D Object Discovery and Modeling Using Single ...
conf/3dim/AbrevayaWB18,Spatiotemporal Modeling for Efficient Registra...
conf/3dim/AdanR19,Reconstruction of As-is Semantic 3D Models of ...
conf/3dim/AhmedMG17,Point Cloud Registration with Virtual Interest...
...,...
conf/3dim/KarakottasZSAGZ19,360° Surface Regression with a Hyper-Sphere Loss.
conf/3dim/KasperH19,Multiple Point Light Estimation from Low-Quali...
conf/3dim/KaushikL19,UnDispNet: Unsupervised Learning for Multi-Sta...
conf/3dim/KimMQ18,Multi-planar Monocular Reconstruction of Manha...


In [12]:
#Suppression du texte dans les crochets, de la ponctuation et des mots contenant des nombres.

import re
import string

def clean_text_round1(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [14]:

data_clean = pd.DataFrame(df_Publications.article_title.apply(round1))
data_clean

Unnamed: 0_level_0,article_title
id_publication,Unnamed: 1_level_1
conf/3dim/AbarghoueiB19,to complete or to estimate that is the questio...
conf/3dim/AbbeloosCCTD17,object discovery and modeling using single rg...
conf/3dim/AbrevayaWB18,spatiotemporal modeling for efficient registra...
conf/3dim/AdanR19,reconstruction of asis semantic models of uno...
conf/3dim/AhmedMG17,point cloud registration with virtual interest...
...,...
conf/3dim/KarakottasZSAGZ19,° surface regression with a hypersphere loss
conf/3dim/KasperH19,multiple point light estimation from lowqualit...
conf/3dim/KaushikL19,undispnet unsupervised learning for multistage...
conf/3dim/KimMQ18,multiplanar monocular reconstruction of manhat...


In [15]:

def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [73]:

data_clean = pd.DataFrame(data_clean.article_title.apply(round2))
data_clean

Unnamed: 0_level_0,article_title
id_publication,Unnamed: 1_level_1
conf/3dim/AbarghoueiB19,to complete or to estimate that is the questio...
conf/3dim/AbbeloosCCTD17,object discovery and modeling using single rg...
conf/3dim/AbrevayaWB18,spatiotemporal modeling for efficient registra...
conf/3dim/AdanR19,reconstruction of asis semantic models of uno...
conf/3dim/AhmedMG17,point cloud registration with virtual interest...
...,...
conf/3dim/KarakottasZSAGZ19,° surface regression with a hypersphere loss
conf/3dim/KasperH19,multiple point light estimation from lowqualit...
conf/3dim/KaushikL19,undispnet unsupervised learning for multistage...
conf/3dim/KimMQ18,multiplanar monocular reconstruction of manhat...


#### Creation of our Document-Term Matrix

In [17]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.article_title)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0_level_0,accelerated,accurate,ace,acquisition,active,adapting,adaptive,adjustment,adversarial,adversarially,...,view,viewpoints,views,virtual,vision,visual,weak,weakly,wild,world
id_publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
conf/3dim/AbarghoueiB19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AbbeloosCCTD17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AbrevayaWB18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AdanR19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AhmedMG17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
conf/3dim/KarakottasZSAGZ19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KasperH19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KaushikL19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KimMQ18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Let's pickle it for later use
#Cette commande permet d'enrégistrer un DataFrame obtenu après plusieurs traitements
#afin de le réutiliser dans d'autres notebook jupyter
import pickle
data_dtm.to_pickle("dtm.pkl")

In [19]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

# Latent Dirichlet Allocation

Pour l'instant, cette analyse n'est pas encore très pertinente parce qu'on ne s'interesse qu'à une très petite
plage de publications. 
Après le Data cleaning, on pourra récupérer les mots qui sonts les plus récurrents et surtout, on 
pourra utiliser toutes les publications pour l'analyse. 

## Topic Modeling - Tentative 1

In [20]:

data = pd.read_pickle('dtm.pkl')
data

Unnamed: 0_level_0,accelerated,accurate,ace,acquisition,active,adapting,adaptive,adjustment,adversarial,adversarially,...,view,viewpoints,views,virtual,vision,visual,weak,weakly,wild,world
id_publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
conf/3dim/AbarghoueiB19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AbbeloosCCTD17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AbrevayaWB18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AdanR19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AhmedMG17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
conf/3dim/KarakottasZSAGZ19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KasperH19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KaushikL19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KimMQ18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
#Dans le terminal : conda install -c conda-forge gensim

#importation des modules nécessaires pour la LDA
from gensim import matutils, models
import scipy.sparse

In [68]:

tdm = data.transpose()
tdm.head()

id_publication,conf/3dim/AbarghoueiB19,conf/3dim/AbbeloosCCTD17,conf/3dim/AbrevayaWB18,conf/3dim/AdanR19,conf/3dim/AhmedMG17,conf/3dim/AhujaM19,conf/3dim/Al-HamiL17,conf/3dim/AlexandrovPZV17,conf/3dim/AliGS18,conf/3dim/AlldieckMXTP18,...,conf/3dim/JackMES17,conf/3dim/Jaramillo17,conf/3dim/JaritzCWPN18,conf/3dim/JiangPFFD17,conf/3dim/KanoOTIHM17,conf/3dim/KarakottasZSAGZ19,conf/3dim/KasperH19,conf/3dim/KaushikL19,conf/3dim/KimMQ18,conf/3dim/KimRJFH17
accelerated,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accurate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
ace,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acquisition,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
active,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
#On met term-document matrix dans un format gensim pour puvoir appliquer LDA
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)


In [70]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
#Chaque mot (k) est associé au numéro de ligne (v)

In [71]:
#id2word

In [72]:
#Implémentation du modèle 

lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, passes=300)
lda.print_topics()

#Le but ici est de faire varier le nombre de topics et/ou le nombre d'itérations à effectuer par
#le modèle

[(0,
  '0.032*"efficient" + 0.022*"learning" + 0.022*"estimation" + 0.022*"largescale" + 0.011*"structurefrommotion" + 0.011*"light" + 0.011*"scenes" + 0.011*"models" + 0.011*"unified" + 0.011*"sensors"'),
 (1,
  '0.020*"monocular" + 0.020*"stereo" + 0.020*"reconstruction" + 0.020*"surface" + 0.020*"tracking" + 0.020*"estimation" + 0.010*"underwater" + 0.010*"scenes" + 0.010*"rgbd" + 0.010*"localization"'),
 (2,
  '0.038*"estimation" + 0.031*"depth" + 0.023*"single" + 0.016*"pose" + 0.016*"image" + 0.016*"scene" + 0.016*"using" + 0.016*"rigid" + 0.016*"motion" + 0.008*"body"'),
 (3,
  '0.025*"using" + 0.025*"images" + 0.017*"human" + 0.017*"fast" + 0.017*"pose" + 0.017*"multiview" + 0.017*"disparity" + 0.017*"estimation" + 0.009*"maps" + 0.009*"evaluation"'),
 (4,
  '0.022*"detection" + 0.022*"incremental" + 0.012*"object" + 0.012*"visual" + 0.012*"monocular" + 0.012*"dynamic" + 0.012*"image" + 0.012*"objects" + 0.012*"odometry" + 0.012*"time"'),
 (5,
  '0.023*"learning" + 0.023*"nonri

## Topic Modeling - Tentative 2

Dans cette deuxième tentative, on utilisera le modèle sur des données plus significatives.
Il s'agit ici de ne considérer que les noms et les adjectifs dans les titres des publications.

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

from nltk import word_tokenize, pos_tag

In [33]:
#cette fonction retourne un texte ne contenant que des noms et des adjectifs

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    
    #La fonction is_noun_adj vérifie que les 2 premiers caractères du type de mot (pos) sont
    #soit 'NN' pour les noms, soit 'JJ' pour les adjectifs
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    
    #On récupèrera une liste de mots ici : Uniquement les mots qui ont été tagués 
    #comme étant des noms ou des adjectifs par la fct is_noun_adj
    #pos_tag(tokenized) permet de créer une liste de couples (mot, type_de_mot)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)]

    #On retourne un texte contenant tous ces mots 
    return ' '.join(nouns_adj)

In [41]:
data_clean = pd.read_pickle('data_clean.pkl')

In [39]:
#J'applique la fonction pour ne filtrer que les noms et les adjectifs dans les titres
data_clean_nouns_adj = pd.DataFrame(data_clean.article_title.apply(nouns_adj))
data_clean_nouns_adj

Unnamed: 0_level_0,article_title
id_publication,Unnamed: 1_level_1
conf/3dim/AbarghoueiB19,question multitask approach completion monocul...
conf/3dim/AbbeloosCCTD17,object discovery single rgbd images multiple o...
conf/3dim/AbrevayaWB18,spatiotemporal modeling efficient registration...
conf/3dim/AdanR19,reconstruction asis semantic models unorganise...
conf/3dim/AhmedMG17,point cloud registration virtual interest poin...
...,...
conf/3dim/KarakottasZSAGZ19,° surface regression hypersphere loss
conf/3dim/KasperH19,multiple point light estimation lowquality rec...
conf/3dim/KaushikL19,undispnet multistage monocular depth prediction
conf/3dim/KimMQ18,multiplanar monocular reconstruction manhattan...


In [49]:
# Création du document-term matrix avec que les noms et les adjetifs
from sklearn.feature_extraction.text import CountVectorizer

cv_nouns_adj = CountVectorizer(stop_words='english')
data_cv_nouns_adj = cv_nouns_adj.fit_transform(data_clean_nouns_adj.article_title)
data_dtm_nouns_adj = pd.DataFrame(data_cv_nouns_adj.toarray(), columns=cv_nouns_adj.get_feature_names())
data_dtm_nouns_adj.index = data_clean_nouns_adj.index
data_dtm_nouns_adj


Unnamed: 0_level_0,accurate,ace,acquisition,active,adaptive,adjustment,adversarial,alignment,analysis,animation,...,videos,view,viewpoints,views,virtual,vision,visual,weak,wild,world
id_publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
conf/3dim/AbarghoueiB19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AbbeloosCCTD17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AbrevayaWB18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AdanR19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/AhmedMG17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
conf/3dim/KarakottasZSAGZ19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KasperH19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KaushikL19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conf/3dim/KimMQ18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
#gensim corpus
corpus_nouns_adj = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtm_nouns_adj.transpose()))

#vocabulary dictionary
id2word_nouns_adj = dict((v, k) for k, v in cv_nouns_adj.vocabulary_.items())

'world'

In [54]:

ldana = models.LdaModel(corpus=corpus_nouns_adj, num_topics=20, id2word=id2word_nouns_adj, passes=300)
ldana.print_topics()

[(0,
  '0.039*"object" + 0.039*"single" + 0.020*"images" + 0.020*"multiple" + 0.020*"rgbd" + 0.020*"instances" + 0.020*"discovery" + 0.020*"feature" + 0.020*"synthetic" + 0.020*"evaluation"'),
 (1,
  '0.042*"phaseshift" + 0.042*"systems" + 0.042*"compensation" + 0.042*"structuredlight" + 0.042*"motion" + 0.042*"totalvariation" + 0.042*"framework" + 0.002*"realtime" + 0.002*"realistic" + 0.002*"query"'),
 (2,
  '0.061*"stereo" + 0.031*"underwater" + 0.031*"removal" + 0.031*"technique" + 0.031*"multiscale" + 0.031*"cnn" + 0.031*"pattern" + 0.031*"active" + 0.031*"cameras" + 0.031*"multiple"'),
 (3,
  '0.047*"depth" + 0.047*"estimation" + 0.047*"shape" + 0.047*"synthesis" + 0.047*"structureaware" + 0.024*"completion" + 0.024*"question" + 0.024*"approach" + 0.024*"multitask" + 0.024*"monocular"'),
 (4,
  '0.042*"unsupervised" + 0.042*"probability" + 0.042*"succinct" + 0.042*"learning" + 0.042*"sips" + 0.042*"inlierness" + 0.042*"points" + 0.002*"range" + 0.002*"radioactive" + 0.002*"realis