In [1]:
#importing required libraries
from gensim import corpora #pip install gensim ou pip install -U gensim
from collections import defaultdict
from gensim import similarities
from gensim import models
import json #pip install json
import re
import string
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop #pip install spacy / try pip install stop-words
import mysql.connector #pip install mysql-connector-python-rf

In [2]:
#reading json file
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f) 
    return (data)

In [3]:
#writing json file
def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [4]:
def database(): #Read Database
    db=mysql.connector.connect(host='192.185.4.44',
                                           database='veillise_test2',
                                           user='veillise_veille2',
                                           password='@@@@@@@@@@@@@@')
    if db.is_connected():
            db_Info = db.get_server_info()
            print("Connected to MySQL Server version ", db_Info)
            cursor = db.cursor()
            cursor.execute("select database();")
            record = cursor.fetchone()
            print("You're connected to database: ", record)
    

    mycursor=db.cursor()
    query = "select id, titre, article, group_nb,lang from articles where lang='French' AND (group_nb = \"\" OR group_nb IS NULL) AND (created_at > DATE_ADD(NOW(), INTERVAL -36 HOUR));"

    cursor.execute(query)
    result = cursor.fetchall()
    if cursor.rowcount != 0 :
        print("No. of articles not analyzed ", cursor.rowcount)
    data = []
    for row in result:
        article = {}
        article["id"] = row[0]
        article["titre"]=row[1]
        article["lang"]=row[4]
        article["group_nb"]=row[3]
        article["article"]=row[2]
        data.append(article)
        
    print(len(data))
    return data

In [5]:
regex = re.compile(r'<[^>]+>')
pattern = r'[' + string.punctuation + ']'
def Clean_Data(txt):
    txt = txt.lower() #Lower text
    txt = re.sub(regex,'', txt) #Remove Html syntaxe
    txt = re.sub(r'http\S+', '', txt) #Remove URLs
    txt = re.sub(pattern,'', txt)
    return txt

In [6]:
def Cleaned_data(data): #List of List
    for i in range(len(data)):
        (data[i])["titre"] = Clean_Data((data[i])["titre"])
        (data[i])["article"] =  Clean_Data((data[i])["titre"]) + Clean_Data((data[i])["article"])
        
    return data #List of List 

In [7]:
#Treat data 
def Treat_Data(data,string): #List of List 
    #Creating a list of stopwords
    #Removing stopwords ( Depends on BD too )
    stp = ["finalement", "autres", "h", "été", "est-ce", "qu’on", "vraiment" ,"c’est" ,"fera","tags","Tags"]
    stoplist= list(fr_stop)+ stp
    txts = [[word for word in document[string].lower().split() if word not in stoplist]for document in data]
    #Calculating frequency of each word 
    frequency = defaultdict(int)
    for text in txts:
        for token in text:
            frequency[token] += 1
    #Removing words that appear only once
    txts = [[token for token in text if frequency[token] > 1]for text in txts]
    #Creating a dictionary
    gensim_dictionary = corpora.Dictionary(txts)
    #Vectorizing the corpus
    gensim_corpus = [gensim_dictionary.doc2bow(text) for text in txts]
    return gensim_corpus,gensim_dictionary

In [8]:
def LSI(gensim_corpus,gensim_dictionary,numtopics): #creating LSI model
    lsi = models.LsiModel(gensim_corpus, id2word=gensim_dictionary, num_topics=numtopics)
    return lsi

In [9]:
'''-----Treating the query-------
    Query is the document that will be compared to 
    Doc is an article '''
def Treat_Query(doc):
    #Creating bow vector
    vec_bow = gensim_dictionary.doc2bow(doc.lower().split())
    #Converting the query to LSI space
    vec_lsi = lsi[vec_bow]  
    return vec_lsi
    

In [10]:
def Similarity(vec_lsi,gensim_corpus,lsi): #Testing similarity
    index = similarities.MatrixSimilarity(lsi[gensim_corpus])  
    simil = index[vec_lsi]  
    simil=sorted(list(enumerate(simil)),key=lambda item: -item[1])
    return simil


In [11]:
def Match_Sim(data,simil,nbsim,acc): #Match similar articles with a special id
    for doc_position, doc_score in simil:
        if doc_score>acc:
            (data[doc_position])["group_nb"] = nbsim 
    return data

In [12]:
data=database() #Read DataBase
write_data("./DataFINAL.json",data)
nbsim = 0 #id same articles
docs = load_data("./DataFINAL.json") 
docs = Cleaned_data(docs)
gensim_corpus,gensim_dictionary = Treat_Data(docs,"titre")
lsi = LSI(gensim_corpus,gensim_dictionary,3)
for i in range(len(docs)):
    if (docs[i])["group_nb"] == 0: 
        vec_lsi = Treat_Query((docs[i])["titre"])
        simil = Similarity(vec_lsi,gensim_corpus,lsi)
        if simil[i][1]==0.0:
            docs[i]["group_nb"]="NULL"
        else:
            nbsim += 1
            docs = Match_Sim(docs,simil,nbsim,0.99)

Connected to MySQL Server version  5.7.23-23
You're connected to database:  ('veillise_test2',)
No. of articles not analyzed  264
264


In [13]:
score = [] #Just to test the algorithm
for i in range(len(docs)):
    score.append([docs[i]['group_nb'],docs[i]['id']])
print(score)

[[1, 339234], [2, 339233], ['NULL', 339231], [3, 339232], [4, 339230], [22, 339228], [5, 339229], [5, 339226], [6, 339227], [38, 339224], [12, 339225], [22, 339223], [7, 339222], [7, 339221], [1, 339219], [5, 339218], ['NULL', 339209], ['NULL', 339210], ['NULL', 339211], ['NULL', 339212], ['NULL', 339213], ['NULL', 339214], ['NULL', 339215], [22, 339216], [22, 339207], [7, 339206], [8, 339194], [22, 339195], [7, 339365], [5, 339366], [22, 339364], [5, 339359], [22, 339356], [2, 339357], [9, 339358], [22, 339355], [22, 339352], [4, 339353], [10, 339351], [9, 339348], [11, 339347], [12, 339343], [22, 339341], [22, 339342], [5, 339339], [13, 339337], [14, 339336], [5, 339334], [11, 339335], [15, 339317], [22, 339318], [4, 339321], [10, 339333], [5, 339332], [24, 339331], [11, 339327], [13, 339326], [7, 339325], [22, 339323], [16, 339309], [22, 339310], [38, 339311], [17, 339312], [18, 339313], [8, 339314], [19, 339315], [20, 339316], [21, 339303], [22, 339304], [22, 339305], ['NULL', 3393

In [14]:
for i in range(len(docs)):
    print((docs[i])['titre'] , ',', (docs[i])['group_nb'])
    print()


conflit en ukraine quatre régions vont être intégrées à la russie
 , 1


en jouant les médiateurs dans le conflit en ukraine larabie saoudite cherche à revenir au devant de la scène
 , 2


laura dantan retrouvée
 , NULL


energie atomique lalgérie dévoile sa stratégie
 , 3


l’ua l’oci et la ligue arabe soutiennent la candidature de l’algérie au conseil de sécurité la symphonie algérienne séduit
 , 4


tizi ouzou huawei met en place une académie dexcellence
 , 22


des festivités du 1er novembre bien particulières 30000 logements seront distribués
 , 5


disponibilité du médicament aoun rencontre les pharmaciens dofficine
 , 5


révision et allègement du programme scolaire les propositions des spécialistes
 , 6


journée mondiale du tourisme célébration a minima à béjaïa
 , 38


concrétisation du partenariat amorcé entre les deux pays fin août benabderrahmane et borne à loeuvre
 , 12


tourisme comment attirer les étrangers
 , 22


officiellement invité au sommet arabe dalger que fera

In [15]:
d=mysql.connector.connect(host='192.185.4.44',
                                           database='veillise_test2',
                                           user='veillise_veille2',
                                           password='veille2@2020@')

mycursor = d.cursor()
query2 = "select id, titre, article, group_nb,lang from articles where lang='French' AND (created_at > DATE_ADD(NOW(), INTERVAL -36 HOUR));"
mycursor.execute(query2)
myresult = mycursor.fetchall()
for x in myresult:
    
    print(x[0], ':', x[1],'grp nb: ', x[3])
    print()

339234 : 
Conflit en Ukraine Quatre régions vont être intégrées à la Russie
 grp nb:  0

339233 : 
En jouant les médiateurs dans le conflit en Ukraine L'Arabie saoudite cherche à revenir au- devant de la scène
 grp nb:  0

339231 : 
L'aura d'antan retrouvée
 grp nb:  0

339232 : 
Energie atomique L'Algérie dévoile sa stratégie
 grp nb:  0

339230 : 
L’UA, L’OCI et La ligue arabe soutiennent la candidature de l’Algérie au conseil de sécurité La symphonie algérienne séduit
 grp nb:  0

339228 : 
Tizi Ouzou Huawei met en place une Académie d'excellence
 grp nb:  0

339229 : 
Des festivités du 1er novembre bien particulières 30000 logements seront distribués
 grp nb:  0

339226 : 
Disponibilité du médicament Aoun rencontre les pharmaciens d'officine
 grp nb:  0

339227 : 
Révision et allègement du programme scolaire Les propositions des spécialistes
 grp nb:  0

339224 : 
Journée mondiale du tourisme Célébration a minima à Béjaïa
 grp nb:  0

339225 : 
Concrétisation du partenariat amorcé 