In [148]:
#import needed libraries
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('postgresql://read_only_user@datathon.data-lab.io:5432/postgres')
import numpy as np
from sklearn import preprocessing

import nltk
nltk.download('stopwords')

In [149]:
#select dat from provided postgres databse
sql = "select description from public.standard where cast(pubdate as date) between '2018-01-01' and '2018-04-20' and length(description)>0;"
data = pd.read_sql_query(sql,con=engine)
data.head()

Unnamed: 0,description
0,"Sieg an Norweger Schmid, Rehrl Siebenter, Grub..."
1,Zunächst keine Berichte über Schäden oder Opfer
2,Neuer Spot für den Cactus
3,Sechster Sieg für Toronto in Folge
4,Polizei: Gruppe plante größeren Sprengstoffans...


In [151]:
#tokenize the articles into list of words
#with stopwords we are removing useless general words
from nltk.tokenize.casual import casual_tokenize
tokenizer = lambda text: casual_tokenize(text, preserve_case=False)
LabeledSentence = gensim.models.doc2vec.TaggedDocument
content = []
j=0

from nltk.corpus import stopwords
stopWords = set(stopwords.words('german'))




tokens_orig = data['description'].apply(tokenizer)
tokens_orig = list(tokens)

#final clean-up of the tokens
tokens = []
for token_orig in tokens_orig:
    token = []
    for word in token_orig:
        #if (word not in stopWords and ['.' , '!', ',', '?', '"', ':', '-', ' ', '|'] not in word):
        if (word not in stopWords and bool(re.match('^[A-z]+$', word))):
            token.append(word)
    tokens.append(token)
            
            
#prepare labeled sentences dataset do doc2vec alg
for token in tokens:
    content.append(LabeledSentence(token,[j]))
    j+=1

In [164]:
#train shallow neural network doc2vec and compute senteces vectors for further analysis 
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

d2v_model = gensim.models.Doc2Vec(content, size = 200, window = 10, min_count = 50, workers=7, dm = 1, 
                alpha=0.025, min_alpha=0.001)

2018-04-21 13:04:34,152 : INFO : collecting all words and their counts
2018-04-21 13:04:34,153 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-04-21 13:04:34,203 : INFO : PROGRESS: at example #10000, processed 62317 words (1273710/s), 22205 word types, 10000 tags
2018-04-21 13:04:34,251 : INFO : collected 35437 word types and 20000 unique tags from a corpus of 20000 examples and 125206 words
2018-04-21 13:04:34,251 : INFO : Loading a fresh vocabulary
2018-04-21 13:04:34,271 : INFO : min_count=50 retains 238 unique words (0% of original 35437, drops 35199)
2018-04-21 13:04:34,272 : INFO : min_count=50 leaves 26575 word corpus (21% of original 125206, drops 98631)
2018-04-21 13:04:34,273 : INFO : deleting the raw counts dictionary of 35437 items
2018-04-21 13:04:34,274 : INFO : sample=0.001 downsamples 150 most-common words
2018-04-21 13:04:34,275 : INFO : downsampling leaves estimated 18009 word corpus (67.8% of prior 26575)
2018-04-21 13:04:34,277 :

In [168]:
# shows the similar articles to article with id = 100
print (d2v_model.docvecs.most_similar(100, topn = 10))
print(tokens[100])

[(13414, 0.7639433145523071), (3411, 0.7537654638290405), (10956, 0.7512814402580261), (13793, 0.7507666349411011), (11712, 0.748617947101593), (10428, 0.748516321182251), (17499, 0.7484363913536072), (1447, 0.7472888231277466), (3237, 0.7460149526596069), (11026, 0.7457857131958008)]
['geldpolitische', 'experiment', 'japan', 'europa', 'sorgen', 'wichtige', 'lehren', 'aufzeigen']


In [None]:
#cluster vectors into specific groups

NUM_CLUSTERS = 15

vectors = []

for token in tokens:
    vectors.append(d2v_model.infer_vector(token))
print("INFO: infer_vectors done.")

kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25, avoid_empty_clusters=True)

print("INFO: KMeansClusterer done.")
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
print("INFO: assign_clusters done.")

In [162]:
#Print the most frequent words in each clusters
from collections import Counter

clusters = assigned_clusters

for cluster_iter in list(set(clusters)):
    all_words = []
    for i in range(0,len(tokens)):
        sentence = tokens[i]
        cluster = clusters[i]
        if cluster == cluster_iter:
            all_words = all_words+sentence
    counts = Counter(all_words)
    print("Cluster: " + str(cluster_iter))
    print(counts.most_common(100))
    print("\n")

Cluster: 0
[('krone', 4), ('politische', 4), ('neue', 4), ('serbien', 4), ('entwickelt', 3), ('kritisieren', 3), ('seien', 3), ('windows', 3), ('regisseur', 3), ('gestorben', 3), ('frankreichs', 2), ('sucht', 2), ('hoch', 2), ('situation', 2), ('fest', 2), ('verabschiedet', 2), ('krise', 2), ('zuvor', 2), ('android', 2), ('remis', 2), ('nutzungsrechte', 2), ('hollande', 2), ('aufs', 2), ('automatisch', 2), ('technik', 2), ('gleich', 2), ('erscheinen', 2), ('spielt', 2), ('kurz', 2), ('tiroler', 2), ('spur', 2), ('moore', 2), ('zeitung', 2), ('initiative', 2), ('erneut', 2), ('ermittlungen', 2), ('fokus', 2), ('mehrheit', 2), ('sollen', 2), ('kehrt', 2), ('besten', 2), ('drucker', 2), ('demonstration', 2), ('wochenende', 2), ('besser', 2), ('norden', 2), ('kritisierte', 2), ('angaben', 2), ('gewalt', 2), ('franziskus', 2), ('spitze', 2), ('kenia', 2), ('partner', 2), ('thriller', 2), ('kraft', 2), ('starken', 2), ('fallen', 2), ('entstehung', 2), ('finale', 2), ('klug', 2), ('kosovo', 2

Cluster: 10
[('mehr', 9), ('legt', 6), ('belasten', 5), ('schwache', 5), ('ehemaligen', 5), ('tat', 5), ('spezial', 5), ('nimmt', 5), ('schauspieler', 5), ('untersucht', 5), ('stelle', 5), ('erhielt', 5), ('zweifel', 4), ('hilfe', 4), ('erfolgreich', 4), ('gefordert', 4), ('schweden', 4), ('vorgestellt', 4), ('gewinnt', 4), ('ab', 4), ('konnte', 4), ('zwei', 4), ('tablets', 4), ('angebot', 4), ('zuvor', 4), ('barbara', 4), ('missbrauch', 4), ('tod', 4), ('feiern', 4), ('zahlreiche', 4), ('berichtet', 4), ('folgt', 4), ('neuem', 4), ('protest', 4), ('finanzielle', 3), ('red', 3), ('connected', 3), ('doppelzimmer', 3), ('weiterer', 3), ('android', 3), ('geehrt', 3), ('region', 3), ('ziehen', 3), ('lebt', 3), ('chef', 3), ('wirkung', 3), ('vergeben', 3), ('sprechen', 3), ('fokus', 3), ('st', 3), ('abgesagt', 3), ('treffen', 3), ('freiheit', 3), ('breite', 3), ('verletzte', 3), ('sorgen', 3), ('kanzler', 3), ('vorerst', 3), ('angeblich', 3), ('elf', 3), ('sorgt', 3), ('premier', 3), ('spra