# Create 3 test-corpus
They will represent 3 different 'kinds of trend analysis': 
    first one will be a set of similar documents (by similar I mean documents sharing a main topic)
    second one will be a set containing 2/3 clearly defined groups of documents 
    (to check whether or not a model picks up the differences and clusters them correctly)
    third one will be a set of random documents.

In [1]:
# load the json files we have at our disposal
import json
import random 

filenames = ['blockchain.json', 'industria_4.0.json', 'trend_analisys.json', 'dump_solr.json']

docs = []
for i, filename in enumerate(filenames):
    with open(filename, 'r') as outfile:
        json_data = json.load(outfile)
        
    if i==2:
        # trend_analisys has a different format
        docs = docs + json_data
    else: 
        ## let's now retrieve the meaningful part of the json document
        # response{}--->docs[] 
        docs = docs + json_data['response']['docs']
   
    print("Number of documents: ",len(docs))
        

Number of documents:  96
Number of documents:  604
Number of documents:  897
Number of documents:  2274


## Clean Dataset
Remove duplicates and such

In [2]:
## many documents have a failed abstract, let's remove them
to_check = ' Questo sito web utilizza cookie tecnici e, previo Suo consenso, cookie di profilazione,'
docs = [doc for i, doc in enumerate(docs) if not(to_check.strip() in doc['abstract'][0].strip())]

print("New length after removing docs: ", len(docs))
# remove duplicates

index_to_remove = []
duplicates = 0
for i, doc in enumerate(docs):
    try:
        index = docs.index(doc, i+1, len(docs)-1)
        index_to_remove.append(index)
        duplicates = duplicates + 1
    except:
        None
docs = [doc for i, doc in enumerate(docs) if not(i in index_to_remove) ]

docs = [doc for doc in docs
            if not("Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T" in doc['title'])]

print("Number of duplicates: ", duplicates, "\nNew lenght:", len(docs))

## Adjust data format: title, abstract and url came in as list, but they're more useful as strings
for i, dictionary in enumerate(docs):
    for field in ['title', 'abstract', 'url']:
        if isinstance(dictionary[field], list):
            # re-format data to hold string instead of single-list item
            docs[i][field] = dictionary[field][0]

New length after removing docs:  2082
Number of duplicates:  86 
New lenght: 1946


## Document sets of type 1

In [36]:
path = 'TestDocs/similar_documents_' # add json afterwards

def save(name, file):
    with open(path + name + '.json', 'w') as out:
        json.dump(file, out)

blockchain_docs = docs[:60]
print([doc['title'] for doc in blockchain_docs[:60:10]]) # start through not past end, by step

save('blockchain', blockchain_docs)

facebook_docs = [doc for doc in docs if 'facebook'in doc['title'].lower()]
print(len(facebook_docs))
print([doc['title'] for doc in facebook_docs[: :10]]) # start through not past end, by step
save('facebook', facebook_docs)

industry_docs = [doc for doc in docs[100:300] if '4.0' in doc['title']]
print([doc['title'] for doc in industry_docs[: :10]], len(industry_docs)) # start through not past end, by step
save('industry4.0', industry_docs)


['Arriva il primo "Smartphone" Blockchain - Blockchain 4innovation', 'Abbanoa punta sulla blockchain per certificare la lettura dei contatori - Blockchain 4innovation', 'Sarà la blockchain a "salvare" le privatizzazioni? - Blockchain 4innovation', 'Blockchain & Bitcoin: una guida per capire e per orientarsi dedicata ai lettori di MilanoFinanza - Blockchain 4innovation', 'PA 4.0 e blockchain - Blockchain 4innovation', 'SpidChain, identità digitale 4.0 per PA e aziende - Blockchain 4innovation']
59
['Facebook, ex moderatrice denuncia: “Filtriamo contenuti scioccanti, siamo traumatizzati" - Repubblica.it', 'Facebook: ex moderatrice fa causa, traumatizzata da immagini - Hi-tech - ANSA.it', 'Facebook Cambridge Analytica, coinvolti 214 mila italiani - Corriere.it', 'Social e bambini: YouTube assume nuovi moderatori e Facebook lancia Messenger Kids - Corriere.it', 'Facebook Dating: ecco come funziona l’anti-Tinder di Zuckerberg - Corriere.it', 'Facebook contro le armi stampate in 3D']
['Innov

## Document sets of type 2 (2 clearly different arguments)


In [50]:
path = 'TestDocs/2_different_' # add json afterwards

def save(name, file):
    with open(path + name + '.json', 'w') as out:
        json.dump(file, out)

# facebook + space-related
diff_docs = []
diff_docs += facebook_docs[:50]
diff_docs += [doc for doc in docs for word in ['marte', 'nasa', 'astronomia', 'pianeta', 'giove', 'spazio']
              if word in doc['title'].lower() and 
              not('facebook' in doc['abstract'].lower())]
print([doc['title'] for doc in diff_docs[: :10]], len(diff_docs))

# shuffle it first
random.shuffle(diff_docs)
save('fb-spazio', diff_docs)



['Facebook, ex moderatrice denuncia: “Filtriamo contenuti scioccanti, siamo traumatizzati" - Repubblica.it', 'Facebook: ex moderatrice fa causa, traumatizzata da immagini - Hi-tech - ANSA.it', 'Facebook Cambridge Analytica, coinvolti 214 mila italiani - Corriere.it', 'Social e bambini: YouTube assume nuovi moderatori e Facebook lancia Messenger Kids - Corriere.it', 'Facebook Dating: ecco come funziona l’anti-Tinder di Zuckerberg - Corriere.it', 'IoT e Smart Manufacturing trovano spazio alla BI-MU', 'Ufo, Alieni, extraterrestri, Marte, Nasa, Tetricus, Pianeta rosso', 'La più grande luna di Saturno sotto una tempesta di polvere - Spazio & Astronomia - ANSA.it', 'Sapevi che queste tecnologie di tutti i giorni esistono grazie alla Nasa? - Wired'] 86


In [51]:
# blockchain + microsoft
blockchain = [doc for doc in docs if 'blockchain' in doc['title']]

microsoft = [doc for doc in docs if 'microsoft' in doc['title'].lower() and 
             not('blockchain' in doc['abstract'].lower())]
print([doc['title'] for doc in microsoft[: :10]], len(microsoft))
save('blockchain-microsoft', random.shuffle(microsoft+blockchain))

['Microsoft avanti tutta sull’Iot: investimenti per 5 miliardi di dollari - CorCom', 'Microsoft, ecco Surface 6 Pro e Surface Laptop 2', 'Microsoft porta Windows 10 nei robot', 'Microsoft: novità, aggiornamenti, innovazioni'] 32


In [54]:
## Document sets of type 3 (mixed arguments)
path = 'TestDocs/mixed_docs_' # add json afterwards

def save(name, file):
    with open(path + name + '.json', 'w') as out:
        json.dump(file, out)
# leave-out blockchain and industria  4.0 this time
ddocs = [doc for doc in docs if not('blockchain' in doc['title'].lower() or '4.0' in doc['title'])]
random.shuffle(ddocs)
mix1 = ddocs[:70]
print([doc['title'] for doc in mix1[: :10]], len(mix1))
save('mix1', mix1)

random.shuffle(ddocs)
mix2 = ddocs[:70]
save('mix2', mix2)

random.shuffle(ddocs)
mix3 = ddocs[:70]
save('mix3', mix3)

['Fake news con finti articoli di Repubblica: non credeteci e denunciate - Repubblica.it', 'Hyperloop TT, presentata la prima capsula del treno a levitazione magnetica - Corriere.it', "PC desktop, meno di 100 milioni all'anno", 'Canon: novità, recensioni e prezzi', 'Tutto quello che devi sapere sui taser in Italia - Wired', '«Ricordati che devi morire», un’app ce lo dice 5 volte al giorno - Corriere.it', "LG c'è uno smartwatch al fianco del V40"] 70


# Triple of doc test
e.g. (facebook, facebook, space) can the model recognize the similarities and differences?

In [71]:

space = [doc for doc in docs for word in ['marte', 'nasa', 'astronomia', 'pianeta', 'giove', 'spazio']
              if word in doc['title'].lower() and 
              not('facebook' in doc['abstract'].lower())]

doc_triple = [(facebook_docs[i],facebook_docs[i+1], space[i]) for i in range(min(len(facebook_docs), len(space)))]
print([(a['title'],b['title'], c['title']) for (a, b, c) in doc_triple[:2] ])

apple = [doc for doc in docs for word in ['apple', 'iPhone', 'iPod', 'iPad', 'tim cook', 'steve jobs']
              if word in doc['title'].lower() ]
len(apple)
football = [doc for doc in docs for word in ['calcio', 'champions league', 'juventus']
              if word in doc['title'].lower() ]
len(football)
doc_triple2 = [(apple[i],apple[i+1], football[i]) for i in range(min(len(football), len(apple)))]

doc_triple3 = [(space[i],space[i+1], football[i]) for i in range(min(len(football), len(space)))]

doc_triple4 = [(apple[i],apple[i+1], space[i]) for i in range(min(len(apple), len(space)))]


doc_triple += doc_triple2 + doc_triple3 + doc_triple4
len(doc_triple)
print(doc_triple[0])

random.shuffle(doc_triple)
path = 'TestDocs/3-docs-test/test.json'
# save it after shuffling the docs 
with open(path, 'w') as out:
    json.dump(doc_triple, out)

[('Facebook, ex moderatrice denuncia: “Filtriamo contenuti scioccanti, siamo traumatizzati" - Repubblica.it', 'Facebook, i cofondatori di Instagram lasciano la società - Repubblica.it', 'IoT e Smart Manufacturing trovano spazio alla BI-MU'), ('Facebook, i cofondatori di Instagram lasciano la società - Repubblica.it', 'Facebook, terremoto in Instagram: lasciano i due co-fondatori della app dopo scontro con Zuckerberg - Il Sole 24 ORE', 'Ufo, Alieni, Extraterrestri, Marte, Pianeta rosso, Nasa, Tetricus, Barsoom, Edgar Rice Burroughs')]
({'fonte_dati': 'trend_analisys', 'id': '1-http://www.repubblica.it/tecnologia/social-network/2018/09/25/news/facebook_un_ex_moderatrice_denuncia_il_social_network_filtriamo_contenuti_scioccanti_siamo_psicologicamente_traumatizzat-207322420/?rss', 'ta_id': 1, 'title': 'Facebook, ex moderatrice denuncia: “Filtriamo contenuti scioccanti, siamo traumatizzati" - Repubblica.it', 'abstract': 'UNA BUONA parte di contenuti spazzatura pubblicati su Facebook non arr

# Create pre-clustered docs

In [13]:
clusters = []
clusters.append([doc for doc in docs if 'marte' in doc['title'].lower()][4:])
clusters.append([doc for doc in docs if 'Tinder'.lower() in doc['title'].lower() and 
            'facebook'in doc['title'].lower()])
clusters.append([doc for doc in docs if 'elon musk' in doc['title'].lower() and 'tesla' in doc['title'].lower()])
clusters.append([doc for doc in docs if 'equinozio' in doc['title'].lower()])
clusters.append([doc for doc in docs if 'iphone xs' in doc['title'].lower() and 
           not('Note 9' in doc['title']) and not('Airpower' in doc['title']) and
           not('dentro' in doc['title'])
          and not('anteprima' in doc['title'])
           and not('Speciale' in doc['title'])
      ])

clusters.append([doc for doc in docs if 'fifa 19' in doc['title'].lower()])
clusters.append([doc for doc in docs if 'youtube' in doc['title'].lower()
           and 'bambini' in doc['title'].lower()])
clusters.append([doc for doc in docs if 'SpaceX'.lower() in doc['title'].lower() and
           'luna'in doc['title'].lower()])
clusters.append([doc for doc in docs if 'samsung' in doc['title'].lower()
      and not('4.0' in doc['title'])][:20:2])
clusters.append([doc for doc in docs if 'huawei' in doc['title'].lower() 
                 and not('Google' in doc['title']) and not('Wired Next' in doc['title'])][1:])

#print([doc ['title'] for cluster in clusters for doc in cluster])
#print([doc['title'] for doc in docs if 'marte' in doc['title'].lower()][5::2])
#print([doc['title'] for doc in docs if 'blockchain' in doc['title'].lower()
#      and ('smart' in doc['title'])])
print(len([doc for cluster in clusters for doc in cluster]))
filename = 'EvaluateModels/pre-clustered_docs_harder.json'
import json
with open(filename, 'w') as out:
    json.dump(clusters, out)

54


In [10]:
#print([doc['title'] for cluster in clusters for doc in cluster])
print([doc['title'] for doc in docs if 'iphone xs' in doc['title'].lower() and 
           not('Note 9' in doc['title']) and not('Airpower' in doc['title']) and
           not('dentro' in doc['title'])
          and not('anteprima' in doc['title'])
           and not('Speciale' in doc['title'])
      ])

["iPhone Xs Max tira 3-4 volte più dell'Xs - Hi-tech - ANSA.it", 'Apple conferma "per errore" iPhone XS, XS Max e XR', 'Aspettando iPhone Xs: il giorno di iOS 12', 'iPhone XS: perché Apple ha nascosto il notch?', 'Apple iPhone Xs e Xs Max: è troppo caro?', 'Apple lancia iPhone Xs e la versione Max: sempre più grandi, gli smartphone sono la nuova Tv - Corriere.it', 'Problemi per iPhone XS e XS Max: «Non si caricano se il cavo è collegato mentre lo schermo è spento» - Corriere.it', 'iPhone XS e XS Max: novità, scheda tecnica e prezzo', 'iPhone XS appiana le rughe, protestano gli utenti', 'iPhone XS: proteste per la carica troppo lenta']


## 2 major clusters set of docs

In [16]:
# cluster-like format
newdocs = []
print([doc['title'] for doc in docs if 'blockchain' in doc['title'].lower()][1:16])
print([doc['title'] for doc in docs if 'whatsapp' in doc['title'].lower()])

newdocs.append([doc for doc in docs if 'blockchain' in doc['title'].lower()][1:16])
newdocs.append([doc for doc in docs if 'whatsapp' in doc['title'].lower()])
    
# blockchain + whatsapp with slight noise
with open('EvaluateModels/blockchain_whatsapp_noise.json','w') as file:
    json.dump(newdocs, file)

["200 nuovi laureati per IBM: l'innovazione ha bisogno anche della Blockchain - Blockchain 4innovation", 'Arriva SIAChain la piattaforma Blockchain di SIA - Blockchain 4innovation', 'L’anticipo fatture in banca si gestisce con la Blockchain: zero errori e costi ridotti - Blockchain 4innovation', 'L’accordo Hong Kong-Singapore per l’utilizzo della Blockchain - Blockchain 4innovation', "B2Lab lancia Chainsquare: l'e-commerce scopre la blockchain - Blockchain 4innovation", 'Deloitte-Dnv GL, nasce la certificazione blockchain: “È solo l’inizio” - Blockchain 4innovation', 'Arriva Notarchain: la Blockchain tutta italiana - Blockchain 4innovation', 'Smart Contract e blockchain - Pagina 4 di 5 - Blockchain 4innovation', 'Quanto si risparmia usando gli smart contract nelle banche e nelle assicurazioni - Blockchain 4innovation', 'Abbanoa punta sulla blockchain per certificare la lettura dei contatori - Blockchain 4innovation', 'Smart Contract e blockchain - Blockchain 4innovation', 'Bocconi e Fe

# Get articles from Guardians API

In [158]:
# it only shows a few articles per-page; we need to get more pages by changing the page parameter in the query
import requests
import json
# get all articles published today that talk about a topic
#url = ("https://content.guardianapis.com/search?from-date=2018-11-05&show-fields=headline%2CbodyText&q=football&api-key="+apy_key)

# let's use python simplified way of using REST services https://gist.github.com/dannguyen/c9cb220093ee4c12b840
MY_API_KEY = "67d68b19-8fdd-4a08-a8de-b2d891f81f86"
API_ENDPOINT = 'http://content.guardianapis.com/search'
query = 'Juventus'
my_params = {
    'q': query,
    'from-date': "2018-1-03", # today
    'order-by': "relevance",
    'show-fields': 'headline,bodyText',
    'page-size': 200,
    'api-key': MY_API_KEY
}

response = requests.get(API_ENDPOINT, my_params)

# get the article array
jsonresponse = response.json()
jsonresponse = jsonresponse['response']['results']
jsonresponse = [d['fields'] for d in jsonresponse]
print(len(jsonresponse))
print([r['headline'] for r in jsonresponse][:20])
#print(jsonresponse)
#print([r['title'] for r in jsonresponse if 'Pyongyang' in r['title']])

200
['Manchester United 0-1 Juventus: Champions League – as it happened', 'Manchester United 0-1 Juventus: player ratings from Old Trafford', 'Juventus in daunting form as Ronaldo turns provider against Napoli', 'José Mourinho says Juventus’ spending can win them Champions League', 'Football transfer rumours: Sergej Milinkovic-Savic to Juventus?', 'Football transfer rumours: Chelsea to sign Daniele Rugani from Juventus?', 'Cristiano Ronaldo for €100m at 33? Indulgence or Juventus masterstroke?', 'Cristiano Ronaldo joining Juventus in €100m deal from Real Madrid', 'José Mourinho: Juventus are at ‘different level of quality and stability’', 'Football transfer rumours: Juventus and PSG in for Rhian Brewster?', 'Football transfer rumours: Cristiano Ronaldo to join Juventus for £88m?', 'Juventus confident of signing Cristiano Ronaldo from Real Madrid', 'Football transfer rumours: Ronaldo in talks with Juventus and Facebook?', 'Football transfer rumours: Malcom to Arsenal or Spurs? Mata to J

## Creating pre-clustered document
Same as above, I will create a pre-clustered document-set, by choosing a few topic and inserting news articles related to this topic.

In [162]:
clusters = []
queries = ["\"Mars\" AND Nasa", "Vaccination", "Nort Korea AND Nuclear", "Juventus"]
for q in queries:
    my_params['q'] = q
    my_params['from-date'] = "2018-1-03"
    response = requests.get(API_ENDPOINT, my_params)

    # get the article array
    jsonresponse = response.json()
    jsonresponse = jsonresponse['response']['results']
    jsonresponse = [d['fields'] for d in jsonresponse]
    print(len(jsonresponse))
    print([r['headline'] for r in jsonresponse][:20])
    clusters.append(jsonresponse[:20])
with open("EvaluateModels/english_4_clusters.json", "w") as out:
    json.dump(clusters, out, indent=4)
print("File Saved")

74
['The lunar gateway: a shortcut to Mars?', 'Nasa Mars rover finds organic matter in ancient lake bed', 'Nasa mission to map Mars interior will launch this weekend', 'Scientists call for ‘mega-mission’ to find ancient life on Mars', 'The case against Mars colonisation', 'Don’t let bacteria-laden humans contaminate Mars', 'Spacewatch: Can Mars rover beat the dust to trundle on again?', 'First woman: Smithsonian Air and Space director looks from the moon to Mars', 'New moon mission will not distract from effort to reach Mars, Nasa boss says', "Planet of the apis: Nasa develops plan to launch 'Marsbees'", 'The PhD researcher looking for water on Mars – and inspiring girls to reach for the stars', 'Mars to track blood moon in double celestial treat on Friday', 'Spacewatch: Martian rocks on Earth a step nearer as UK builds red planet rover', 'The first human on Mars should be a woman – we deserve stardust too', 'Mars: huge underground lake raises prospects of life on planet, astronomers s