In [74]:
# pip install spacy
# python -m spacy download it_core_news_sm

import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy
import math

### Utils

In [121]:
stop_words = set(stopwords.words('italian')) #remove stop words
lemmatizer = spacy.load('it_core_news_sm')

def pre_processing(document):
    document = re.sub(r'[^\w\s]',' ',document) #remove punctuation
    document = document.lower()
    document = lemmatizer(document)
    document = [token.lemma_ for token in document]
    document = [w for w in document if not w in stop_words]
    return document

#inverse document frequency
def idf(word, corpus):
    return math.log10(len(corpus)/sum([1.0 for i in corpus if word in i.testo]))

#term frequency
def tf(word, document):
    return document.count(word)/len(document)

In [113]:
class Documento:
    # ============================================
    # Constructor
    # ============================================
    def __init__(self, titolo, classe, testo):
        self.titolo = titolo
        self.classe = classe
        self.testo = testo
        self.term_vector = []

### Vector space model

In [126]:
path = "data\docs_200"
documents = []
classi = set()

for file_name in os.listdir(path):
    if os.path.isfile(os.path.join(path, file_name)):
        file = open("data/docs_200/" + file_name, "r", encoding="utf-8")
        classe = file_name.split("_")[0]
        sentence = file.read().replace("\n", " ").replace("\"", "")

        document = Documento(file_name, classe,pre_processing(sentence) )
        classi.add(file_name.split("_")[0])

        documents.append(document)        

In [127]:
for doc in documents:
    weights = {}
    for word in set(doc.testo):
        weights.update({word: tf(word, doc.testo) * idf(word, documents)})

    doc.term_vector.append(weights)


In [128]:
documents[0].term_vector

[{'pericolo': 0.01179181264952249,
  'mettere': 0.004872525698801003,
  'litro': 0.003930604216507496,
  'ricerca': 0.006749796026254024,
  'mammifere': 0.013903504505522545,
  'ermafroditismo': 0.013903504505522545,
  'trovare': 0.0019866384265190148,
  '70': 0.00406884436925878,
  'università': 0.005914264053958078,
  'su il': 0.0025409146405981253,
  'adottare': 0.004600842130756307,
  'analizzare': 0.004398585968730285,
  'cetaceo': 0.02085525675828382,
  'pacifico': 0.006042296072507553,
  'causa': 0.0030884483223901876,
  'aggiungere': 0.0023249277299750033,
  ' ': 7.932047737025063e-05,
  'biologa': 0.006042296072507553,
  'ingurgitare': 0.006042296072507553,
  'studio': 0.0019283146948108974,
  'inquinamento': 0.00968012079352243,
  'simile': 0.0036913859505025874,
  'potere': 0.001362435252395471,
  'fondale': 0.006042296072507553,
  '  ': 0.0008964006703363705,
  'proporre': 0.006797212910019939,
  ',': 0.0,
  'sacchetto': 0.006042296072507553,
  'di il': 0.000184614125783388

### Metodo di Rocchio

In [101]:
# estraggo il dizionario

dizionario = []

for i in documents.values():
    dizionario += i

dizionario = set(dizionario)

{'spettacoli', 'scie_tecnologia', 'salute', 'politica', 'cinema', 'economia_finanza', 'ambiente', 'cucina', 'motori', 'sport'}


In [103]:
for key in documents:
    print(key, ': ', len(documents[key]))
    

ambiente_01.txt :  331
ambiente_02.txt :  268
ambiente_03.txt :  403
ambiente_04.txt :  263
ambiente_05.txt :  410
ambiente_06.txt :  405
ambiente_07.txt :  489
ambiente_08.txt :  621
ambiente_09.txt :  519
ambiente_10.txt :  735
ambiente_11.txt :  168
ambiente_12.txt :  310
ambiente_13.txt :  215
ambiente_14.txt :  282
ambiente_15.txt :  1335
ambiente_16.txt :  1152
ambiente_17.txt :  401
ambiente_18.txt :  448
ambiente_19.txt :  643
ambiente_20.txt :  119
cinema_01.txt :  647
cinema_02.txt :  964
cinema_03.txt :  637
cinema_04.txt :  431
cinema_05.txt :  788
cinema_06.txt :  508
cinema_07.txt :  859
cinema_08.txt :  437
cinema_09.txt :  360
cinema_10.txt :  443
cinema_11.txt :  170
cinema_12.txt :  618
cinema_13.txt :  117
cinema_14.txt :  635
cinema_15.txt :  412
cinema_16.txt :  280
cinema_17.txt :  294
cinema_18.txt :  524
cinema_19.txt :  94
cinema_20.txt :  472
cucina_01.txt :  378
cucina_02.txt :  368
cucina_03.txt :  358
cucina_04.txt :  500
cucina_05.txt :  380
cucina_06.txt 

In [None]:
def get_near_positive( )

In [None]:
beta = 16
gamma = 4

conteggi_classi = {}

for classe in classi:
    classi[classe] = np.zeros(len(dizionario))
    npos = get_near_positive(classe)

    for doc in documents:
        if doc.classe == classe:
            for word in doc.testo:
                if word not in documents_vectors[classe]:
                    documents_vectors[classe].update({word: 0})
                else:
                    documents_vectors[classe][word] += 1

