# Laboratorio 4

Si richiede un'implementazione di un sistema di text segmentation, prendendo ispirazione da TextTiling. In particolare, partendo da un corpus composto da almeno 3 sezioni su tematiche molto diverse (ad es. potete usare paragrafi da tre pagine di Wikipedia diverse), dovrete testare il vostro sistema in modo che riesca ad individuare le giuste linee di taglio (o quasi).

In [186]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt


In [187]:
# English stop-word list from the NLTK corpus, stored as a set.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance; clear_sentence reads it as a global.
lemmatizer = WordNetLemmatizer()

def clear_sentence(sentence):
    """Tokenize ``sentence`` and return its normalized tokens.

    Each token produced by ``nltk.word_tokenize`` goes through these steps,
    in order:

    1. it is dropped when ``token in string.punctuation`` holds (a substring
       test against the punctuation string, performed before lowercasing);
    2. it is lowercased;
    3. it is dropped when the lowercased form is in the module-level
       ``stop_words`` set;
    4. it is lemmatized with the module-level ``lemmatizer``.

    Returns:
        The list of surviving lemmas, in their original order.
    """
    cleaned = []
    for raw_token in nltk.word_tokenize(sentence):
        if raw_token in string.punctuation:
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(lowered))
    return cleaned

In [188]:
# For every sentence, count the occurrences of each cleaned token.

def calcola_occorrenze_parole_frasi(sentences):
    """Build a per-sentence table of token occurrences.

    Args:
        sentences: iterable of sentences; each one is passed to
            ``clear_sentence``.

    Returns:
        A dict mapping the sentence position (0-based, from ``enumerate``)
        to a dict ``{token: occurrences}`` for that sentence.
    """
    words_per_sentence = {}

    for position, text in enumerate(sentences):
        occurrences = {}
        for token in clear_sentence(text):
            occurrences[token] = occurrences.get(token, 0) + 1
        words_per_sentence[position] = occurrences

    return words_per_sentence

In [189]:
def divide_in_k_parts(words_per_sentence, k=3):
    """Split the sentences into ``k`` consecutive parts of equal size.

    Every part receives ``len(words_per_sentence) // k`` sentences; the
    sentences left over by the integer division all go to the last part.
    When there are fewer sentences than ``k`` the per-part size is 0, so
    part 0 stays empty and the sentences are pushed one per part until the
    last part is reached.

    Args:
        words_per_sentence: mapping whose values are the per-sentence data;
            it is walked in iteration order.
        k: number of parts to create.

    Returns:
        A tuple ``(parts, break_points)`` where ``parts`` maps the part
        number to ``{sentence position: sentence data}`` and
        ``break_points`` is the list ``[0, size, 2 * size, ...]`` of plain
        Python ints (one entry per part).

    Raises:
        ZeroDivisionError: if ``k`` is 0.
    """
    num_sentences = len(words_per_sentence)
    sentence_per_part = num_sentences // k
    # Built with plain ints on purpose: going through np.insert leaked NumPy
    # scalars to the caller (and a float 0.0 when k == 1).
    break_points = [0] + [sentence_per_part * i for i in range(1, k)]

    parts = {0: {}}
    num_parte = 0
    placed_in_part = 0

    for i, sentence in enumerate(words_per_sentence):
        # Open the next part once the current one is full, except for the
        # last part, which absorbs every remaining sentence.
        if placed_in_part >= sentence_per_part and num_parte != k - 1:
            placed_in_part = 0
            num_parte += 1
            parts.setdefault(num_parte, {})
        parts[num_parte][i] = words_per_sentence[sentence]
        placed_in_part += 1

    return parts, break_points

def divide_in_parts(words_per_sentence, break_points=None):
    """Split the sentences into parts at the given break points.

    Sentence positions are the 0-based iteration indices of
    ``words_per_sentence``. Part ``n`` starts when the position reaches
    ``break_points[n]``; ``break_points[0]`` is never read, part 0 always
    starts at the first sentence. At most one new part is opened per
    sentence, and everything after the last break point goes to the last
    part.

    Args:
        words_per_sentence: mapping whose values are the per-sentence data.
        break_points: list of starting positions, one per part. Defaults to
            ``[0, 4, 10]``. The list is not modified.

    Returns:
        A dict mapping the part number to
        ``{sentence position: sentence data}``. Part 0 is always present,
        even when empty.
    """
    if break_points is None:
        # A fresh list per call instead of a shared mutable default.
        break_points = [0, 4, 10]

    parts = {0: {}}
    num_parte = 1  # index into break_points of the next boundary to cross

    for i, sentence in enumerate(words_per_sentence):
        if num_parte < len(break_points) and i >= break_points[num_parte]:
            num_parte += 1
            parts.setdefault(num_parte - 1, {})
        parts[num_parte - 1][i] = words_per_sentence[sentence]

    return parts

# Read the 'sentence' column of corpus.csv
# (presumably one sentence per row -- verify against the file).
sentences = pd.read_csv('corpus.csv')['sentence']

# Per-sentence token counts, then an initial split into 3 equal parts.
words_per_sentence = calcola_occorrenze_parole_frasi(sentences)
parts, break_points = divide_in_k_parts(words_per_sentence, k=3)

In [190]:
def compute_cohesion(parts):
    """Score the lexical overlap of adjacent sentences inside each part.

    For each part, the sentences are walked starting from the part's first
    key; the pair ``(i, i + 1)`` is scored as the sum, over every word
    present in both sentences, of its count in sentence ``i`` plus its
    count in sentence ``i + 1``. The walk of a part stops at the first
    sentence whose successor ``i + 1`` is not in that part, so pairs that
    straddle two parts are never scored.

    Scores are keyed by a counter that keeps running across parts, so keys
    are unique over the whole result.

    Args:
        parts: mapping ``part number -> {sentence position: {word: count}}``
            indexed by ``0 .. len(parts) - 1``.

    Returns:
        A list with one dict ``{pair number: overlap score}`` per part. An
        empty part contributes an empty dict.

    Side effects:
        Prints a header and the resulting list to stdout.
    """
    print('\n')
    print('COMPUTE COHESION')
    words_count = []
    pair_number = 0  # shared across parts, never reset

    for index in range(len(parts)):
        block = parts[index]
        block_scores = {}

        # An empty part has no first sentence to start from; it simply has
        # no pairs (previously this raised IndexError).
        if block:
            current = next(iter(block))
            while current + 1 in block:
                following = block[current + 1]
                block_scores[pair_number] = sum(
                    count + following[word]
                    for word, count in block[current].items()
                    if word in following
                )
                pair_number += 1
                current += 1

        words_count.append(block_scores)

    print(words_count)

    return words_count

In [191]:
def fix_break_points(parts, words_count):
    """Move the tail of each part into the following part.

    For the ``n``-th dict of ``words_count`` the key holding the smallest
    value is located (first one on ties). Every enumeration position of
    ``parts[n]`` that is greater than that key is then used as a key: the
    entry is copied into part ``n + 1`` and deleted from part ``n``.

    The returned dict is a shallow copy of ``parts``: the outer dict is new
    but the inner per-part dicts are shared, so ``parts`` itself is
    modified as well.

    Args:
        parts: mapping ``part number -> {sentence position: data}``.
        words_count: sequence of ``{key: score}`` dicts, one per part.

    Returns:
        The shallow copy of ``parts`` after the moves.

    Raises:
        ValueError: if one of the ``words_count`` dicts is empty.
        KeyError: if an entry has to be moved into a part that does not
            exist, or a computed position is not a key of the part.
    """
    new_parts = parts.copy()

    for part_index, scores in enumerate(words_count):
        # First key carrying the minimum score.
        weakest_key = min(scores, key=scores.get)

        to_move = [
            position
            for position, _ in enumerate(parts[part_index])
            if position > weakest_key
        ]

        source = part_index
        target = part_index + 1
        for key in to_move:
            new_parts[target][key] = new_parts[source][key]
            del new_parts[source][key]

    return new_parts



In [192]:
def sum_word_count(words_count):
    """Print the sum of the values of each dict in ``words_count``.

    One total is printed per dict, each on its own line, in input order.
    Nothing is returned.
    """
    block_totals = (sum(block.values()) for block in words_count)
    for total in block_totals:
        print(total)

### MAIN

In [193]:
def _min_positions(segment):
    """Return the keys of ``segment`` whose value equals its minimum."""
    if not segment:
        return []
    lowest = min(segment.values())
    return [key for key, value in segment.items() if value == lowest]


def _first_min_off_break(segment, min_positions, break_point):
    """Return the first low-score key of ``segment`` that is not ``break_point``.

    Falls back to the second minimum position, or to the first key holding
    the second-lowest value when the minimum is unique.
    """
    if min_positions[0] != break_point:
        return min_positions[0]
    if len(min_positions) > 1:
        return min_positions[1]
    second_lowest = sorted(segment.values())[1]
    return next(key for key, value in segment.items() if value == second_lowest)


def recalculate_break_points(words_count, break_points):
    """Move the two inner break points towards nearby low-cohesion positions.

    ``words_count`` is cut into three segments (start / medium / end) at
    ``break_points[1]`` and ``break_points[2]``, used as positions in its
    iteration order. For each of the two break points, two candidates are
    considered:

    * the last minimum-score key of the segment before the break;
    * the first minimum-score key of the segment after the break, skipping
      a minimum that sits exactly on the current break point.

    The break point is replaced by the candidate after it when that one is
    strictly closer, otherwise by the candidate before it.

    The function is hard-wired to three segments: only indices 1 and 2 of
    ``break_points`` are read and written.

    Args:
        words_count: dict ``{position: cohesion score}``.
        break_points: list with at least three entries; it is updated in
            place.

    Returns:
        The same ``break_points`` list object, after the update.

    Raises:
        IndexError: if a segment is empty, or a fallback to the second
            lowest score is needed in a segment holding a single score.

    Side effects:
        Prints the inputs, the candidates picked at each step and the
        result to stdout.
    """
    print('RECALCULATE BREAK POINTS')
    print('words count', words_count)
    print('initial break points', break_points)

    ordered_keys = list(words_count)
    start_sentences = {k: words_count[k] for k in ordered_keys[:break_points[1]]}
    medium_sentences = {k: words_count[k] for k in ordered_keys[break_points[1]:break_points[2]]}
    end_sentences = {k: words_count[k] for k in ordered_keys[break_points[2]:]}

    # The minimum of each segment is computed once, not once per element.
    start_min_value_positions = _min_positions(start_sentences)
    medium_min_value_positions = _min_positions(medium_sentences)
    end_min_value_positions = _min_positions(end_sentences)

    # 1: first inner break point, between the start and medium segments.
    start_pos = start_min_value_positions[-1]
    medium_pos = _first_min_off_break(
        medium_sentences, medium_min_value_positions, break_points[1]
    )

    if (break_points[1] - start_pos) > (medium_pos - break_points[1]):
        break_points[1] = medium_pos
    else:
        break_points[1] = start_pos

    print('#1 -', start_pos, medium_pos)

    # 2: second inner break point, between the medium and end segments.
    medium_pos = medium_min_value_positions[-1]
    # Compare against break_points[2], the break being moved here. The
    # previous comparison against break_points[1] could never match a key
    # of the end segment, which left the fallback below unreachable.
    end_pos = _first_min_off_break(
        end_sentences, end_min_value_positions, break_points[2]
    )

    if (break_points[2] - medium_pos) > (end_pos - break_points[2]):
        break_points[2] = end_pos
    else:
        break_points[2] = medium_pos

    # Report the two candidates of this step (was start_pos, medium_pos).
    print('#2 -', medium_pos, end_pos)
    print('final break points', break_points)

    print(start_min_value_positions, medium_min_value_positions, end_min_value_positions)

    return break_points
    

In [195]:
# Read the 'sentence' column of corpus.csv
# (presumably one sentence per row -- verify against the file).
sentences = pd.read_csv('corpus.csv')['sentence']

# Per-sentence token counts, then an initial split into 3 equal parts.
words_per_sentence = calcola_occorrenze_parole_frasi(sentences)
parts, break_points = divide_in_k_parts(words_per_sentence, k=3)
print('break_points', break_points)

# Five refinement rounds: score the current parts, move the break points,
# then rebuild the parts from the new break points.
for i in range(5):
    print('parts', parts)
    cohoesion_parts = compute_cohesion(parts)
    sum_word_count(cohoesion_parts)

    # Flatten the per-part score dicts into a single dict.
    merged_cohoesion_parts = {}
    for cohoesion_part in cohoesion_parts:
        merged_cohoesion_parts.update(cohoesion_part)

    # Recompute the break points.
    break_points = recalculate_break_points(merged_cohoesion_parts, break_points)
    # Rebuild the parts from the new break points.
    parts = divide_in_parts(words_per_sentence, break_points)
    print('break_points', break_points)



break_points [0, 7, 14]
parts {0: {0: {'juventus': 2, 'founded': 1, 'sport-club': 1, 'late': 1, '1897': 1, 'pupil': 1, 'massimo': 1, "d'azeglio": 1, 'lyceum': 1, 'school': 1, 'turin': 1, 'among': 1, 'eugenio': 1, 'canfari': 2, 'enrico': 1}, 1: {'renamed': 1, 'foot-ball': 1, 'club': 1, 'juventus': 1, 'two': 1, 'year': 1, 'later': 1}, 2: {'club': 1, 'joined': 1, '1900': 1, 'italian': 1, 'football': 1, 'championship': 1}, 3: {'juventus': 1, 'played': 1, 'first': 1, 'italian': 1, 'football': 1, 'championship': 1, 'match': 1, '11': 1, 'march': 1, '1900': 1, '1–0': 1, 'defeat': 1, 'torinese': 1}, 4: {'1904': 1, 'businessman': 1, 'marco': 1, 'ajmone-marsan': 1, 'revived': 1, 'finance': 1, 'juventus': 1, 'making': 1, 'possible': 1, 'transfer': 1, 'training': 1, 'field': 1, 'piazza': 1, "d'armi": 1, 'appropriate': 1, 'velodrome': 1, 'umberto': 1}, 5: {'period': 1, 'team': 1, 'wore': 1, 'pink': 1, 'black': 1, 'kit': 1}, 6: {'juventus': 1, 'first': 1, '1905': 1, 'italian': 1, 'football': 1, 'cham