# Text segmentation using Hidden Markov Models

In [77]:
import os
import glob
import numpy as np

In [78]:
ROOT = os.path.abspath('.')

PERL_DIR = os.path.join(ROOT,'PerlScriptAndModel')
RES_DIR = os.path.join(ROOT,'res')

### Coding/Decoding Mails

In [79]:
DATA_DIR = os.path.join(ROOT,'dat')

# Iterate through files and load the text 
def files_iter(data_dir, with_name=False):
    files = glob.glob('{}/*.dat'.format((data_dir)))
    if with_name:
        for f in files:
            # Get the filename 
            name = f.split("\\")[-1].split('.')[0]
            # Return filename and associated text
            yield name, np.loadtxt(f, dtype=int)
    else:
        for f in files :
            yield np.loadtxt(f, dtype = int)

In [80]:
# And we get a generator that will allow us to iterate through the mails
mail_iter = files_iter(DATA_DIR, with_name=True)

### Distribution files

In [81]:
# Writing a function to get the probability data
def get_emission_prob(perl_dir):
    return np.loadtxt(os.path.join(perl_dir,'P.text'))

In [82]:
# Inputs to the Viterbi function
trans = np.array([[0.999218078035812,0.000781921964187974],[0,1]])
emission_prob = get_emission_prob(PERL_DIR)
start_prob = np.array([1,0])

### To implement:

In [83]:
# Viterbi function
def viterbi(obs, start_prob, trans, emission_prob):
    """
        Viterbi Algorithm Implementation

        Keyword arguments:
            - obs: sequence of observation
            - states:list of states
            - start_prob:vector of the initial probabilities
            - trans: transition matrix
            - emission_prob: emission probability matrix
        Returns:
            - seq: sequence of state
    """

    # Avoid underflow: use the logarithm !
    # Avoid 0 in logarithm: use a small constant !
    small = 10e-10
    
    start_prob = np.log(start_prob+small)
    trans = np.log(trans+small)
    emission_prob = np.log(emission_prob+small)
    
    T = len(obs) # Number of observations
    N = trans.shape[0] # Number of model states
    
    # Initialisation
    log_l = np.zeros((T,2))
    bcktr = np.zeros((T,2))
    
    # Viterbi
    
    # Forward loop:
    log_l[0,:]= emission_prob[obs[0],:]+start_prob
    for t in range(1,T):
        for j in range(N):
            log_l[t, j] = emission_prob[obs[t],j]+np.max(trans[:,j]+log_l[t-1,:])
            bcktr[t, j] = np.argmax(trans[:,j]+log_l[t-1,:])
    # Backward loop
    path = np.zeros(T,dtype=int)
    path[-1] = np.argmax(log_l[T-1,:]) 
    for i in range(T-1,0,-1):
        path[i - 1] = bcktr[i,path[i]]

    return path

In [84]:
# Creating a directory to put the result of the viterbi function
if not os.path.exists(RES_DIR):
    os.mkdir(RES_DIR)
    
# Function that will write a viterbi path for a mail in a dedicated result file
def create_viterbi_path_file(mail_name, viterbi_path):
    with open('{}/{}_path.txt'.format(RES_DIR, mail_name), 'w') as f: 
        f.write(''.join([str(c) for c in viterbi_path]))   

In [85]:
# Using our generator, we get the mail names and data
for name_file, data in mail_iter:
    # Find out the viterbi path using viterbi
    viterbi_path = viterbi(data,start_prob,trans,emission_prob)
    # Put it in the result file
    create_viterbi_path_file(name_file, viterbi_path)

### Visualizing segmentation

In [86]:
# Writing a function to go into the directory and execute the perl script "segment.pl" on the mail in the given path
def exec_perl_script(mail, path):
    res = !cd {PERL_DIR}; perl segment.pl {mail} {path}
    return res

# Writing a function getting the original mail, the result of viterbi, and applying the segmentation script
# Then putting the result
def segment_mail(mail_name, data_dir, output_dir):
    # Get the full path of the mail
    mail = os.path.join(data_dir,mail_name+'.txt').replace('\\','/')
    # Get the full path of the result
    path = os.path.join(output_dir, mail_name+'_path.txt').replace('\\','/')
    # Execute the visualization script
    print(mail)
    print(path)
    formatted_mail = exec_perl_script(mail,path)
    print(formatted_mail)
    # Get the results
    """
    formatted_mail_text = np.loadtxt(formatted_mail)
    print(formatted_mail_text)
    
    # Go through the resulting text until the cutting line
    ...
    # If this was not the last line, return the text cut in to parts: header and body
    ...
    # If not, it's just a header
    ..."""

In [87]:
with open('d:/TP_TSIA/SD-TSIA214/TP2/res/mail1_path.txt','r') as f:
    print(f.read())

0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [52]:
# Getting mails names
mails= []
for name_file, data in files_iter(DATA_DIR, with_name=True):
    mails.append(name_file)
# Call the function and look at the result of segmentation
segment_mail(mails[0],DATA_DIR,RES_DIR)

d:/TP_TSIA/SD-TSIA214/TP2/dat/mail1.txt
d:/TP_TSIA/SD-TSIA214/TP2/res/mail1_path.txt
['La syntaxe du nom de fichier, de r�pertoire ou de volume est incorrecte.']
