## Transfer learning on CNN

In [34]:
import pandas as pd 
from torch.utils.data import Dataset
import torch
from collections import Counter
import string
import numpy as np
class NewsDataset(Dataset):
    """PyTorch dataset over a news dataframe that carries its own splits.

    The dataframe is partitioned by its `split` column into train / val /
    test frames; `set_split` chooses which one `__len__` and `__getitem__`
    operate on.
    """

    def __init__(self, news_df, vectorizer):
        """
        Args:
            news_df (pandas.DataFrame): the dataset; the `title`, `category`
                and `split` columns are read here
            vectorizer (NewsVectorizer): vectorizer used to turn rows into
                index vectors
        """
        self.news_df = news_df
        self._vectorizer = vectorizer

        # Longest title in tokens, plus two extra slots (the vectorizer in
        # this file wraps every title in a BEGIN and an END index).
        self._max_seq_length = max(len(title.split(" "))
                                   for title in news_df.title) + 2

        self.train_df = self.news_df[self.news_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.news_df[self.news_df.split == 'val']
        self.validation_size = len(self.val_df)

        self.test_df = self.news_df[self.news_df.split == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.validation_size),
            'test': (self.test_df, self.test_size)
        }

        self.set_split('train')

        # Inverse-frequency class weights, ordered by the index each category
        # has in the vectorizer's category vocabulary.
        class_counts = news_df.category.value_counts().to_dict()

        def sort_key(item):
            return self._vectorizer.category_vocab.lookup_token(item[0])

        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        """Load dataset and make a new vectorizer from scratch

        The vectorizer is built from the rows whose split is "train" only.

        Args:
            news_csv (str): location of the dataset
        Returns:
            an instance of NewsDataset
        """
        news_df = pd.read_csv(news_csv)
        train_news_df = news_df[news_df.split == "train"]
        return cls(news_df, NewsVectorizer.from_dataframe(train_news_df))

    def __len__(self):
        """Number of rows in the currently selected split."""
        return self._target_size

    def __getitem__(self, index):
        """The primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dict holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
        title_vector = self._vectorizer.vectorize(row.title, self._max_seq_length)
        category_index = self._vectorizer.category_vocab.lookup_token(row.category)
        return {"x_data": title_vector,
                "y_target": category_index}

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe

        Args:
            split (str): one of "train", "val" or "test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset

        Args:
            batch_size (int)
        Returns:
            number of full batches in the current split (floor division)
        """
        return len(self) // batch_size


In [35]:
class Vocabulary(object):
    """Processes text and keeps a two-way mapping between tokens and indices."""

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            mask_token (str): the mask token, always added to the vocabulary
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the vocabulary
        """
        # The caller's dict (if any) is used as-is, not copied.
        self._token_to_idx = {} if token_to_idx is None else token_to_idx
        self._idx_to_token = {position: tok
                              for tok, position in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token

        # The mask token is registered before UNK.
        self.mask_index = self.add_token(self._mask_token)
        # -1 marks "no UNK token registered".
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def to_serializable(self):
        """ Returns a dictionary that can be serialized """
        serialized = {}
        serialized["token_to_idx"] = self._token_to_idx
        serialized["add_unk"] = self._add_unk
        serialized["unk_token"] = self._unk_token
        serialized["mask_token"] = self._mask_token
        return serialized

    @classmethod
    def from_serializable(cls, contents):
        """ Instantiates the vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update both mapping dicts with the token.

        Args:
            token (str): the item to add into the vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token not in self._token_to_idx:
            # New tokens take the next free index.
            new_index = len(self._token_to_idx)
            self._token_to_idx[token] = new_index
            self._idx_to_token[new_index] = token
            return new_index
        return self._token_to_idx[token]

    def lookup_token(self, token):
        """Retrieve the index associated with the token,
        or the UNK index if the token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            Without an UNK token (`add_unk=False`) an unknown token
            raises KeyError.
        """
        if not self._add_unk:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

    def lookup_index(self, index):
        """Return the token associated with the index.

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the vocabulary
        """
        if index in self._idx_to_token:
            return self._idx_to_token[index]
        raise KeyError("El indice (%d)no está en el vocabulario" % index)

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        # Size of the vocabulary, special tokens included.
        return len(self._token_to_idx)

In [36]:
class SequenceVocabulary(Vocabulary):
    """Vocabulary that also registers BEGIN and END sequence tokens."""

    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            unk_token (str): token returned for unknown words
            mask_token (str): token used for padding positions
            begin_seq_token (str): token marking the start of a sequence
            end_seq_token (str): token marking the end of a sequence
        """
        # Forward the custom mask/UNK tokens to the base class; otherwise it
        # would register its default "<MASK>"/"<UNK>" entries in addition to
        # the ones requested here.
        super(SequenceVocabulary, self).__init__(token_to_idx,
                                                 mask_token=mask_token,
                                                 add_unk=True,
                                                 unk_token=unk_token)

        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # mask_index and unk_index were already set by the base class.
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        """Return a dict that `from_serializable` can turn back into an instance."""
        contents = super(SequenceVocabulary, self).to_serializable()
        # __init__ has no `add_unk` parameter, so the key inherited from the
        # base class must not be emitted.
        contents.pop('add_unk', None)
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate the vocabulary from a serialized dictionary.

        An `add_unk` entry (written by the previous `to_serializable`) is
        ignored so that older payloads still load.
        """
        kwargs = {key: value for key, value in contents.items()
                  if key != 'add_unk'}
        return cls(**kwargs)

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [37]:
class NewsVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""

    def __init__(self, title_vocab, category_vocab):
        """
        Args:
            title_vocab (SequenceVocabulary): maps title words to integers
            category_vocab (Vocabulary): maps category names to integers
        """
        self.title_vocab = title_vocab
        self.category_vocab = category_vocab

    def vectorize(self, title, vector_length=-1):
        """Turn a title into a fixed-length vector of vocabulary indices.

        The result is BEGIN, one index per space-separated token, END, then
        mask indices up to `vector_length`. A title that needs more than
        `vector_length` slots is truncated: trailing tokens are dropped so
        that BEGIN and END still fit (previously this raised a NumPy
        broadcasting ValueError).

        Args:
            title (str): the string of words separated by a space
            vector_length (int): an argument for forcing the length of index
                vector; a negative value means "exactly as long as needed"
        Returns:
            the vectorized title (numpy.array of int64)
        """
        token_indices = [self.title_vocab.lookup_token(token)
                         for token in title.split(" ")]

        if vector_length < 0:
            vector_length = len(token_indices) + 2
        elif len(token_indices) + 2 > vector_length:
            # Reserve two slots for BEGIN/END and drop the overflow tokens.
            token_indices = token_indices[:max(vector_length - 2, 0)]

        indices = [self.title_vocab.begin_seq_index]
        indices.extend(token_indices)
        indices.append(self.title_vocab.end_seq_index)
        # Only matters when vector_length is 0 or 1 (no room for both markers).
        indices = indices[:vector_length]

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.title_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, news_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            news_df (pandas.DataFrame): the target dataset
            cutoff (int): frequency threshold for including in Vocabulary 
        Returns:
            an instance of the NewsVectorizer
        """
        # Categories are added in sorted order so indices are reproducible.
        category_vocab = Vocabulary()
        for category in sorted(set(news_df.category)):
            category_vocab.add_token(category)

        word_counts = Counter()
        for title in news_df.title:
            for token in title.split(" "):
                # NOTE: this is a substring test against the punctuation
                # string, so the empty token is skipped as well.
                if token not in string.punctuation:
                    word_counts[token] += 1

        # Only words seen at least `cutoff` times enter the title vocabulary.
        title_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                title_vocab.add_token(word)

        return cls(title_vocab, category_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by `to_serializable`."""
        title_vocab = \
            SequenceVocabulary.from_serializable(contents['title_vocab'])
        category_vocab =  \
            Vocabulary.from_serializable(contents['category_vocab'])

        return cls(title_vocab=title_vocab, category_vocab=category_vocab)

    def to_serializable(self):
        """Return a dict holding both vocabularies in serializable form."""
        return {'title_vocab': self.title_vocab.to_serializable(),
                'category_vocab': self.category_vocab.to_serializable()}

In [42]:
def load_glove_from_file(glove_filepath):
    """Load the GloVe embeddings from a text file.

    Each line is split on single spaces: the first field is the word, the
    remaining fields are parsed as floats and form that word's vector.

    Args:
        glove_filepath (str): path to the glove embedding file
    Returns:
        word_to_index (dict): maps each word to its row in the matrix
        embeddings (numpy.ndarray): one row per line of the file
    """
    word_to_index = {}
    vectors = []
    with open(glove_filepath, "r", encoding="utf-8") as glove_file:
        for row_number, raw_line in enumerate(glove_file):
            word, *values = raw_line.split(" ")
            word_to_index[word] = row_number
            vectors.append(np.array([float(value) for value in values]))
    return word_to_index, np.stack(vectors)
def make_embedding_matrix(glove_filepath, words):
    """Create an embedding matrix for a specific set of words.

    Row i of the result belongs to the i-th word of `words`. Words found in
    the GloVe file get their GloVe vector; every other word gets a vector
    drawn with Xavier-uniform initialisation.

    Args:
        glove_filepath (str): file path to the glove embeddings
        words (list): list of words in the dataset
    Returns:
        final_embeddings (numpy.ndarray): embedding matrix
    """
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]

    final_embeddings = np.zeros((len(words), embedding_size))
    for row, word in enumerate(words):
        if word not in word_to_idx:
            # Unknown to GloVe: fall back to a random Xavier-uniform vector.
            random_vector = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(random_vector)
            final_embeddings[row, :] = random_vector
            continue
        final_embeddings[row, :] = glove_embeddings[word_to_idx[word]]
    return final_embeddings

In [55]:
import torch.nn as nn
import torch.nn.functional as F

class NewsClassifier(nn.Module):
    """Embedding -> four 1-D convolutions -> average pool -> two linear layers."""

    def __init__(self, embedding_size, num_embeddings, num_channels, hidden_dim, num_classes, dropout_p,
                 pretrained_embeddings=None, padding_idx=0):
        """
        Args:
            embedding_size (int): size of the embedding vectors
            num_embeddings (int): number of embedding vectors
            num_channels (int): number of convolutional kernels per layer
            hidden_dim (int): the size of the hidden dimension
            num_classes (int): the number of classes in classification
            dropout_p (float): a dropout parameter
            pretrained_embeddings (numpy.array): previously trained word embeddings
                default is None. If provided, it is used as the initial
                weight of the embedding layer
            padding_idx (int): an index representing a null position
        """
        super(NewsClassifier, self).__init__()

        embedding_kwargs = dict(embedding_dim=embedding_size,
                                num_embeddings=num_embeddings,
                                padding_idx=padding_idx)
        if pretrained_embeddings is not None:
            embedding_kwargs["_weight"] = torch.from_numpy(pretrained_embeddings).float()
        self.emb = nn.Embedding(**embedding_kwargs)

        # (input channels, stride) of each kernel-size-3 convolution; every
        # convolution is followed by an ELU.
        conv_specs = [(embedding_size, 1),
                      (num_channels, 2),
                      (num_channels, 2),
                      (num_channels, 1)]
        conv_layers = []
        for in_channels, stride in conv_specs:
            conv_layers.append(nn.Conv1d(in_channels=in_channels,
                                         out_channels=num_channels,
                                         kernel_size=3,
                                         stride=stride))
            conv_layers.append(nn.ELU())
        self.convnet = nn.Sequential(*conv_layers)

        self.training = True
        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier

        Args:
            x_in (torch.Tensor): an input data tensor.
                x_in.shape should be (batch, dataset._max_seq_length)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, num_classes)
        """
        # Conv1d wants the feature dimension as channels, hence the permute.
        channels_first = self.emb(x_in).permute(0, 2, 1)
        conv_out = self.convnet(channels_first)

        # Average over whatever sequence length is left after the convolutions.
        pooled = F.avg_pool1d(conv_out, conv_out.size(dim=2)).squeeze(dim=2)
        pooled = F.dropout(pooled, p=self._dropout_p, training=self.training)

        # Hidden layer: linear, then dropout, then ReLU.
        hidden = self.fc1(pooled)
        hidden = F.dropout(hidden, p=self._dropout_p, training=self.training)
        hidden = F.relu(hidden)

        logits = self.fc2(hidden)
        if not apply_softmax:
            return logits
        return F.softmax(logits, dim=1)

In [56]:
from argparse import Namespace
# Single container for the notebook's settings. Several fields (the file
# names, save_dir, seed, early_stopping_criteria and the last three runtime
# options) are not referenced in the visible part of this notebook.
args = Namespace(
    # Data and path hyper parameters
    news_csv="data/ag_news/news_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch5/document_classification",
    # Model hyper parameters
    # embedding_size is passed to the classifier together with the matrix
    # built from glove_filepath -- presumably it has to match the vector
    # width of that file; verify when changing either one.
    glove_filepath='data/wiki_giga_2024_300_MFT20_vectors_seed_2024_alpha_0.75_eta_0.05_combined.txt', 
    use_glove=True,
    embedding_size=300, 
    hidden_dim=100, 
    num_channels=100, 
    # Training hyper parameters
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime options
    # cuda is switched off later in the notebook when no GPU is available.
    cuda=True, 
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

In [57]:
# Instantiate the dataset (this also builds the vectorizer from the train split)
import torch.optim as optim
dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
vectorizer = dataset.get_vectorizer()

if args.use_glove:
    # One matrix row per title-vocabulary entry, in vocabulary order.
    words = vectorizer.title_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath,
                                       words=words)
    print("Using pre-trained embeddings")
else:
    # Previously `embeddings` was left undefined here and the classifier
    # construction below failed with a NameError.
    embeddings = None
    print("Not using pre-trained embeddings")

# The embedding table is sized from the vocabulary itself, which equals the
# number of rows of the GloVe matrix when one was built.
classifier = NewsClassifier(embedding_size=args.embedding_size,
                            num_embeddings=len(vectorizer.title_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim,
                            num_classes=len(vectorizer.category_vocab),
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")
classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)


Using pre-trained embeddings


In [61]:

# Cross-entropy over the classifier's raw outputs. No weight tensor is passed,
# so dataset.class_weights is not used and every class counts equally.
loss_func=nn.CrossEntropyLoss()
# Adam over all classifier parameters, using the configured learning rate.
optimizer=optim.Adam(classifier.parameters(),lr=args.learning_rate)
def make_train_state(args):
    """Create a fresh bookkeeping dict for one training run.

    Args:
        args: accepted for call compatibility; not read here
    Returns:
        dict with the epoch counter, empty per-epoch train/val metric lists
        and -1 placeholders for the test metrics
    """
    state = {"epoch_index": 0}
    for metric in ("train_loss", "train_acc", "val_loss", "val_acc"):
        state[metric] = []
    state["test_loss"] = -1
    state["test_acc"] = -1
    return state
def compute_accuracy(y_pred, y_target):
    """Return the percentage of rows whose highest-scoring class is the target.

    Args:
        y_pred (torch.Tensor): scores, one row per example and one column per class
        y_target (torch.Tensor): the expected class index of each example
    Returns:
        accuracy as a float in the range 0-100
    """
    predicted_classes = y_pred.max(dim=1)[1]
    hits = predicted_classes.eq(y_target).sum().item()
    return hits / len(predicted_classes) * 100
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """A generator function which wraps the PyTorch DataLoader.

    It makes sure every tensor of a batch is moved to the same device.

    Args:
        dataset: the dataset to draw batches from
        batch_size (int): number of examples per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target device for every tensor in the batch
    Yields:
        dict mapping each field name to its batch tensor on `device`
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)
    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

In [62]:
from torch.utils.data import DataLoader
train_state = make_train_state(args)
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # ---- Training pass ----
    # Set up: batch generator, running loss/accuracy at 0, train mode on.
    dataset.set_split("train")
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # The training routine is these five steps:

        # step 1: zero the gradients
        optimizer.zero_grad()
        # step 2: compute the output
        y_pred = classifier(batch_dict["x_data"])
        # step 3: compute the loss and fold it into the running mean
        loss = loss_func(y_pred, batch_dict["y_target"])
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 4: use the loss to produce gradients
        loss.backward()
        # step 5: let the optimizer take a gradient step
        optimizer.step()

        # accuracy of this batch, folded into the running mean
        acc_batch = compute_accuracy(y_pred, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)

    # ---- Validation pass ----
    # Set up: batch generator, running loss/accuracy at 0, eval mode on.
    dataset.set_split("val")
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every validation batch. The metrics are unchanged.
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1: compute the output
            y_pred = classifier(batch_dict["x_data"])
            # step 2: compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # step 3: compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state["val_loss"].append(running_loss)
    train_state["val_acc"].append(running_acc)

In [66]:
# Evaluate the trained classifier on the held-out test split.
dataset.set_split("test")
batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
print("comienza la evaluacion")
# Evaluation needs no gradients; disabling autograd avoids building a graph
# for every batch and leaves the computed metrics unchanged.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        print("batch: ", batch_index)
        # compute the output
        y_pred = classifier(batch_dict["x_data"])
        # compute the loss and fold it into the running mean
        loss = loss_func(y_pred, batch_dict["y_target"])
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # compute the accuracy and fold it into the running mean
        acc_batch = compute_accuracy(y_pred, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state["test_loss"] = running_loss
train_state["test_acc"] = running_acc

comienza la evaluacion
batch:  0
batch:  1
batch:  2
batch:  3
batch:  4
batch:  5
batch:  6
batch:  7
batch:  8
batch:  9
batch:  10
batch:  11
batch:  12
batch:  13
batch:  14
batch:  15
batch:  16
batch:  17
batch:  18
batch:  19
batch:  20
batch:  21
batch:  22
batch:  23
batch:  24
batch:  25
batch:  26
batch:  27
batch:  28
batch:  29
batch:  30
batch:  31
batch:  32
batch:  33
batch:  34
batch:  35
batch:  36
batch:  37
batch:  38
batch:  39
batch:  40
batch:  41
batch:  42
batch:  43
batch:  44
batch:  45
batch:  46
batch:  47
batch:  48
batch:  49
batch:  50
batch:  51
batch:  52
batch:  53
batch:  54
batch:  55
batch:  56
batch:  57
batch:  58
batch:  59
batch:  60
batch:  61
batch:  62
batch:  63
batch:  64
batch:  65
batch:  66
batch:  67
batch:  68
batch:  69
batch:  70
batch:  71
batch:  72
batch:  73
batch:  74
batch:  75
batch:  76
batch:  77
batch:  78
batch:  79
batch:  80
batch:  81
batch:  82
batch:  83
batch:  84
batch:  85
batch:  86
batch:  87
batch:  88
batch:  

In [67]:
# Show the per-epoch train/validation accuracies, then the test accuracy.
for metric_name in ("train_acc", "val_acc", "test_acc"):
    print(train_state[metric_name])

[69.99213986280489, 80.69979039634144, 81.80259146341469, 82.51119474085367, 83.16025152439023, 84.0867949695122, 84.83112614329264, 85.86961699695121, 86.68302210365864, 87.42854420731705, 88.32412347560985, 89.0625, 89.87828696646349, 90.45588795731709, 91.07636242378057, 91.65158155487798, 92.15891768292676, 92.49118711890245, 92.97946836890245, 93.38200266768287, 93.69402629573175, 93.99771341463409, 94.2168445121951, 94.62771532012196, 94.63962461890249, 94.93973894817081, 95.15053353658534, 95.37204649390245, 95.3815739329269, 95.49947599085361, 95.78291730182931, 95.80673589939025, 95.96155678353662, 96.07945884146338, 96.25095274390249, 96.27119855182934, 96.39029153963413, 96.42363757621932, 96.43197408536575, 96.66420541158546, 96.70350609756099, 96.71303353658539, 96.7737709603659, 96.87857278963415, 96.8654725609756, 96.99171112804882, 96.98694740853648, 97.06078506097569, 97.00600228658527, 97.11318597560971, 97.15129573170735, 97.15605945121942, 97.25967035060985, 97.2251

In [68]:
import re
def preprocess_text(text):
    """Normalise a raw title before vectorizing it.

    Lower-cases every space-separated word, surrounds each of . , ! ? with
    spaces, then replaces every run of characters that are neither ASCII
    letters nor one of . , ! ? with a single space.

    Args:
        text (str): the raw text
    Returns:
        the normalised text (str)
    """
    lowered = " ".join(map(str.lower, text.split(" ")))
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)
def predict_category(title, classifier, vectorizer, max_length):
    """Predict a news category for a new title

    Args:
        title (str): a raw title string
        classifier (NewsClassifier): an instance of the trained classifier
        vectorizer (NewsVectorizer): the corresponding vectorizer
        max_length (int): the max sequence length
    Returns:
        dict with the predicted 'category' and its softmax 'probability'
    """
    cleaned_title = preprocess_text(title)
    token_ids = vectorizer.vectorize(cleaned_title, vector_length=max_length)

    # The classifier expects a batch dimension, so wrap the single title.
    single_batch = torch.tensor(token_ids).unsqueeze(0)
    class_probabilities = classifier(single_batch, apply_softmax=True)

    best_probability, best_index = class_probabilities.max(dim=1)
    category_name = vectorizer.category_vocab.lookup_index(best_index.item())

    return {'category': category_name,
            'probability': best_probability.item()}

In [69]:
def get_samples():
    """Collect up to five validation titles for each category.

    Reads the module-level `dataset`.

    Returns:
        dict mapping each category found in the validation split to a list
        of at most five of its titles
    """
    val_df = dataset.val_df
    return {
        category: val_df.title[val_df.category == category].tolist()[:5]
        for category in val_df.category.unique()
    }

# Classify a handful of validation titles per category and print the results.
val_samples = get_samples()
# Prediction tensors are created on the CPU, so the model is moved there too.
classifier = classifier.to("cpu")

prediction_length = dataset._max_seq_length + 1
for true_category, titles in val_samples.items():
    print(f"True Category: {true_category}")
    print("="*30)
    for title in titles:
        outcome = predict_category(title, classifier,
                                   vectorizer, prediction_length)
        print("Prediction: {} (p={:0.2f})".format(outcome['category'],
                                                  outcome['probability']))
        print("\t + Sample: {}".format(title))
    print("-"*30 + "\n")

True Category: Business
Prediction: World (p=0.86)
	 + Sample: AZ suspends marketing of cancer drug
Prediction: Sports (p=0.79)
	 + Sample: Business world has mixed reaction to Perez move
Prediction: Sports (p=1.00)
	 + Sample: Betting Against Bombay
Prediction: Sci/Tech (p=1.00)
	 + Sample: Malpractice Insurers Face a Tough Market
Prediction: Sports (p=0.69)
	 + Sample: NVIDIA Is Vindicated
------------------------------

True Category: Sci/Tech
Prediction: Sci/Tech (p=1.00)
	 + Sample: Spies prize webcam #39;s eyes
Prediction: Sci/Tech (p=1.00)
	 + Sample: Sober worm causes headaches
Prediction: Business (p=1.00)
	 + Sample: Local Search: Missing Pieces Falling into Place
Prediction: Sci/Tech (p=1.00)
	 + Sample: Hackers baiting Internet users with Beckham pix
Prediction: Sports (p=1.00)
	 + Sample: Nokia adds BlackBerry support to Series 80 handsets
------------------------------

True Category: Sports
Prediction: Sports (p=0.79)
	 + Sample: Is Meyer the man to get Irish up?
Predict