# Preprocessing experiments

In [1]:
# Import code from file in upper directory
import sys, os
sys.path.append(os.getcwd() + os.sep + os.pardir)
from tweet_data import TweetsBaseDataset, TweetsBOWDataset

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

In this notebook we will examine the effect of preprocessing on model performance, by training a classifier with data processed by different preprocessing functions. In each experiment the preprocessing function changes, and the F1 scores on the training and dev sets are reported.

In [2]:
def run_experiment(process_fn):
    """Train and score a linear classifier using a given tweet preprocessing function.

    `process_fn` temporarily replaces `TweetsBaseDataset.process_tweet` while
    the train and dev datasets are built and the model is trained and scored.
    The method that was installed before the call is put back afterwards (even
    if an error occurs), so one experiment cannot leak its preprocessing
    function into later experiments or re-executed cells.

    Args:
        - process_fn (callable): replacement for `process_tweet`; called as
          `process_fn(self, text)` and expected to return a list of tokens.
    Returns: tuple `(train_score, dev_score)` with the macro-averaged F1
        scores on the training and dev sets.
    """
    def score_model(model, dataset):
        """Return the macro-averaged F1 of `model` on `dataset`."""
        predictions = model.predict(dataset.data)
        return f1_score(dataset.labels, predictions, average='macro')

    # Remember the currently installed method so it can be restored afterwards
    original_fn = TweetsBaseDataset.process_tweet

    # Overwrite processing function
    TweetsBaseDataset.process_tweet = process_fn
    try:
        # Load and preprocess datasets with new processing function.
        # The dev set reuses the vocabulary built from the training set.
        train_set = TweetsBOWDataset('../data/train', 'us_train')
        dev_set = TweetsBOWDataset('../data/dev', 'us_trial', vocabulary=train_set.vocabulary)

        # Train and score model on train and dev sets.
        # random_state is fixed so that runs differ only in the preprocessing.
        model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
        model.fit(train_set.data, train_set.labels)

        train_score = score_model(model, train_set)
        dev_score = score_model(model, dev_set)
    finally:
        # Undo the monkeypatch so the class is left as it was found
        TweetsBaseDataset.process_tweet = original_fn

    return train_score, dev_score

Let's first get the results with the default processing function:

In [3]:
# Baseline: pass the class's own process_tweet, so preprocessing is left unchanged
run_experiment(TweetsBaseDataset.process_tweet)

Reading file
Read file with 489609 tweets
Building vocabulary
Loading labels
Loading counts matrix
Creating TF-ID matrix
Reading file
Read file with 50000 tweets
Using vocabulary containing 10002 tokens
Loading labels
Loading counts matrix
Creating TF-ID matrix


(0.2768806185593461, 0.23972266714875984)

## Case normalization

In [4]:
import nltk

def process_tweet(self, text):
    """ Lowercase a tweet, then split it into tokens.
    Args:
        - text (str): the raw tweet
    Returns: list of tokens produced by nltk.word_tokenize on the
        lowercased text
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return tokens

# Evaluate lowercasing + NLTK tokenization; the result is (train F1, dev F1)
run_experiment(process_tweet)

Reading file
Read file with 489609 tweets
Building vocabulary
Loading labels
Loading counts matrix
Creating TF-ID matrix
Reading file
Read file with 50000 tweets
Using vocabulary containing 10002 tokens
Loading labels
Loading counts matrix
Creating TF-ID matrix


(0.28008270108736233, 0.23821311615325777)

## Stemming

In [6]:
from nltk.stem import PorterStemmer

# Single stemmer instance at module level, reused for every token in process_tweet
stemmer = PorterStemmer()

def process_tweet(self, text):
    """ Lowercase and tokenize a tweet, then stem every token.
    Args:
        - text (str): the raw tweet
    Returns: list with the stemmed form of each token, in original order
    """
    stems = []
    for word in nltk.word_tokenize(text.lower()):
        stems.append(stemmer.stem(word))
    return stems

# Evaluate lowercasing + tokenization + stemming; the result is (train F1, dev F1)
run_experiment(process_tweet)

Reading file
Read file with 489609 tweets
Building vocabulary
Loading labels
Loading counts matrix
Creating TF-ID matrix
Reading file
Read file with 50000 tweets
Using vocabulary containing 10002 tokens
Loading labels
Loading counts matrix
Creating TF-ID matrix


(0.280172011524204, 0.23679280728802513)

## Lemmatizing

In [16]:
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance at module level, reused for every token in process_tweet
lemmatizer = WordNetLemmatizer()

def process_tweet(self, text):
    """ Lowercase and tokenize a tweet, then lemmatize every token.
    Args:
        - text (str): the raw tweet
    Returns: list with the lemma of each token, in original order
    """
    lemmas = []
    # No part-of-speech tag is passed, so the lemmatizer's default applies
    for word in nltk.word_tokenize(text.lower()):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Evaluate lowercasing + tokenization + lemmatization; the result is (train F1, dev F1)
run_experiment(process_tweet)

Reading file
Read file with 489609 tweets
Building vocabulary
Loading labels
Loading counts matrix
Creating TF-ID matrix
Reading file
Read file with 50000 tweets
Using vocabulary containing 10002 tokens
Loading labels
Loading counts matrix
Creating TF-ID matrix


(0.2777279092004158, 0.23426146648482332)

## Ekphrasis

In [22]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

# Ekphrasis pipeline shared by process_tweet: normalizes and annotates
# social-media text, then tokenizes it with lowercasing enabled.
text_processor = TextPreProcessor(
    # terms that will be normalized (each term listed once)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis'},
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize
)

def process_tweet(self, text):
    """ Run a tweet through the ekphrasis text processor.
    Args:
        - text (str): the raw tweet
    Returns: whatever text_processor.pre_process_doc yields for the tweet
        (a list of tokens, given the tokenizer configured above)
    """
    tokens = text_processor.pre_process_doc(text)
    return tokens

# Evaluate the ekphrasis preprocessing pipeline; the result is (train F1, dev F1)
run_experiment(process_tweet)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
Reading file
Read file with 489609 tweets
Building vocabulary
Loading labels
Loading counts matrix
Creating TF-ID matrix
Reading file
Read file with 50000 tweets
Using vocabulary containing 10002 tokens
Loading labels
Loading counts matrix
Creating TF-ID matrix


(0.287221018371828, 0.2446901329507131)

## Summary

| Method             | Train Score | Dev Score  |
| ------------------ | -----------:| ----------:|
| Default            | 0.2769      | 0.2397     |
| Case normalization | 0.2801      | 0.2382     |
| Stemming           | 0.2802      | 0.2368     |
| Lemmatizing        | 0.2777      | 0.2343     |
| Ekphrasis          | **0.2872**  | **0.2447** |