In [16]:
import os, re, csv
import numpy as np
import pandas as pd

# Folder that holds the Subtask A tweet files
data_dir = "data/2017_English_final/Subtask_A/"

# Every file follows the pattern 'twitter-<split>-A.txt'; only the split tag varies
train_files = [
    f'twitter-{split}-A.txt'
    for split in (
        '2013train',
        '2013dev',
        '2013test',
        '2014sarcasm',
        '2014test',
        '2015train',
        '2015test',
        '2016train',
        '2016dev',
        '2016devtest',
        '2016test',
    )
]

In [17]:
def load_dataframe(file_path, label_categories=('negative', 'neutral', 'positive')):
    """
    Read one tab-separated annotation file into a DataFrame.

    Only the first three columns are read; they are named 'id', 'label'
    and 'message', and 'id' becomes the index. Quote characters are kept
    verbatim (csv.QUOTE_NONE), so they are part of the message text.

    Parameters
    ----------
    file_path : str, path object or file-like
        Location of the tab-separated file, passed straight to pd.read_csv.
    label_categories : sequence of str or None
        Fixed set (and order) of label categories. Using the same categories
        for every file keeps the 'label' column categorical after pd.concat
        and keeps the integer codes stable even when a file does not contain
        every class. Labels outside this set are read as NaN.
        Pass None to infer the categories from each file instead.

    Returns
    -------
    pandas.DataFrame
        Columns 'label' (categorical) and 'message', indexed by 'id'.
    """
    if label_categories is None:
        # Categories inferred from the values present in this file only
        label_dtype = 'category'
    else:
        label_dtype = pd.CategoricalDtype(categories=list(label_categories))

    return pd.read_csv(
        file_path,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        usecols=[0, 1, 2],
        names=['id', 'label', 'message'],
        index_col=0,
        dtype={'label': label_dtype},
    )

# One DataFrame per annotation file
train_dfs = [load_dataframe(os.path.join(data_dir, f)) for f in train_files]

tweets_train = (
    pd.concat(train_dfs)
    # Its README mentions 665 duplicate annotations across and within the files of Subtask_A
    .drop_duplicates()
    # Discard records that lack either a label or a message
    .dropna()
    # Shuffle the records; the fixed seed keeps the order reproducible
    .sample(frac=1.0, random_state=42)
)

# Clean and prepare messages
def preprocess_messages(messages):
    """
    Clean raw message strings read with quoting disabled.

    Steps, in order:
      1. decode backslash escape sequences (e.g. a literal ``\\u2019``),
      2. remove at most one quotation mark from each end of the message,
      3. collapse doubled quotation marks (``""``) into a single one.

    Parameters
    ----------
    messages : pandas.Series
        Series of message strings.

    Returns
    -------
    pandas.Series
        The cleaned messages, aligned with the input index.
    """
    # NOTE(review): 'unicode_escape' treats its input as Latin-1, so messages that
    # already contain raw non-ASCII characters may be altered by this step —
    # confirm the files only hold ASCII text with backslash escapes.
    messages = messages.str.decode('unicode_escape', errors='ignore')

    # Remove only the left-most and right-most quotation mark. str.strip('"')
    # would eat a whole run of quotes and drop the closing quote of a message
    # that ends with a quoted phrase (e.g. ...""hi""").
    messages = messages.str.replace(r'^"|"$', '', regex=True)

    # Replace doubled quotation marks by a single quotation mark
    messages = messages.str.replace('""', '"', regex=False)

    return messages

# Swap the raw messages for their cleaned version
tweets_train = tweets_train.assign(message=preprocess_messages(tweets_train['message']))

# Share of each class among the training examples
class_distribution = tweets_train['label'].value_counts() / len(tweets_train)
print(
    f"Total number of examples for training: {len(tweets_train)}\n"
    f"Distribution of classes: \n{class_distribution}"
)
tweets_train.head()

Total number of examples for training: 49675
Distribution of classes: 
neutral     0.448032
positive    0.395994
negative    0.155974
Name: label, dtype: float64


Unnamed: 0_level_0,label,message
id,Unnamed: 1_level_1,Unnamed: 2_level_1
640329403277438976,neutral,[ARIRANG] SIMPLY KPOP - Kim Hyung Jun - Cross ...
640810454730833920,neutral,@TyTomlinson just read a politico article abou...
111344128507392000,neutral,"I just typed in ""the Bazura Project"" into goog..."
641414049083691009,neutral,Fast Lerner: Subpoenaed tech guy who worked on...
637666734300905472,negative,Sony rewards app is like a lot of 19 y.o femal...


In [18]:
# Preview the integer label code of every tweet. The codes are derived here from
# `tweets_train` directly so this cell does not rely on a variable that is only
# defined in a later cell (which fails on a fresh kernel).
tweets_train['label'].cat.codes.to_frame(name='label')

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
640329403277438976,1
640810454730833920,1
111344128507392000,1
641414049083691009,1
637666734300905472,0
...,...
264260341070954497,0
641411364641206277,1
636722845599469568,1
264084248057765888,1


In [19]:
# Encode the labels as integer category codes: 'negative', 'neutral' and 'positive' become 0, 1, 2
tweets_train_y = tweets_train.label.cat.codes
# Category names, positioned so that labels[code] is the name of that code
labels = list(tweets_train['label'].cat.categories)
labels

['negative', 'neutral', 'positive']

In [20]:
# Lookup from label name to its integer code (its position in `labels`)
labels_codes = {label: code for code, label in enumerate(labels)}

labels_codes

{'negative': 0, 'neutral': 1, 'positive': 2}

However, we cannot simply give these raw tweets to a machine learning model and ask it to tell us whether a tweet is positive, negative, or neutral. We first need to perform certain text preprocessing steps.

**Text Processing** - from **Text** to **Vectors**

**I. Tokenization**

In [21]:
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

class TweetTokenizer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn compatible transformer that tokenizes and lemmatizes
    messages with spaCy.

    Each message is whitespace-normalized and lower-cased, then reduced to
    the space-joined lemmas of its alphabetic tokens whose lemma is not a
    stop word. Negation words are taken out of the stop-word set so that
    they survive in the output.
    """

    def __init__(self):
        # initializing spacy pipeline
        self.nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'textcat'])

        # Removing negation words from the default stopwords set:
        # cannot, no, never, nothing, none, without, nor, neither, nobody, nowhere
        # NOTE(review): 'not' is not in this list, so (being a default stop word
        # according to the original comment) it is still filtered out — confirm
        # whether it should be kept as well.
        negation_words = {
            'cannot', 'no', 'never', 'nothing', 'none',
            'without', 'nor', 'neither', 'nobody', 'nowhere',
        }

        # Build a new set rather than editing the one exposed by `nlp.Defaults`,
        # so the change stays local to this tokenizer instance.
        self.stops = set(self.nlp.Defaults.stop_words) - negation_words

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, messages):
        """
        Turn each message into a string of space-separated lemmas.

        Parameters
        ----------
        messages : pandas.Series
            Series of message strings.

        Returns
        -------
        pandas.Series
            Lemmatized messages with the same index and name as the input.
        """
        # Replace all whitespace characters by only one space
        messages = messages.str.replace(r'\s+', ' ', regex=True)
        messages = messages.str.strip()
        messages = messages.str.lower()

        # Keep the lemma of a token if the token is alphabetic and its lemma is
        # not a stop word. nlp.pipe processes the messages as a stream instead
        # of calling the pipeline once per row.
        lemmatized = [
            " ".join(
                token.lemma_
                for token in doc
                if token.is_alpha and token.lemma_.lower() not in self.stops
            )
            for doc in self.nlp.pipe(messages)
        ]
        return pd.Series(lemmatized, index=messages.index, name=messages.name, dtype=object)

# Let's see some examples
# Tokenized messages next to their integer label codes, aligned on the tweet index
tweets_train_tokenized = (
    TweetTokenizer()
    .fit_transform(tweets_train['message'])
    .to_frame(name='message')
    .assign(label=tweets_train_y)
)

tweets_train_tokenized.head()

Unnamed: 0_level_0,message,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
640329403277438976,arirang simply kpop kim hyung jun cross ha yeo...,1
640810454730833920,read politico article donald trump running mat...,1
111344128507392000,type bazura project google image image photo d...,1
641414049083691009,fast lerner subpoena tech guy work hillary pri...,1
637666734300905472,sony reward app like lot female singer non ret...,0


In [22]:
# Save the tokenized messages and their label codes to a CSV file in the working
# directory; index=False leaves the index (the tweet 'id') out of the file.
tweets_train_tokenized.to_csv("tweets_train_tokens.csv", index=False)