In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from nltk.stem import PorterStemmer
import preprocessor as p

# Shared Porter stemmer instance; the vocabulary builder and the vectorizer
# below both call ps.stem() on each token.
ps = PorterStemmer()

In [2]:
# Load the stop-word list: one entry per line, newline characters removed.
with open('englishST.txt', 'r', encoding='utf-8-sig') as f:
    stop = [entry.replace('\n', '') for entry in f]

In [3]:
# Read the raw training and test records, one list element per file line.
# Undecodable bytes are dropped (errors='ignore').
with open('tweets/Tweets.14cat.train', 'r', encoding='utf-8-sig', errors='ignore') as f:
    lines = list(f)
with open('tweets/Tweets.14cat.test', 'r', encoding='utf-8-sig', errors='ignore') as f:
    lines_test = list(f)

In [4]:
# Exploration: run the tweet preprocessor over the whole first raw line
# (the ID and category fields are part of the input here).
p.tokenize(lines[0])

'$NUMBER$ Furniture for - so cute! gotta show my $HASHTAG$ mama the last one especially $SMILEY$ $URL$ $URL$ Pets & Animals'

In [5]:
# Exploration: run the tweet preprocessor over only the second tab-separated
# field of the third raw line.
p.tokenize(lines[2].split('\t')[1])

'CATS ART $URL$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$ $HASHTAG$'

In [6]:
def remove_links(text):
    """Return ``text`` with every ``http...`` run of non-whitespace removed.

    Only the matched characters are deleted; surrounding whitespace is kept.
    """
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

In [7]:
def preprocess(line):
    """Split one tab-separated record into ``(tweet_ID, tweet, category)``.

    The ID is converted to ``int`` and links are stripped from the tweet text
    via ``remove_links``. The category field is returned unchanged.

    Raises:
        ValueError: if the line does not split into exactly three fields or
            the ID field is not an integer literal.
    """
    raw_id, raw_tweet, category = line.split('\t')
    return int(raw_id), remove_links(raw_tweet), category

In [8]:
def tokenize(text):
    """Return the word tokens of ``text`` in order of appearance.

    A token is a run of word characters, optionally prefixed by a single
    '#' so that hashtags are kept as their own terms.
    """
    token_pattern = re.compile(r'[#]?\w+')
    return token_pattern.findall(text)

In [9]:
def build_vocabs_dict(lines):
    """Build the term <-> feature-index mappings from raw dataset lines.

    Each non-empty line is parsed with ``preprocess``; its tweet text is
    tokenized, lowercased, filtered against the stop-word list ``stop`` and
    stemmed with ``ps``. The distinct stems are sorted and numbered from 0.

    The vocabulary size is printed as a side effect.

    Args:
        lines: Iterable of raw tab-separated records (a trailing newline is
            tolerated; empty lines are skipped).

    Returns:
        A pair ``(dic, dic_reverse)`` where ``dic`` maps term -> index and
        ``dic_reverse`` maps index -> term.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    stop_words = set(stop)

    vocabs = set()
    for line in lines:
        line = line.replace('\n', '')
        if not line:
            continue
        _, tweet, _ = preprocess(line)
        for tok in tokenize(tweet):
            tok = tok.lower()
            if tok not in stop_words:
                vocabs.add(ps.stem(tok))
    print(len(vocabs))

    # Sorting makes the index assignment deterministic across runs.
    vocabs = sorted(vocabs)
    dic = {term: idx for idx, term in enumerate(vocabs)}
    dic_reverse = dict(enumerate(vocabs))
    return dic, dic_reverse

In [10]:
# Build the term <-> feature-index mappings from the training lines only.
feats_dict, feats_dict_reverse = build_vocabs_dict(lines)
# Disabled experiment: a dedicated feature for out-of-vocabulary terms.
# feats_dict['<UNK>'] = len(feats_dict)
# Reserve the next free index for the '#' (hashtag) feature. Only feats_dict
# receives this entry; feats_dict_reverse is not updated.
feats_dict['#'] = len(feats_dict)

9570


In [11]:
# Read the category-name -> class-id table (one "<name>\t<id>" pair per line).
with open('tweets/classIDs.txt', 'r', encoding='utf-8-sig') as f:
    raw_IDs = list(f)

# Ids are kept as strings, exactly as they appear in the file.
class_IDs = {}
for raw_ID in raw_IDs:
    raw_ID = raw_ID.replace('\n', '')
    _class, _id = raw_ID.split('\t')
    class_IDs[_class] = _id

class_IDs

{'Autos & Vehicles': '1',
 'Comedy': '2',
 'Education': '3',
 'Entertainment': '4',
 'Film & Animation': '5',
 'Gaming': '6',
 'Howto & Style': '7',
 'Music': '8',
 'News & Politics': '9',
 'Nonprofits & Activism': '10',
 'Pets & Animals': '11',
 'Science & Technology': '12',
 'Sports': '13',
 'Travel & Events': '14'}

In [12]:
def vectorize(line, vocabs, classes, vocabs_reverse):
    """Turn one raw dataset line into a sparse feature string.

    The result has the form ``<class_id> <feat>:<value> ... #<tweet_ID>``,
    with term features listed in increasing index order.

    Feature values:
      * a term whose vocabulary string contains '#' gets ``count + 1.0``,
        where ``count`` is its number of occurrences in the tweet;
      * every other term gets ``1`` regardless of how often it occurs;
      * if the tweet contains hashtags, the ``vocabs['#']`` feature is
        appended with the number of hashtags as its value.

    Args:
        line: One tab-separated record (tweet ID, tweet text, category)
            with the trailing newline already removed.
        vocabs: Mapping term -> feature index; must contain the key '#'.
        classes: Mapping category name -> class id.
        vocabs_reverse: Mapping feature index -> term.

    Returns:
        The feature string for this tweet.

    Raises:
        KeyError: if the category is not a key of ``classes``.
        ValueError: propagated from ``preprocess`` for malformed lines.
    """
    tweet_ID, tweet, category = preprocess(line)
    class_id = classes[category]

    feats = []
    for tok in tokenize(tweet):
        # Same normalisation as the vocabulary builder: lowercase, drop
        # stop words, then stem.
        tok = tok.lower()
        if tok in stop:
            continue
        idx = vocabs.get(ps.stem(tok))
        # Out-of-vocabulary terms are skipped (no <UNK> feature).
        # NOTE(review): this truthiness test also skips the term mapped to
        # index 0. Kept as in the original because the consumer of the
        # output file is not visible here -- confirm whether feature index 0
        # is acceptable downstream before switching to `is not None`.
        if idx:
            feats.append(idx)

    parts = [str(class_id)]
    # Counter over the sorted list yields features in increasing index order.
    for feat, count in Counter(sorted(feats)).items():
        if '#' in vocabs_reverse[feat]:
            # Hashtag terms are up-weighted by their occurrence count.
            parts.append(str(feat) + ':' + str(count + 1.0))
        else:
            parts.append(str(feat) + ':' + str(1))

    # Number of hashtags in the tweet. The placeholder is counted as a
    # literal substring of the preprocessor output; the previous regex
    # search treated '$' as an end-of-string anchor and scanned the raw
    # tweet, so it always produced 0.
    hashtag_count = p.tokenize(tweet).count('$HASHTAG$')
    if hashtag_count:
        parts.append(str(vocabs['#']) + ':' + str(hashtag_count))

    parts.append('#' + str(tweet_ID))
    return ' '.join(parts)

# Spot-check the output format on a single training record.
vectorize(lines[100].replace('\n', ''), feats_dict, class_IDs, feats_dict_reverse)

'12 4358:1 5195:1 5396:1 5637:1 5674:1 5701:1 5814:1 6498:1 6574:1 7426:1 7992:1 8399:1 #45189568820570931'

In [13]:
def generate_vectorized_data(lines, vocabs, classes, vocabs_reverse, split='train'):
    """Vectorize every non-empty record and write the results to ``feats.<split>``.

    Each record has its newline characters removed, is converted with
    ``vectorize`` and written on its own line. The output file is created or
    truncated in the current working directory.

    Args:
        lines: Iterable of raw tab-separated records.
        vocabs: Mapping term -> feature index, forwarded to ``vectorize``.
        classes: Mapping category name -> class id, forwarded to ``vectorize``.
        vocabs_reverse: Mapping feature index -> term, forwarded to ``vectorize``.
        split: Suffix of the output file name.
    """
    out_path = 'feats.' + split
    with open(out_path, 'w+') as out:
        for raw in lines:
            record = raw.replace('\n', '')
            if not record:
                continue
            out.write(vectorize(record, vocabs, classes, vocabs_reverse) + '\n')

In [14]:
# Write feats.train (default split) and feats.test. Both splits are encoded
# with the vocabulary that was built from the training lines.
generate_vectorized_data(lines, feats_dict, class_IDs, feats_dict_reverse)
generate_vectorized_data(lines_test, feats_dict, class_IDs, feats_dict_reverse, split='test')