In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Read the raw training and test splits, one list entry per line (newline kept).
# 'utf-8-sig' strips a leading BOM if present; errors='ignore' drops undecodable bytes.
with open('tweets/Tweets.14cat.train', 'r', encoding='utf-8-sig', errors='ignore') as f:
    lines = list(f)
with open('tweets/Tweets.14cat.test', 'r', encoding='utf-8-sig', errors='ignore') as f:
    lines_test = list(f)

In [3]:
def remove_links(text):
    """Return ``text`` with every link removed.

    A link is the literal ``http`` followed by one or more non-whitespace
    characters; each such run is replaced by the empty string. Surrounding
    whitespace is left untouched.
    """
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

In [4]:
def preprocess(line):
    """Parse one tab-separated record into ``(tweet_ID, tweet, category)``.

    The line must contain exactly three tab-separated fields, otherwise the
    unpacking raises ``ValueError``. The first field is converted to ``int``,
    the second has its links stripped via ``remove_links``, and the third is
    returned unchanged.
    """
    raw_id, text, label = line.split('\t')
    return int(raw_id), remove_links(text), label

In [5]:
def tokenize(text):
    """Split ``text`` into its maximal runs of word characters (``\\w+``).

    Punctuation and whitespace act as separators and are discarded; case is
    preserved. Returns a list of strings in order of appearance.
    """
    return [match.group() for match in re.finditer(r'\w+', text)]

In [6]:
def build_vocabs_dict(lines):
    """Build a token -> feature-index mapping from raw tweet lines.

    Every non-empty line is parsed with ``preprocess``; its tweet text is
    split with ``tokenize`` and each token is lower-cased. The distinct
    tokens are then sorted and numbered consecutively starting at 0.

    Args:
        lines: Iterable of raw record strings. Newline characters are removed
            and lines that are empty afterwards are skipped.

    Returns:
        dict mapping each lower-cased token to its position in the sorted
        vocabulary.

    Side effects:
        Prints the number of distinct tokens found.
    """
    vocabs = set()
    for line in lines:
        line = line.replace('\n', '')
        if line:
            _, tweet, _ = preprocess(line)
            vocabs.update(t.lower() for t in tokenize(tweet))
    print(len(vocabs))
    # Sorting makes the index assignment deterministic across runs.
    return {token: index for index, token in enumerate(sorted(vocabs))}

In [7]:
# Vocabulary comes from the training split only; the next free index is
# reserved for tokens that are not in it.
feats_dict = build_vocabs_dict(lines)
feats_dict.update({'<UNK>': len(feats_dict)})
len(feats_dict)

10167


10168

In [8]:
with open('tweets/classIDs.txt', 'r', encoding='utf-8-sig') as f:
    raw_IDs = f.readlines()

# Map category name -> class id. The id is kept as the string read from the file.
class_IDs = dict()
for raw_ID in raw_IDs:
    raw_ID = raw_ID.replace('\n', '')
    # Skip blank lines instead of failing on the two-field unpack below,
    # matching how the tweet files are read.
    if not raw_ID:
        continue
    _class, _id = raw_ID.split('\t')
    class_IDs[_class] = _id

class_IDs

{'Autos & Vehicles': '1',
 'Comedy': '2',
 'Education': '3',
 'Entertainment': '4',
 'Film & Animation': '5',
 'Gaming': '6',
 'Howto & Style': '7',
 'Music': '8',
 'News & Politics': '9',
 'Nonprofits & Activism': '10',
 'Pets & Animals': '11',
 'Science & Technology': '12',
 'Sports': '13',
 'Travel & Events': '14'}

In [9]:
def vectorize(line, vocabs, classes):
    """Encode one raw tweet record as a sparse binary feature string.

    The result has the form ``"<class_id> <feat>:1 <feat>:1 ... #<tweet_ID>"``
    where the feature indices are distinct and in ascending order.

    Args:
        line: One record string as accepted by ``preprocess`` (no trailing
            newline, otherwise it ends up in the category).
        vocabs: Mapping from lower-cased token to feature index. Tokens that
            are missing from it are encoded as ``vocabs['<UNK>']``.
        classes: Mapping from category name to class id.

    Returns:
        The encoded record as a single string.

    Raises:
        KeyError: If the category is not in ``classes``, or an unknown token
            is met and ``vocabs`` has no ``'<UNK>'`` entry.
    """
    tweet_ID, tweet, category = preprocess(line)
    tokens = tokenize(tweet)
    class_id = classes[category]
    feats = set()
    for tok in tokens:
        key = tok.lower()
        # Test membership, not truthiness: feature index 0 is valid but falsy
        # and would otherwise be written out as <UNK>.
        if key in vocabs:
            feats.add(vocabs[key])
        else:
            feats.add(vocabs['<UNK>'])
    parts = [str(class_id)]
    parts.extend(str(feat) + ':1' for feat in sorted(feats))
    parts.append('#' + str(tweet_ID))
    return ' '.join(parts)

# Spot-check the encoder on the first training line (newline removed first).
vectorize(lines[0].replace('\n', ''), feats_dict, class_IDs)

'11 2342:1 3114:1 3515:1 3646:1 3908:1 3932:1 5192:1 5595:1 6104:1 6473:1 8115:1 8304:1 8978:1 #45029314109075046'

In [10]:
def generate_vectorized_data(lines, vocabs, classes, split='train'):
    """Write the encoded form of every non-empty line to ``feats.<split>``.

    Args:
        lines: Iterable of raw record strings; newline characters are removed
            and lines that are empty afterwards are skipped.
        vocabs: Token -> feature-index mapping passed through to ``vectorize``.
        classes: Category -> class-id mapping passed through to ``vectorize``.
        split: Suffix of the output file name (``'feats.' + split``).

    The output file is created in the current working directory and
    overwritten if it already exists; one encoded record is written per line.
    """
    with open('feats.' + split, 'w+') as out:
        for raw in lines:
            record = raw.replace('\n', '')
            if not record:
                continue
            out.write(vectorize(record, vocabs, classes) + '\n')

In [11]:
# Encode both splits with the same training-derived vocabulary and class table.
for split_name, split_lines in (('train', lines), ('test', lines_test)):
    generate_vectorized_data(split_lines, feats_dict, class_IDs, split=split_name)