In [2]:
# Always configure logging first: messages at INFO level and above are then
# printed as "<timestamp> : <level> : <message>" for everything that runs afterwards.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Convert tweets to vectors #

### Read and create CSV corpus class ###

In [4]:
import csv


class TrainCorpus(object):
    """Re-iterable stream of ``[gender, description, tweet]`` rows read from a CSV file.

    The file is opened anew on every iteration, so the corpus can be
    traversed several times without holding the data in memory.

    Args:
        file_path: Path of the CSV file to read.
        gender_index: Zero-based column holding the gender label (default 5).
        description_index: Zero-based column holding the profile description
            (default 10).
        tweet_index: Zero-based column holding the tweet text (default 19).

    Yields:
        A three-element list ``[gender, description, tweet]`` for every row
        that has enough columns to contain all three fields. The header row
        is not treated specially.

    NOTE(review): the reader runs with ``csv.QUOTE_NONE``, so quote characters
    are ordinary data and a comma inside a quoted field still splits the
    field. Rows containing such fields end up with shifted columns -- confirm
    that this is intended for the data set.
    """

    def __init__(self, file_path, gender_index=5, description_index=10, tweet_index=19):
        self.file_path = file_path
        self.gender_index = gender_index
        self.description_index = description_index
        self.tweet_index = tweet_index

    def __iter__(self):
        # Shortest row that still contains every requested column. Deriving it
        # from the indexes keeps the guard correct when an index is changed
        # (with the defaults it evaluates to the previously hard-coded 20).
        min_columns = max(self.gender_index, self.description_index, self.tweet_index) + 1

        with open(self.file_path, newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', doublequote=False, quoting=csv.QUOTE_NONE)
            for row in csv_reader:
                # Skip truncated rows instead of failing with an IndexError.
                if len(row) < min_columns:
                    continue
                yield [row[self.gender_index], row[self.description_index], row[self.tweet_index]]

### Preprocess data class ###

In [5]:
from gensim.utils import tokenize
from nltk.corpus import stopwords
import re


class PreprocessedCorpus(object):
    """Cleans and tokenizes the rows of a ``[gender, description, tweet]`` corpus.

    Args:
        corpus: Any re-iterable whose items expose the gender label at index 0,
            the profile description at index 1 and the tweet text at index 2.

    Yields:
        ``[gender, filtered_words]`` for every row whose gender is one of
        ``'male'``, ``'female'`` or ``'brand'``. ``filtered_words`` contains
        the tokens of the description followed by the tokens of the tweet,
        with URLs and English stop words removed. Rows with any other gender
        value (including a CSV header row) are skipped.
    """

    # URL matcher (scheme://host followed by optional path segments), compiled
    # once instead of being re-parsed for every field of every row.
    _URL_PATTERN = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', flags=re.MULTILINE)

    # Gender labels that are kept; everything else is dropped.
    _LABELS = ('male', 'female', 'brand')

    def __init__(self, corpus):
        self.corpus = corpus

    def __iter__(self):
        # Build the stop-word set once per pass: calling stopwords.words() for
        # every token is needlessly slow, and a set gives O(1) membership tests.
        stop_words = frozenset(stopwords.words('english'))

        # Iterate the corpus handed to the constructor, not a notebook global,
        # so the class works with any corpus and on a freshly started kernel.
        for line in self.corpus:
            gender = line[0]
            description = line[1]
            text = line[2]
            if gender not in self._LABELS:
                continue

            # remove urls
            cleaned_descr = self._URL_PATTERN.sub('', description)
            cleaned_text = self._URL_PATTERN.sub('', text)

            # tokenize (deacc=True strips accents); description tokens come first
            words = list(tokenize(cleaned_descr, deacc=True))
            words.extend(tokenize(cleaned_text, deacc=True))

            # remove stop words (comparison is case-sensitive, tokens are not lower-cased)
            filtered_words = [word for word in words if word not in stop_words]

            yield [gender, filtered_words]



2017-06-22 02:37:26,196 : INFO : 'pattern' package not found; tag filters are not available for English


In [6]:
from gensim.models.keyedvectors import KeyedVectors
from functools import reduce


class W2VData(object):
    """Turns ``[label, tokens]`` records into ``[label, centroid]`` pairs.

    The centroid is the arithmetic mean of the word vectors of all tokens
    that the loaded word2vec model knows.

    Args:
        preprocessed_data: Iterable of items with the label at index 0 and a
            sequence of tokens at index 1.
        model_path: Path of a word2vec model in the binary word2vec format.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, preprocessed_data, model_path):
        self.preprocessed_data = preprocessed_data
        self.model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        # Every out-of-vocabulary token encountered, in order and with
        # duplicates; it keeps growing across repeated iterations.
        self.not_in_vocab = []

    def __iter__(self):
        for record in self.preprocessed_data:
            label, tokens = record[0], record[1]

            # Look up each token; unknown ones are only recorded, not vectorized.
            vectors = []
            for token in tokens:
                if token not in self.model:
                    self.not_in_vocab.append(token)
                    continue
                vectors.append(self.model[token])

            # A record without a single known token has no centroid: drop it.
            if not vectors:
                continue

            if len(vectors) == 1:
                # A lone vector is its own centroid and is passed through as is.
                centroid = vectors[0]
            else:
                total = reduce(lambda acc, vec: acc + vec, vectors)
                centroid = total / len(vectors)

            yield [label, centroid]

## Pipeline ##

In [7]:
# Chain the three stages; each one is lazy, so nothing is read from the CSV
# until the final stage is iterated (the word2vec model is loaded right away
# by the W2VData constructor).
train_corpus = TrainCorpus('train-data/gender-classifier-DFE-791531.csv')
preprocessed_corpus = PreprocessedCorpus(train_corpus)
w2v_data = W2VData(preprocessed_corpus, 'models/GoogleNews-vectors-negative300.bin')

# One full pass over the pipeline: counts the records that produced a centroid
# and, as a side effect, fills w2v_data.not_in_vocab.
count = 0
for line in w2v_data:
    count += 1

# `count` is the number of records, not of words, and not_in_vocab holds token
# occurrences (duplicates included) -- the labels say so explicitly.
print('Number of records converted to vectors: ', count)
print('Words not available in the vocabulary (count = %s):' % len(w2v_data.not_in_vocab))

2017-06-22 02:37:27,870 : INFO : loading projection weights from models/GoogleNews-vectors-negative300.bin


2017-06-22 02:38:13,327 : INFO : loaded (3000000, 300) matrix from models/GoogleNews-vectors-negative300.bin


Number of words as vecs:  17627
Words not available in the vocabolarity (count = 26682):
