In [0]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import warnings
import logging
from gensim.models import word2vec
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import time

In [0]:
# The three data files share one layout: tab-separated with a header row.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside a review stay literal text.
train, test, unlabeled_train = (
    pd.read_csv(path, header=0, delimiter='\t', quoting=3)
    for path in ('labeledTrainData.tsv',
                 'testData.tsv',
                 'unlabeledTrainData.tsv')
)

In [3]:
# Number of reviews in each data set (len() of a Series equals its .size)
print('Train reviews {}'.format(len(train['review'])))
print('Test reviews {}'.format(len(test['review'])))
print('Unlabeled train reviews {}'.format(len(unlabeled_train['review'])))

Train reviews 25000
Test reviews 25000
Unlabeled train reviews 50000


In [0]:
def review_to_wordlist(review, remove_stopwords=False):
    '''Convert a raw review into a list of lowercase words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        If True, drop every word found in NLTK's English stop-word list.

    Returns
    -------
    list of str
        Lowercase words in their original order (a list, not a string).
    '''
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the extracted text could
    # differ between environments. 'html.parser' ships with Python.
    review_text = BeautifulSoup(review, 'html.parser').get_text()

    # Everything that is not an ASCII letter (digits, punctuation) becomes a space
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        # A set gives constant-time membership tests
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]

    return words

In [5]:
# Download the Punkt tokenizer data; it is loaded later for sentence splitting
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Download the stop-word lists read by stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Load the English Punkt sentence tokenizer from the downloaded NLTK data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    '''Split a review into sentences, each sentence being a list of words.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object
        Sentence splitter exposing tokenize(text) and returning sentence strings.
    remove_stopwords : bool, optional
        Forwarded unchanged to review_to_wordlist.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in the original order.
    '''
    # Strip surrounding whitespace, then split the review into raw sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    # Skip empty sentences; turn every other one into a list of words
    return [review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0]

In [9]:
# Suppress the bs4 warnings raised when a review contains links
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# Input for word2vec: one flat list of sentences, each sentence a list of words
sentences = []

print('Parsing sentences from training set')
for review in train['review']:
    sentences.extend(review_to_sentences(review, tokenizer))

print('Parsing sentences from unlabeled set')
for review in unlabeled_train['review']:
    sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set
Parsing sentences from unlabeled set


In [10]:
# Total number of parsed sentences, followed by the first one as a sample
print(len(sentences), sentences[0], sep='\n')

795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [0]:
# Show INFO-level log records with a timestamp, so training progress is visible
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [0]:
# Word2Vec settings; each value is passed to the Word2Vec constructor below
# under the keyword named in its comment.
num_features = 300    # size: word vector dimensionality
min_word_count = 40   # min_count: minimum word count
num_workers = 4       # workers: number of threads to run in parallel
context = 10          # window: context window size
downsampling = 1e-3   # sample: downsample setting for frequent words

In [13]:
print('Training model...')
# Word2Vec is the same class as word2vec.Word2Vec; both are imported at the top
model = Word2Vec(sentences=sentences,
                 size=num_features,
                 window=context,
                 min_count=min_word_count,
                 sample=downsampling,
                 workers=num_workers)

2019-07-28 17:03:11,994 : INFO : collecting all words and their counts
2019-07-28 17:03:11,996 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-28 17:03:12,065 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2019-07-28 17:03:12,128 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types


Training model...


2019-07-28 17:03:12,194 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2019-07-28 17:03:12,265 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2019-07-28 17:03:12,330 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2019-07-28 17:03:12,400 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2019-07-28 17:03:12,470 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2019-07-28 17:03:12,535 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2019-07-28 17:03:12,609 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2019-07-28 17:03:12,675 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2019-07-28 17:03:12,740 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 

In [14]:
# Precompute the L2-normalized word vectors and, because replace=True, keep only
# those. The model is read-only afterwards and must not be trained further.

model.init_sims(replace=True)

2019-07-28 17:06:44,998 : INFO : precomputing L2-norms of word weight vectors


In [15]:
# The file name mirrors the settings used above (300 features, min count 40,
# window 10). It is a literal, so update it by hand if those settings change.
model_name = '300features_40minwords_10context'
model.save(model_name)

2019-07-28 17:06:45,159 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2019-07-28 17:06:45,168 : INFO : not storing attribute vectors_norm
2019-07-28 17:06:45,177 : INFO : not storing attribute cum_table
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-28 17:06:45,863 : INFO : saved 300features_40minwords_10context


In [16]:
# Find the word that fits least with the others in the list.
# Silence FutureWarning only: a blanket filterwarnings('ignore') would hide
# every later warning in the notebook, including numeric RuntimeWarnings.
warnings.filterwarnings('ignore', category=FutureWarning)
model.wv.doesnt_match('man woman child kitchen'.split())

'kitchen'

In [17]:
# Same odd-one-out query, this time with three countries and one city
model.wv.doesnt_match('france england germany berlin'.split())

'berlin'

In [18]:
# Words most similar to 'plot', returned as (word, similarity score) pairs
model.wv.most_similar('plot')

[('storyline', 0.8043239116668701),
 ('story', 0.6558642387390137),
 ('plots', 0.6404682397842407),
 ('premise', 0.6055775880813599),
 ('narrative', 0.5908520221710205),
 ('script', 0.5694664716720581),
 ('plotline', 0.5232638120651245),
 ('dialog', 0.5013354420661926),
 ('continuity', 0.49053072929382324),
 ('scenario', 0.48579367995262146)]

In [19]:
# Reload the saved model from disk (same literal file name that was used to save it)
model = Word2Vec.load('300features_40minwords_10context')

2019-07-28 17:06:48,560 : INFO : loading Word2Vec object from 300features_40minwords_10context
2019-07-28 17:06:48,933 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2019-07-28 17:06:48,934 : INFO : setting ignored attribute vectors_norm to None
2019-07-28 17:06:48,935 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2019-07-28 17:06:48,942 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2019-07-28 17:06:48,945 : INFO : setting ignored attribute cum_table to None
2019-07-28 17:06:48,947 : INFO : loaded 300features_40minwords_10context


In [20]:
# model.wv is the keyed-vectors object; model.wv[word] gives that word's numpy vector
print(type(model.wv))

<class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>


In [21]:
# Matrix with one feature vector per vocabulary word (rows: words, columns: features)
print(type(model.wv.vectors))
print(model.wv.vectors.shape)
# Example: model.wv['cat'] returns the feature vector of one specific word

<class 'numpy.ndarray'>
(16490, 300)


In [22]:
# Vocabulary as a list of words. The clustering code later zips this list with
# the per-row cluster indices, so it relies on the order matching model.wv.vectors.
print(model.wv.index2word[:10])
print('Number of words in vocabulary:', len(model.wv.index2word))

['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
Number of words in vocabulary: 16490


In [0]:
def make_feature_vector(words, model, num_features):
    '''Average the vectors of all in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        Words of a single review.
    model : object
        Model exposing model.wv.index2word (the vocabulary) and
        model.wv[word] (the vector of a vocabulary word).
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (num_features,) holding the mean vector, or
        all zeros when none of the words is in the vocabulary.
    '''
    feature_vector = np.zeros((num_features,), dtype='float32')
    nwords = 0  # number of in-vocabulary words contributing to the average

    # A set makes the per-word vocabulary check constant time
    index2word_set = set(model.wv.index2word)

    for word in words:
        if word in index2word_set:
            nwords += 1
            # Accumulate this word's vector into the running sum
            feature_vector = np.add(feature_vector, model.wv[word])

    # Dividing by zero would return a vector of NaNs, which the downstream
    # classifier cannot consume; return the zero vector instead.
    if nwords == 0:
        return feature_vector

    return np.divide(feature_vector, nwords)

In [0]:
def get_avg_feature_vectors(reviews, model, num_features):
    '''Compute the average feature vector of every review.

    Parameters
    ----------
    reviews : sequence of list of str
        One word list per review; must support len().
    model : object
        Word-vector model, forwarded to make_feature_vector.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(reviews), num_features); row i is the
        averaged vector of reviews[i].
    '''
    review_feature_vectors = np.zeros((len(reviews), num_features), dtype='float32')

    for counter, review in enumerate(reviews):
        # Progress message every 1000 reviews
        if counter % 1000 == 0:
            print('Review {0} of {1}'.format(counter, len(reviews)))

        # Average feature vector of this review
        review_feature_vectors[counter] = make_feature_vector(review, model, num_features)

    return review_feature_vectors

In [25]:
# Average feature vectors for the labeled train set and the test set.
# Stop words are removed for this step.

print('Creating feature vectors for train reviews...')
clean_train_reviews = [review_to_wordlist(review, remove_stopwords=True)
                       for review in train['review']]
train_data_vectors = get_avg_feature_vectors(clean_train_reviews, model, num_features)

print('Creating feature vectors for test reviews...')
clean_test_reviews = [review_to_wordlist(review, remove_stopwords=True)
                      for review in test['review']]
test_data_vectors = get_avg_feature_vectors(clean_test_reviews, model, num_features)

Creating feature vectors for train reviews...
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating feature vectors for test reviews...
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 1700

In [26]:
print('Fitting a random forest to labeled training data...')
# 100 trees trained on the averaged word vectors, with sentiment as the target
forest = RandomForestClassifier(n_estimators=100)
# Last expression of the cell, so the fitted estimator's repr is displayed
forest.fit(X=train_data_vectors, y=train['sentiment'])

Fitting a random forest to labeled training data...


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
# Predict a sentiment label for every test review from its averaged word vector
result = forest.predict(test_data_vectors)

In [0]:
# Write the predictions to CSV with columns id and sentiment; quoting=3 is csv.QUOTE_NONE
output = pd.DataFrame({'id': test['id'], 'sentiment': result})
output.to_csv('Word2Vec_AverageVectors.csv', index=False, quoting=3) # recorded score: 0.83672 ROC AUC

### Grouping vectors in clusters

In [0]:
start = time.time() # start of the timed section; read again when the elapsed time is computed
word_vectors = model.wv.vectors # vectors for words in vocabulary
num_clusters = word_vectors.shape[0] // 5 # one cluster per five vocabulary words (integer division)

In [0]:
# Cluster the word vectors; idx[i] is the cluster index assigned to row i of word_vectors
kmeans = KMeans(n_clusters=num_clusters)
idx = kmeans.fit_predict(word_vectors) # cluster centroid index for each word in the vocabulary

In [31]:
# Report the wall-clock time elapsed since `start` was taken, then the estimator's settings
end = time.time()
elapsed = end - start
print('Time for k-Means clustering with {0} clusters: {1} seconds'.format(num_clusters,
                                                                          elapsed))
print(kmeans)

Time for k-Means clustering with 3298 clusters: 1680.4199352264404 seconds
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3298, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)


In [0]:
# Map every vocabulary word (key) to the index of its cluster centroid (value)

word_centroid_map = {
    word: centroid
    for word, centroid in zip(model.wv.index2word, idx)
}

In [33]:
# Explore the first 10 clusters

for cluster in range(10):
    print('Cluster #', cluster)

    # One pass over the map per cluster. Rebuilding list(values()) and
    # list(keys()) for every index would be quadratic in the vocabulary size.
    words = [word
             for word, centroid in word_centroid_map.items()
             if centroid == cluster]
    print(words)

Cluster # 0
['james', 'sir', 'ian', 'craig', 'ned', 'olivier', 'liam', 'everett', 'reliable', 'sterling', 'rupert', 'scrooge', 'laurence', 'laughton', 'granger', 'alain', 'rea', 'jackman', 'oates', 'fishburne', 'holm', 'mcshane']
Cluster # 1
['industrial', 'traditions', 'freely', 'customs']
Cluster # 2
['horse', 'bike']
Cluster # 3
['achieves', 'avoids', 'entertains', 'defines', 'renders', 'ensures', 'elevates', 'accomplishes', 'provokes']
Cluster # 4
['ahem', 'decrepit']
Cluster # 5
['casino', 'mines', 'ivory', 'rightful']
Cluster # 6
['bart', 'kolchak', 'joss', 'crockett', 'fatty']
Cluster # 7
['dropping', 'vomiting']
Cluster # 8
['steam', 'pi', 'stroll', 'crawls', 'slammed', 'slides', 'bounces']
Cluster # 9
['sweet', 'gentle', 'passionate', 'tender']


In [0]:
# Create bags of centroids

def create_bag_of_centroids(wordlist, word_centroid_map, n_clusters=None):
    '''Count how many words of a review fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Words of a single review.
    word_centroid_map : dict
        Maps a vocabulary word to the index of its cluster.
    n_clusters : int, optional
        Length of the returned vector. Defaults to the notebook-level
        num_clusters, which keeps existing two-argument calls working.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (n_clusters,); entry k is the number of words
        in wordlist assigned to cluster k. Words missing from the map are ignored.
    '''
    if n_clusters is None:
        n_clusters = num_clusters  # global defined in the clustering cell

    bag_of_centroids = np.zeros((n_clusters,), dtype='float32')

    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [0]:
# Bag-of-centroids matrices: one row per review, one column per cluster

train_centroids = np.zeros((train['review'].shape[0], num_clusters), dtype='float32')
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

test_centroids = np.zeros((test['review'].shape[0], num_clusters), dtype='float32')
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [36]:
print('Fitting a random forest to labeled training data...')

# fit() returns the estimator itself, so construction and fitting can be chained
forest = RandomForestClassifier(n_estimators=100).fit(train_centroids, train['sentiment'])
result = forest.predict(test_centroids)

Fitting a random forest to labeled training data...


In [0]:
# Write the bag-of-centroids predictions to CSV (columns id and sentiment); quoting=3 is csv.QUOTE_NONE
output = pd.DataFrame({'id': test['id'], 'sentiment': result})
output.to_csv('BagOfCentroids.csv', index=False, quoting=3) # recorded score: 0.84496 ROC AUC