In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
files = [
    'credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv',
#    'credco_webconf_study_2_study_2_project_1_2018_02_21t22_44_07_00_00_anon_nolink.csv',
#    'credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv'
]

In [3]:
file_paths = [os.path.join('credibilitycoalition-webconf-2018', 'data', file) for file in files]
print(file_paths)

['credibilitycoalition-webconf-2018/data/credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv']


In [4]:
data = []
labels = []
for file_path in file_paths:
    with open(file_path) as csvfile:
        csv_reader = csv.reader(csvfile)
        for idx, row in enumerate(csv_reader):
            if idx == 0:
                labels.append(row)
            if idx > 0:
                data.append(row)

In [5]:
# labels are of different size but the largest one is a superset of all the others
# so we'll use the labels array that is the largest in size
label = labels[np.argmax([len(label) for label in labels])]

In [6]:
# function will extract a column of data given the index
get_data_col = lambda data, idx: [col[idx] for col in data if idx < len(col)]

# function will extract data columns given titles from the master label
get_data_col_from_titles = lambda data, titles, label: [get_data_col(data, label.index(col)) for col in titles] 

# Get Report Data

In [7]:
import re

r = re.compile('report_title')
report_title_labels = list(filter(r.match, label))
print('There are {} report_title_label columns'.format(len(report_title_labels)))
print(report_title_labels)

r = re.compile('media_content')
media_content_labels = list(filter(r.match, label))
print('There are {} media_content_label columns'.format(len(media_content_labels)))
print(media_content_labels)

r = re.compile('media_url')
media_urls_labels = list(filter(r.match, label))
print('There are {} media_urls_label columns'.format(len(media_urls_labels)))
print(media_urls_labels)

There are 1 report_title_label columns
['report_title']
There are 1 media_content_label columns
['media_content']
There are 1 media_urls_label columns
['media_url']


In [8]:
report_titles = get_data_col_from_titles(data, report_title_labels, label)[0]
print('There are {} report_title rows'.format(len(report_titles)))

media_content = get_data_col_from_titles(data, media_content_labels, label)[0]
print('There are {} media_content rows'.format(len(media_content)))

media_urls = get_data_col_from_titles(data, media_urls_labels, label)[0]
print('There are {} media_url rows'.format(len(media_urls)))

There are 50 report_title rows
There are 50 media_content rows
There are 50 media_url rows


In [9]:
print('There are {} unique report_title rows'.format(len(set(report_titles))))
print('There are {} unique media_content rows'.format(len(set(media_content))))
print('There are {} unique media_url rows'.format(len(set(media_urls))))

There are 47 unique report_title rows
There are 46 unique media_content rows
There are 50 unique media_url rows


In [10]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
print(media_content[2])

The city's gung-ho approach to development has destroyed the area's natural ability to drain away hurricane floodwaters.


In [12]:
content = "[CLS] " + media_content[2] + " [SEP]"

In [13]:
# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(content)

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

In [14]:
print(tokenized_text)
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

['[CLS]', 'the', 'city', "'", 's', 'gun', '##g', '-', 'ho', 'approach', 'to', 'development', 'has', 'destroyed', 'the', 'area', "'", 's', 'natural', 'ability', 'to', 'drain', 'away', 'hurricane', 'flood', '##water', '##s', '.', '[SEP]']
[CLS]           101
the           1,996
city          2,103
'             1,005
s             1,055
gun           3,282
##g           2,290
-             1,011
ho            7,570
approach      3,921
to            2,000
development   2,458
has           2,038
destroyed     3,908
the           1,996
area          2,181
'             1,005
s             1,055
natural       3,019
ability       3,754
to            2,000
drain        12,475
away          2,185
hurricane     7,064
flood         7,186
##water       5,880
##s           2,015
.             1,012
[SEP]           102


In [15]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
#model.eval()

In [16]:
print(tokens_tensor)
print(tokens_tensor.size())

print(segments_tensors)
print(segments_tensors.size())

tensor([[  101,  1996,  2103,  1005,  1055,  3282,  2290,  1011,  7570,  3921,
          2000,  2458,  2038,  3908,  1996,  2181,  1005,  1055,  3019,  3754,
          2000, 12475,  2185,  7064,  7186,  5880,  2015,  1012,   102]])
torch.Size([1, 29])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])
torch.Size([1, 29])


In [17]:
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [25]:
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
print(encoded_layers[0].size())
token_embeddings = torch.stack(encoded_layers, dim=0)
print(token_embeddings.size())

# Remove dimension 1, the "batches".
token_embeddings = torch.squeeze(token_embeddings, dim=1)
print(token_embeddings.size())

# Swap dimensions 0 and 1.
token_embeddings = token_embeddings.permute(1,0,2)
print(token_embeddings.size())

torch.Size([1, 29, 768])
torch.Size([12, 1, 29, 768])
torch.Size([12, 29, 768])
torch.Size([29, 12, 768])


In [19]:
# Stores the token vectors, with shape [22 x 3,072]
token_vecs_cat = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    
    # `token` is a [12 x 768] tensor

    # Concatenate the vectors (that is, append them together) from the last 
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    
    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

Shape is: 29 x 3072


In [20]:
for i, (token_str, token_vec) in enumerate(zip(tokenized_text, token_vecs_cat)):
    print(i, token_str, token_vec.numpy())

0 [CLS] [-0.31816742  0.05438105  0.19741799 ... -0.4546941   0.21156302
  0.6953806 ]
1 the [-0.60313344 -0.31244376  0.26959664 ... -0.24248172  0.02819252
 -1.0843672 ]
2 city [-0.33740592 -0.24141443  0.24089587 ... -0.36435008  0.33033687
 -0.61303365]
3 ' [-0.26656497 -0.970989    0.47505647 ... -1.7185882  -0.11934346
  0.14042984]
4 s [-0.4484266  -1.2585407  -0.1768176  ... -1.5263525   0.27928066
  0.5566943 ]
5 gun [-0.02599564 -0.8469635   1.0729356  ... -0.53957427  0.29509512
  0.16516672]
6 ##g [-0.22292222 -0.6756145   1.0609485  ...  0.6896161  -0.93699056
  0.21629222]
7 - [-0.23356818 -0.5371182   0.7503763  ...  0.60370904  0.6619676
  0.3775569 ]
8 ho [-0.3019129  -0.46668643  1.3552309  ... -0.10347248 -0.14968055
  0.30068105]
9 approach [-0.42976162 -0.4311239  -0.09484182 ... -0.47402418 -0.49557117
 -0.13514036]
10 to [-0.31524038 -0.20892137 -0.03023395 ...  0.18992932  0.42011207
  0.23474798]
11 development [ 0.35490698  0.23821795  0.25481728 ... -0.634763