In [1]:
from google.colab import drive
drive.mount('/content/drive')
! pip install transformers

Mounted at /content/drive
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 15.9MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 51.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 48.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=

In [2]:
THRESHOLD = 1
TRAIN_DF_PATH = '/content/drive/MyDrive/train-2.csv'
VAL_DF_PATH = '/content/drive/MyDrive/val-2.csv'
TEST_DF_PATH = '/content/drive/MyDrive/evaluation.csv'
import pandas as pd
df = pd.read_csv(VAL_DF_PATH)
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,timestamp,retweet_count,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags,text
0,619849,619849,1588587614785,0,False,2,2,14,,,,A good ethos to adopt. Interesting to see how/...
1,515354,515354,1588289895745,6,True,25760,31964,4336,,twitter.com/i/web/status/1…,,It started with the liver sausage. Now she's p...
2,12612,12612,1588386495582,0,False,1693,17,114,,,,Praying for Trump and his followers to get sic...
3,262402,262402,1588353120964,0,False,56148,2126,966,,,,May Allah help our people
4,448560,448560,1588559750726,0,False,34793,532,312,,,,"If only they were 100% due to Corona, you mig..."


In [3]:
import numpy as np
from copy import deepcopy
from sklearn.model_selection import train_test_split
import pandas as pd

from transformers import BertTokenizer

from keras.preprocessing.sequence import pad_sequences

from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch


MAX_LEN = 150


# https://xiangyutang2.github.io/tweet-classification/
def clean_texts(df):
    # remove URL
    df['text_proc'] = df['text'].str.replace(r'http(\S)+', r'')
    df['text_proc'].fillna('', inplace=True)
    df['text_proc'] = df['text_proc'].str.replace(r'http ...', r'')
    df['text_proc'] = df['text_proc'].str.replace(r'http', r'')

    # remove RT, @
    df['text_proc'] = df['text_proc'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+', r'')
    df['text_proc'] = df['text_proc'].str.replace(r'@[\S]+', r'')

    # remove non-ascii words and characters
    df['text_proc'] = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df['text_proc'].values]
    df['text_proc'] = df['text_proc'].str.replace(r'_[\S]?', r'')

    # remove &, < and >
    df['text_proc'] = df['text_proc'].str.replace(r'&amp;?', r'and')
    df['text_proc'] = df['text_proc'].str.replace(r'&lt;', r'<')
    df['text_proc'] = df['text_proc'].str.replace(r'&gt;', r'>')

    # remove extra space
    df['text_proc'] = df['text_proc'].str.replace(r'[ ]{2, }', r' ')

    # insert space between punctuation marks
    df['text_proc'] = df['text_proc'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
    df['text_proc'] = df['text_proc'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

    # lower case and strip white spaces at both ends
    df['text_proc'] = df['text_proc'].str.lower()
    df['text_proc'] = df['text_proc'].str.strip()
    return df


def get_train_dataloader(path, threshold, batch_size=4):
    df = pd.read_csv(path)
    df = clean_texts(df)
    texts, labels = df['text_proc'], df['retweet_count']

    inputs, targets = _get_data_for_two_class_training(texts, labels, threshold=threshold, shuffle=True)

    train_data = TensorDataset(torch.tensor(inputs), torch.tensor(targets))
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

    return train_dataloader


def get_val_dataloader(path, threshold, batch_size=4):
    df = pd.read_csv(path)
    df = clean_texts(df)
    texts, labels = df['text_proc'], df['retweet_count']

    inputs, targets = _get_data_for_two_class_training(texts, labels, threshold=threshold, shuffle=False)

    val_data = TensorDataset(torch.tensor(inputs), torch.tensor(targets))
    val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

    return val_dataloader


def get_test_dataloader(path, threshold, batch_size=4):
    df = pd.read_csv(path)
    df = clean_texts(df)
    texts, labels = df['text_proc'], [0] * len(df['text_proc'])

    inputs, targets = _get_data_for_two_class_training(texts, labels, threshold=threshold, shuffle=False)

    val_data = TensorDataset(torch.tensor(inputs))
    val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

    return val_dataloader


def _shuffle_arrays_consistently(arr1, arr2):
    assert len(arr1) == len(arr2)
    randomize = np.arange(len(arr1))
    np.random.shuffle(randomize)
    return arr1[randomize], arr2[randomize]


def _align_classes(texts, targets):
    texts, targets = _shuffle_arrays_consistently(texts, targets)

    texts_positive = texts[targets == 1]
    texts_negative = texts[targets == 0]
    targets_positive = targets[targets == 1]
    targets_negative = targets[targets == 0]

    num_examples = min(len(texts_positive), len(texts_negative))
    texts_positive = texts_positive[:num_examples]
    texts_negative = texts_negative[:num_examples]
    targets_positive = targets_positive[:num_examples]
    targets_negative = targets_negative[:num_examples]

    aligned_texts = np.concatenate([texts_positive, texts_negative])
    aligned_targets = np.concatenate([targets_positive, targets_negative])

    aligned_texts, aligned_targets = _shuffle_arrays_consistently(aligned_texts, aligned_targets)

    aligned_texts_positive = aligned_texts[aligned_targets == 1]
    aligned_texts_negative = aligned_texts[aligned_targets == 0]
    aligned_targets_positive = aligned_targets[aligned_targets == 1]
    aligned_targets_negative = aligned_targets[aligned_targets == 0]

    assert len(aligned_texts_positive) == len(aligned_texts_negative)
    assert len(aligned_targets_positive ) == len(aligned_targets_negative)
    assert len(aligned_texts_positive) == len(aligned_targets_negative)

    return aligned_texts, aligned_targets


def _encode_texts(texts):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded = []
    for txt in texts:
        tokenized = tokenizer.tokenize(txt)
        if len(tokenized) > 0:
            encoded.append(tokenizer.encode(tokenized))
        else:
            encoded.append(tokenizer.encode(['']))
    return encoded


def _pad_texts(sentences, value):
    return pad_sequences(sentences, maxlen=MAX_LEN, dtype="long",
                         value=value,truncating="post", padding="post")


def _get_data_for_two_class_training(texts_in, targets_in, threshold, shuffle):
    targets = np.array([1 if a >= threshold else 0 for a in targets_in])
    texts = np.array(deepcopy(texts_in))
    if shuffle:
        texts, targets = _align_classes(texts, targets)
    tokenized_texts = _encode_texts(texts)
    padded_texts = _pad_texts(tokenized_texts, 0)

    return padded_texts, targets


In [4]:
val_dataloader = get_val_dataloader(VAL_DF_PATH, THRESHOLD, 4)
test_dataloader = get_test_dataloader(TEST_DF_PATH, THRESHOLD, 4)
train_dataloader = get_train_dataloader(TRAIN_DF_PATH, THRESHOLD, 4)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




  if self.run_code(code, result):


In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup


def get_bert_for_binary_classification(train_dataloader):
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
    )
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5,  eps=1e-8)
    epochs = 4
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(train_dataloader) * epochs)
    return model, optimizer, scheduler


In [7]:
model, optimizer, scheduler = get_bert_for_binary_classification(train_dataloader)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [38]:
import json
import random
import torch.nn.functional as F 


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def write_answers(model, test_dataloader, answers_file, id, stage, df_, threshold):
    df = df_.copy()
    model.eval()
    big_answers = []
    probs_for_thresholds = []
    tt = 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids = batch[0]
        with torch.no_grad():
          outputs = model(b_input_ids, token_type_ids=None)
          logits = outputs[0]
          probs = F.softmax(logits)
          probs = probs.detach().cpu().numpy()
          # assert probs.shape == (4, 2)
          for p in probs:
              probs_for_thresholds.append(p[1])
          answers = np.argmax(probs, axis=1).flatten().tolist()
          big_answers.extend(answers)
          # print(probs_for_thresholds)
          # print(big_answers)
          # break
          tt += 1
          if tt % 1000 == 0:
              print('%d/%d' % (tt, len(test_dataloader)))
    print(big_answers)
    print(probs_for_thresholds)
    df['bert_threshold_%d_answer' % threshold] = big_answers
    df['bert_threshold_%d_prob' % threshold] = probs_for_thresholds
    return df


def validate(model, validation_dataloader):
    model.eval()
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_accuracy = 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
            # if nb_eval_steps > 5:
            #   break
    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))


def train(train_dataloader, validation_dataloader, test_dataloader, model, optimizer, scheduler,
          epochs, epoch0, save_file_template, answers_file, seed=1514, log_period=100, checkpoint_file=None):

    if checkpoint_file is not None:
        checkpoint = torch.load(checkpoint_file)
        epoch_t = checkpoint['epoch']
        assert epoch_t + 1 == epoch0
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        print('Loaded from checkpoint')

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    loss_values = []
    if epoch0 is None:
        epoch0 = 0
    for epoch_i in range(epoch0, epochs):
        t_train_accuracy = 0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            labels=b_labels)

            loss = outputs[0]
            total_loss += loss.item()

            logits = outputs[1]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            t_tmp_accuracy = flat_accuracy(logits, label_ids)
            t_train_accuracy += t_tmp_accuracy

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            if step % log_period == 0 and not step == 0:
                print('  Batch %d  of  %d. Loss: %f Mean Accuracy: %f' % (
                    step, len(train_dataloader), total_loss / step, t_train_accuracy / step))
            if step == 23000:
              break

        avg_train_loss = total_loss / len(train_dataloader)

        loss_values.append(avg_train_loss)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("")
        print("Running Validation...")
        model.eval()
        
        # write_answers(model, test_dataloader, answers_file, epoch_i * 10 ** 7 + step, stage='test')
        # write_answers(model, test_dataloader, answers_file, epoch_i * 10 ** 7 + step, stage='test')
        # write_answers(model, test_dataloader, answers_file, epoch_i * 10 ** 7 + step, stage='test')
        checkpoint = {
            'epoch': epoch_i,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()}
        torch.save(checkpoint, save_file_template % epoch_i)

        validate(model, validation_dataloader)

        # print('writing for test')
        # write_answers(model, test_dataloader, answers_file, epoch_i, 'test')
        # print('writing for train')
        # train_val_dataloader = get_val_dataloader(TRAIN_DF_PATH, 1, 4)
        # write_answers(model, train_dataloader, answers_file, epoch_i, 'train')
        # print('writing for val')
        # write_answers(model, validation_dataloader, answers_file, epoch_i, 'val')


    print("")
    print("Training complete!")


In [None]:
# ANSWERS_FILE = '/content/drive/MyDrive/answers_%d_'
# MODEL_FILE = '/content/drive/MyDrive/model_%d'

# train(train_dataloader, val_dataloader, test_dataloader, model, optimizer, scheduler, 4, None,
#       answers_file=ANSWERS_FILE, save_file_template=MODEL_FILE, seed=1514,
#       log_period=500, checkpoint_file=None)


Training...
  Batch 500  of  97180. Loss: 0.505849 Mean Accuracy: 0.813500
  Batch 1000  of  97180. Loss: 0.465343 Mean Accuracy: 0.841250
  Batch 1500  of  97180. Loss: 0.461639 Mean Accuracy: 0.850500
  Batch 2000  of  97180. Loss: 0.449625 Mean Accuracy: 0.858750
  Batch 2500  of  97180. Loss: 0.440857 Mean Accuracy: 0.864700
  Batch 3000  of  97180. Loss: 0.434951 Mean Accuracy: 0.867667
  Batch 3500  of  97180. Loss: 0.423521 Mean Accuracy: 0.872143
  Batch 4000  of  97180. Loss: 0.416709 Mean Accuracy: 0.875875
  Batch 4500  of  97180. Loss: 0.415359 Mean Accuracy: 0.878056
  Batch 5000  of  97180. Loss: 0.417887 Mean Accuracy: 0.878300
  Batch 5500  of  97180. Loss: 0.416320 Mean Accuracy: 0.879955
  Batch 6000  of  97180. Loss: 0.413059 Mean Accuracy: 0.882250
  Batch 6500  of  97180. Loss: 0.415281 Mean Accuracy: 0.882769
  Batch 7000  of  97180. Loss: 0.411519 Mean Accuracy: 0.884643
  Batch 7500  of  97180. Loss: 0.411318 Mean Accuracy: 0.885567
  Batch 8000  of  97180. Los



writing for test


In [22]:
def load_from_checkpoint(filename, model, optimizer, scheduler ):
    checkpoint = torch.load(filename)
    epoch_t = checkpoint['epoch']
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    print('Loaded from checkpoint %s' % filename)
    return model, optimizer, scheduler

def get_new_df_with_preds(loader, df, model, stage):
    new_df = write_answers(model, loader, 'trash', 0, stage, df, THRESHOLD)
    return new_df

In [10]:
MODEL_FILE = '/content/drive/MyDrive/model_0'
model, optimizer, scheduler = load_from_checkpoint(MODEL_FILE, model, optimizer, scheduler)

Loaded from checkpoint /content/drive/MyDrive/model_0




In [11]:
# val_dataloader = get_val_dataloader(VAL_DF_PATH, THRESHOLD, 4)

In [23]:
validate(model, val_dataloader)

  Accuracy: 0.92


In [None]:
val_df = pd.read_csv(VAL_DF_PATH)
new_val_df = get_new_df_with_preds(val_dataloader, val_df, model, 'val')

new_val_df.to_csv('/content/drive/MyDrive/val_df_thr-%d.csv' % THRESHOLD)
new_val_df[['bert_threshold_%d_answer' % THRESHOLD, 'bert_threshold_%d_prob' % THRESHOLD]].sample(50)

In [None]:
print(new_val_df[(new_val_df['retweet_count'] >= THRESHOLD).astype(int) != new_val_df[('bert_threshold_%d_answer' % THRESHOLD)]].shape[0] / new_val_df.shape[0])
new_val_df[
           ['retweet_count', 'bert_threshold_%d_answer' % THRESHOLD, 'bert_threshold_%d_prob' % THRESHOLD]
           ][(new_val_df['retweet_count'] >= THRESHOLD).astype(int) != new_val_df[('bert_threshold_%d_answer' % THRESHOLD)]].sample(50)

In [34]:
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(new_val_df['bert_threshold_%d_answer' % THRESHOLD], new_val_df['retweet_count']))
print(mean_absolute_error([0] * new_val_df.shape[0], new_val_df['retweet_count']))

146.94290906906184
147.2266364264472


In [35]:
train_df = pd.read_csv(TRAIN_DF_PATH)
train_dataloader = get_val_dataloader(TRAIN_DF_PATH, THRESHOLD, 4)
new_train_df = get_new_df_with_preds(train_dataloader, train_df, model, 'train')

new_train_df.to_csv('/content/drive/MyDrive/train_df_thr-%d.csv' % THRESHOLD)
new_train_df[['bert_threshold_%d_answer' % THRESHOLD, 'bert_threshold_%d_prob' % THRESHOLD]].sample(50)

  interactivity=interactivity, compiler=compiler, result=result)
  if self.run_code(code, result):


1000/133372
2000/133372
3000/133372
4000/133372
5000/133372
6000/133372
7000/133372
8000/133372
9000/133372
10000/133372
11000/133372
12000/133372
13000/133372
14000/133372
15000/133372
16000/133372
17000/133372
18000/133372
19000/133372
20000/133372
21000/133372
22000/133372
23000/133372
24000/133372
25000/133372
26000/133372
27000/133372
28000/133372
29000/133372
30000/133372
31000/133372
32000/133372
33000/133372
34000/133372
35000/133372
36000/133372
37000/133372
38000/133372
39000/133372
40000/133372
41000/133372
42000/133372
43000/133372
44000/133372
45000/133372
46000/133372
47000/133372
48000/133372
49000/133372
50000/133372
51000/133372
52000/133372
53000/133372
54000/133372
55000/133372
56000/133372
57000/133372
58000/133372
59000/133372
60000/133372
61000/133372
62000/133372
63000/133372
64000/133372
65000/133372
66000/133372
67000/133372
68000/133372
69000/133372
70000/133372
71000/133372
72000/133372
73000/133372
74000/133372
75000/133372
76000/133372
77000/133372
78000/13

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Unnamed: 0,bert_threshold_1_answer,bert_threshold_1_prob
384542,0,0.011074
444052,0,0.011299
386519,0,0.011322
372912,0,0.010967
304823,0,0.013384
177291,1,0.996517
401460,1,0.996563
102622,0,0.01095
453481,0,0.023868
250849,0,0.023868


In [36]:
print(new_train_df[(new_train_df['retweet_count'] >= THRESHOLD).astype(int) != new_train_df[('bert_threshold_%d_answer' % THRESHOLD)]].shape[0] / new_train_df.shape[0])
new_train_df[
           ['retweet_count', 'bert_threshold_%d_answer' % THRESHOLD, 'bert_threshold_%d_prob' % THRESHOLD]
           ][(new_train_df['retweet_count'] >= THRESHOLD).astype(int) != new_train_df[('bert_threshold_%d_answer' % THRESHOLD)]].sample(50)

0.08075158204120805


Unnamed: 0,retweet_count,bert_threshold_1_answer,bert_threshold_1_prob
147001,1.0,0,0.010938
341955,1.0,0,0.011088
187105,8.0,0,0.011236
515775,5.0,0,0.023894
362002,1.0,0,0.0239
3710,631.0,0,0.023905
468975,0.0,1,0.996488
101603,1.0,0,0.011036
464276,6.0,0,0.023897
196858,7.0,0,0.010999


In [39]:
test_df = pd.read_csv(TEST_DF_PATH)
test_dataloader = get_test_dataloader(TEST_DF_PATH, THRESHOLD, 4)
new_test_df = get_new_df_with_preds(test_dataloader, test_df, model, 'test')

new_test_df.to_csv('/content/drive/MyDrive/test_df_thr-%d.csv' % THRESHOLD)
new_test_df[['bert_threshold_%d_answer' % THRESHOLD, 'bert_threshold_%d_prob' % THRESHOLD]].sample(50)



1000/71334
2000/71334
3000/71334
4000/71334
5000/71334
6000/71334
7000/71334
8000/71334
9000/71334
10000/71334
11000/71334
12000/71334
13000/71334
14000/71334
15000/71334
16000/71334
17000/71334
18000/71334
19000/71334
20000/71334
21000/71334
22000/71334
23000/71334
24000/71334
25000/71334
26000/71334
27000/71334
28000/71334
29000/71334
30000/71334
31000/71334
32000/71334
33000/71334
34000/71334
35000/71334
36000/71334
37000/71334
38000/71334
39000/71334
40000/71334
41000/71334
42000/71334
43000/71334
44000/71334
45000/71334
46000/71334
47000/71334
48000/71334
49000/71334
50000/71334
51000/71334
52000/71334
53000/71334
54000/71334
55000/71334
56000/71334
57000/71334
58000/71334
59000/71334
60000/71334
61000/71334
62000/71334
63000/71334
64000/71334
65000/71334
66000/71334
67000/71334
68000/71334
69000/71334
70000/71334
71000/71334


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Unnamed: 0,bert_threshold_1_answer,bert_threshold_1_prob
153409,0,0.011087
90679,1,0.996446
32558,1,0.996574
96736,0,0.02387
148998,0,0.023883
60514,0,0.011001
114792,0,0.010984
195565,0,0.023878
200996,1,0.99657
172170,0,0.018205


In [None]:
val_df = pd.read_csv(VAL_DF_PATH)
new_val_df = get_new_df_with_preds(val_dataloader, val_df, model, 'val')

new_val_df.to_csv('/content/drive/MyDrive/val_df_thr-%d.csv' % THRESHOLD)
new_val_df[['bert_threshold_%d_answer' % THRESHOLD, 'bert_threshold_%d_prob' % THRESHOLD]].sample(50)

In [20]:
new_val_df

In [None]:
val_df = pd.read_csv(VAL_DF_PATH)
new_val_df = write_answers(model, val_dataloader, ANSWERS_FILE, 0, 'val', val_df, THRESHOLD)
new_val_df.to_csv('/content/drive/MyDrive/val_with_probs_threshold_1.csv')

[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 

In [None]:
val_df = pd.read_csv(VAL_DF_PATH)

with open((ANSWERS_FILE % 0) + 'val') as f:
  ans = json.load(f)
ans[:10]

  interactivity=interactivity, compiler=compiler, result=result)


[1, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [None]:
val_df['betr_threshold_1'] = ans

In [None]:
VAL_DF_PATH_NEW = '/content/drive/MyDrive/val_with_bert.csv'

In [None]:
val_df.to_csv(VAL_DF_PATH_NEW)

In [None]:
val_df_t = val_df[['betr_threshold_1', 'retweet_count']]
val_df_t.head(50)

In [None]:
test_dataloader = get_test_dataloader(TEST_DF_PATH, 1, 4)

In [None]:
ANSWERS_FILE = '/content/drive/MyDrive/answers_%d_'
write_answers(model, test_dataloader, ANSWERS_FILE, 0, 'test')

In [None]:
test_df = pd.read_csv(TEST_DF_PATH)


In [None]:
TEST_DF_PATH_NEW = '/content/drive/MyDrive/test_with_bert.csv'

In [None]:
with open((ANSWERS_FILE % 0) + 'test') as f:
  ans = json.load(f)
ans[:10]

[0, 0, 0, 1, 0, 0, 0, 1, 1, 0]

In [None]:
test_df['betr_threshold_1'] = ans

In [None]:
test_df.to_csv(TEST_DF_PATH_NEW)

In [None]:
import csv

with open("gbr_predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(ans):
        writer.writerow([str(test_df['id'].iloc[index]) , str(int(prediction))])