In [1]:
from google.colab import drive
drive.mount('/content/drive')
! pip install transformers

Mounted at /content/drive
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 13.7MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 51.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=

In [87]:
TRAIN_DF_PATH = '/content/drive/MyDrive/train-2.csv'
VAL_DF_PATH = '/content/drive/MyDrive/val-2.csv'
TEST_DF_PATH = '/content/drive/MyDrive/evaluation.csv'

## Analytics

In [89]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv(VAL_DF_PATH)
df[df.retweet_count.isna()].shape
df['retweet_count'].fillna(0, inplace=True)
df.shape

(133156, 12)

In [90]:
def get_thresholds_and_medians():
    thresholds = [1, 10, 50, 100, 300, 1000]
    medians = [0, 2, 20, 69, 163, 507, 2832]
    return thresholds, medians

In [91]:
THRESHOLDS, MEDIANS = get_thresholds_and_medians()
print(THRESHOLDS)
print(MEDIANS)
INDEX2MEDIAN = MEDIANS
MEDIAN2INDEX = {m: i for i, m in enumerate(INDEX2MEDIAN)}
print(MEDIAN2INDEX)

[1, 10, 50, 100, 300, 1000]
[0, 2, 20, 69, 163, 507, 2832]
{0: 0, 2: 1, 20: 2, 69: 3, 163: 4, 507: 5, 2832: 6}


In [92]:
import numpy as np
from copy import deepcopy
from sklearn.model_selection import train_test_split
import pandas as pd

from transformers import BertTokenizer

from keras.preprocessing.sequence import pad_sequences

from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch


MAX_LEN = 150


# https://xiangyutang2.github.io/tweet-classification/
def clean_texts(df):
    # remove URL
    df['text_proc'] = df['text'].str.replace(r'http(\S)+', r'')
    df['text_proc'].fillna('', inplace=True)
    df['text_proc'] = df['text_proc'].str.replace(r'http ...', r'')
    df['text_proc'] = df['text_proc'].str.replace(r'http', r'')

    # remove RT, @
    df['text_proc'] = df['text_proc'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+', r'')
    df['text_proc'] = df['text_proc'].str.replace(r'@[\S]+', r'')

    # remove non-ascii words and characters
    df['text_proc'] = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df['text_proc'].values]
    df['text_proc'] = df['text_proc'].str.replace(r'_[\S]?', r'')

    # remove &, < and >
    df['text_proc'] = df['text_proc'].str.replace(r'&amp;?', r'and')
    df['text_proc'] = df['text_proc'].str.replace(r'&lt;', r'<')
    df['text_proc'] = df['text_proc'].str.replace(r'&gt;', r'>')

    # remove extra space
    df['text_proc'] = df['text_proc'].str.replace(r'[ ]{2, }', r' ')

    # insert space between punctuation marks
    df['text_proc'] = df['text_proc'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
    df['text_proc'] = df['text_proc'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

    # lower case and strip white spaces at both ends
    df['text_proc'] = df['text_proc'].str.lower()
    df['text_proc'] = df['text_proc'].str.strip()
    return df


def get_train_dataloader(path, batch_size=4):
    df = pd.read_csv(path)
    df['retweet_count'].fillna(0, inplace=True)
    print('cleaning')
    df = clean_texts(df)
    texts, labels = df['text_proc'], df['retweet_count']

    print('get_data_for_two_class_training')
    inputs, targets = _get_data_for_two_class_training(texts, labels, shuffle=True)

    train_data = TensorDataset(torch.tensor(inputs), torch.tensor(targets))
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

    return train_dataloader


def get_val_dataloader(path, batch_size=4):
    df = pd.read_csv(path)
    df['retweet_count'].fillna(0, inplace=True)
    print('cleaning')
    df = clean_texts(df)
    texts, labels = df['text_proc'], df['retweet_count']

    print('get_data_for_two_class_training')
    inputs, targets = _get_data_for_two_class_training(texts, labels, shuffle=False)

    val_data = TensorDataset(torch.tensor(inputs), torch.tensor(targets))
    val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

    return val_dataloader


def get_test_dataloader(path, batch_size=4):
    df = pd.read_csv(path)
    print('cleaning')
    df = clean_texts(df)
    texts, labels = df['text_proc'], np.array([0] * len(df['text_proc']))

    print('get_data_for_two_class_training')
    inputs, targets = _get_data_for_two_class_training(texts, labels, shuffle=False)

    val_data = TensorDataset(torch.tensor(inputs))
    val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

    return val_dataloader


def _shuffle_arrays_consistently(arr1, arr2):
    assert len(arr1) == len(arr2)
    randomize = np.arange(len(arr1))
    np.random.shuffle(randomize)
    return arr1[randomize], arr2[randomize]


def _align_classes(texts, targets):
    texts, targets = _shuffle_arrays_consistently(texts, targets)

    classes_texts = []
    classes_targets = []
    left = 0
    for median in MEDIANS:
        classes_texts.append(texts[targets == MEDIAN2INDEX[median]])
        classes_targets.append(targets[targets == MEDIAN2INDEX[median]])

    test_len1, test_len2 = 0, 0
    for i in classes_texts:
      test_len1 += len(i)
    for i in classes_targets:
      test_len2 += len(i)
    assert test_len1 == len(texts)
    assert test_len2 == len(targets)

    num_examples = min([len(a) for a in classes_texts])

    for i in range(len(classes_targets)):
        classes_texts[i] = classes_texts[i][:num_examples]
        classes_targets[i] = classes_targets[i][:num_examples]

    aligned_texts = np.concatenate(classes_texts)
    aligned_targets = np.concatenate(classes_targets)

    aligned_texts, aligned_targets = _shuffle_arrays_consistently(aligned_texts, aligned_targets)

    return aligned_texts, aligned_targets


def _encode_texts(texts):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded = []
    for txt in texts:
        tokenized = tokenizer.tokenize(txt)
        if len(tokenized) > 0:
            encoded.append(tokenizer.encode(tokenized))
        else:
            encoded.append(tokenizer.encode(['']))
    return encoded


def _pad_texts(sentences, value):
    return pad_sequences(sentences, maxlen=MAX_LEN, dtype="long",
                         value=value,truncating="post", padding="post")


def _get_data_for_two_class_training(texts_in, targets_in, shuffle):
    targets = np.array([0] * len(targets_in))
    left = 0
    for threshold, median in zip(THRESHOLDS, MEDIANS):
        targets[(targets_in >= left) & (targets_in < threshold)] = MEDIAN2INDEX[median]
        left = threshold
    targets[targets_in >= left] = MEDIAN2INDEX[MEDIANS[-1]]

    texts = np.array(deepcopy(texts_in))
    if shuffle:
        texts, targets = _align_classes(texts, targets)
    tokenized_texts = _encode_texts(texts)
    padded_texts = _pad_texts(tokenized_texts, 0)

    return padded_texts, targets


In [93]:
val_dataloader = get_val_dataloader(VAL_DF_PATH, 4)
test_dataloader = get_test_dataloader(TEST_DF_PATH, 4)

cleaning
get_data_for_two_class_training
cleaning
get_data_for_two_class_training


In [33]:
jj = 0
for i in val_dataloader:
  print(i[1], i[0][0][:10])
  jj += 1
  if jj == 10:
    break

tensor([1, 0, 0, 0]) tensor([  101, 15117,  2651,  1024,  7020,  2072, 15991,  2040,  2106,  2025])
tensor([1, 0, 1, 0]) tensor([  101,  2129,  2003, 21887, 23350,  4254,  2075,  1996,  2088,  1005])
tensor([1, 0, 0, 0]) tensor([ 101, 3477, 3086, 1010, 2637,  102,    0,    0,    0,    0])
tensor([5, 0, 1, 0]) tensor([  101,  1001,  3422,  1064, 11757,  3048,  3496,  2004,  3727,  2902])
tensor([4, 0, 0, 0]) tensor([  101,  1996,  2466,  2369, 11268,  1012, 10294, 11764,  1005,  1055])
tensor([0, 0, 2, 1]) tensor([ 101, 2029, 2003, 1012, 1012, 1012, 1012, 2382, 2454, 4841])
tensor([0, 0, 0, 0]) tensor([ 101, 2145, 1996, 2028,  102,    0,    0,    0,    0,    0])
tensor([0, 0, 1, 0]) tensor([ 101, 3374, 2005, 2115, 3279, 1006, 1998, 2035, 1996, 6409])
tensor([2, 0, 5, 1]) tensor([  101,  2397,  2595, 11875,  2024,  2025, 25403,  1998,  2071,  3659])
tensor([1, 0, 0, 1]) tensor([  101,  2057,  1006,  1996,  7327,  1007,  2342,  1015, 23458,  2058])


In [34]:
print(len(val_dataloader))

45094


In [37]:
jj = 0
for i in test_dataloader:
  print(i[0][0][:10])
  jj += 1
  if jj == 10:
    break

tensor([  101, 21887, 23350,  1010,  2053,  3500,  3338,  1010,  3782,  2829])
tensor([ 101, 2502, 2567, 2003, 2025, 2200, 2204, 2012, 3666, 2017])
tensor([  101, 18243, 24014,  2050,  6676,  1010,  1037,  2304,  7155,  1998])
tensor([  101,  2054,  2050, 20160, 17369,  6358,  2003,  1012,   102,     0])
tensor([  101, 15936,  2000,  2022,  2112,  1997,  2107,  1037, 10392,  2451])
tensor([  101,  2153,  2009,  1005,  1055,  2877,  1996,  3715,  2005, 15855])
tensor([ 101, 2024, 2057, 2183, 2000, 2131, 2921, 1999, 2065, 2057])
tensor([ 101, 6203, 2643, 3531, 2191, 2023, 2644, 1012, 2023, 2003])
tensor([ 101, 2036, 1999, 2023, 8554, 1010, 2069, 4720, 1003, 2056])
tensor([ 101, 7143, 2111, 2097, 2079, 7143, 2477, 1012, 2023, 2003])


In [35]:
print(len(test_dataloader))

71334


In [94]:
train_dataloader = get_train_dataloader(TRAIN_DF_PATH, 4)

  if self.run_code(code, result):


cleaning
get_data_for_two_class_training


In [52]:
jj = 0
for i in train_dataloader:
  print(i[1], i[0][0][:20])
  jj += 1
  if jj == 10:
    break

tensor([1, 0, 3, 5]) tensor([  101, 12436,  1012,  3099,  8908,  6923,  3844,  7698,  2083,  2089,
         2403,  1010,  2758,  2034,  4403,  1997,  2128, 26915,  2075,  2071])
tensor([4, 0, 3, 5]) tensor([  101,  2408,  1996,  2088,  1010,  2308,  2031,  2042,  2625,  3497,
         2000,  2468, 11325,  2135,  5665,  2013,  1996, 21887, 23350,  1998])
tensor([1, 4, 0, 3]) tensor([  101, 23564,  3089,  2546,  1024,  2149,  2749,  2031,  2053,  2449,
         1021,  1010,  2199,  2661,  2185,  2013,  2188, 19804,  2603,  1011])
tensor([5, 6, 6, 1]) tensor([ 101, 1045, 2293, 8038, 3363, 1012, 2619, 3791, 2000, 2963, 2008, 2651,
        1012, 2123, 2102, 2022, 4452, 1012, 2017, 2024])
tensor([5, 1, 6, 1]) tensor([101, 100, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0])
tensor([0, 4, 1, 0]) tensor([  101,  8398,  1998, 10643,  2024,  4129,  4841,  2000,  8568,  2591,
         3292,  1012,  3280,  2005,  1037,  3105,  1029,   102,     0, 

In [53]:
print(len(train_dataloader))

14466


In [95]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [96]:
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup


def get_bert_for_binary_classification(train_dataloader):
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(MEDIANS),
    )
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5,  eps=1e-8)
    epochs = 4
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(train_dataloader) * epochs)
    return model, optimizer, scheduler


In [101]:
model, optimizer, scheduler = get_bert_for_binary_classification(train_dataloader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [102]:
import json
import random
from torch.nn import BCEWithLogitsLoss


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def write_answers(model, test_dataloader, answers_file, id, stage):
    model.eval()
    big_answers = []
    tt = 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids = batch[0]
        with torch.no_grad():
          outputs = model(b_input_ids, token_type_ids=None)
          logits = outputs[0]
          logits = logits.detach().cpu().numpy()
          answers = np.argmax(logits, axis=1).flatten().tolist()
          answers = [INDEX2MEDIAN[a] for a in answers]
          big_answers.extend(answers)
          tt += 1
          if tt % 1000 == 0:
              print('%d/%d' % (tt, len(test_dataloader)))
    print(big_answers)
    with open((answers_file + stage) % id, 'w') as outfile:
        json.dump(big_answers, outfile)


def validate(model, validation_dataloader):
    model.eval()
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_accuracy = 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
            # if nb_eval_steps > 5:
            #   break
    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))


def train(train_dataloader, validation_dataloader, test_dataloader, model, optimizer, scheduler,
          epochs, epoch0, save_file_template, answers_file, seed=1514, log_period=100, checkpoint_file=None):

    if checkpoint_file is not None:
        checkpoint = torch.load(checkpoint_file)
        epoch_t = checkpoint['epoch']
        assert epoch_t + 1 == epoch0
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        print('Loaded from checkpoint')

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    loss_values = []
    if epoch0 is None:
        epoch0 = 0
    for epoch_i in range(epoch0, epochs):
        t_train_accuracy = 0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)
            # print(b_labels)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None)

            loss_func = BCEWithLogitsLoss()

            # loss = outputs[0]
            # total_loss += loss.item()

            logits = outputs[0]
            b_labels_logits = torch.zeros_like(logits)
            for i in range(len(b_labels)):
                if b_labels[int(i)] > 0 and b_labels[int(i)] < len(MEDIANS) - 1:
                    b_labels_logits[i][b_labels[int(i)]] = 0.8
                    b_labels_logits[i][b_labels[int(i)] + 1] = 0.1
                    b_labels_logits[i][b_labels[int(i)] - 1] = 0.1
                elif b_labels[int(i)] > 0:
                    b_labels_logits[i][b_labels[int(i)]] = 0.9
                    b_labels_logits[i][b_labels[int(i)] - 1] = 0.1
                else:
                    b_labels_logits[i][b_labels[int(i)]] = 0.9
                    b_labels_logits[i][b_labels[int(i)] + 1] = 0.1
                # b_labels_logits[i][b_labels[int(i)]] = 1.
            # print(logits)
            # print(b_labels_logits)
            loss = loss_func(logits,b_labels_logits.type_as(logits))
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()


            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            t_tmp_accuracy = flat_accuracy(logits, label_ids)
            t_train_accuracy += t_tmp_accuracy

            if step % log_period == 0 and not step == 0:
                print('  Batch %d  of  %d. Loss: %f Mean Accuracy: %f' % (
                    step, len(train_dataloader), total_loss / step, t_train_accuracy / step))
            if step == 23000:
              break

        avg_train_loss = total_loss / len(train_dataloader)

        loss_values.append(avg_train_loss)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("")
        print("Running Validation...")
        model.eval()
        
        # write_answers(model, test_dataloader, answers_file, epoch_i * 10 ** 7 + step, stage='test')
        # write_answers(model, test_dataloader, answers_file, epoch_i * 10 ** 7 + step, stage='test')
        # write_answers(model, test_dataloader, answers_file, epoch_i * 10 ** 7 + step, stage='test')
        checkpoint = {
            'epoch': epoch_i,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()}
        torch.save(checkpoint, save_file_template % epoch_i)
        validate(model, validation_dataloader)

        # print('writing for test')
        # write_answers(model, test_dataloader, answers_file, epoch_i, 'test')
        # print('writing for train')
        # train_val_dataloader = get_val_dataloader(TRAIN_DF_PATH, 1, 4)
        # write_answers(model, train_dataloader, answers_file, epoch_i, 'train')
        # print('writing for val')
        # write_answers(model, validation_dataloader, answers_file, epoch_i, 'val')


    print("")
    print("Training complete!")


In [103]:
ANSWERS_FILE = '/content/drive/MyDrive/multiclass_answers_%d_'
MODEL_FILE = '/content/drive/MyDrive/multiclass_model_%d'

train(train_dataloader, val_dataloader, test_dataloader, model, optimizer, scheduler, 4, None,
      answers_file=ANSWERS_FILE, save_file_template=MODEL_FILE, seed=1514,
      log_period=500, checkpoint_file=None)


Training...
  Batch 500  of  14303. Loss: 0.413642 Mean Accuracy: 0.190500
  Batch 1000  of  14303. Loss: 0.402657 Mean Accuracy: 0.218500
  Batch 1500  of  14303. Loss: 0.398551 Mean Accuracy: 0.231333
  Batch 2000  of  14303. Loss: 0.395038 Mean Accuracy: 0.241250
  Batch 2500  of  14303. Loss: 0.393420 Mean Accuracy: 0.245800
  Batch 3000  of  14303. Loss: 0.391899 Mean Accuracy: 0.252833
  Batch 3500  of  14303. Loss: 0.390784 Mean Accuracy: 0.256286
  Batch 4000  of  14303. Loss: 0.389517 Mean Accuracy: 0.260375
  Batch 4500  of  14303. Loss: 0.388821 Mean Accuracy: 0.262556
  Batch 5000  of  14303. Loss: 0.387687 Mean Accuracy: 0.266250
  Batch 5500  of  14303. Loss: 0.386908 Mean Accuracy: 0.268591
  Batch 6000  of  14303. Loss: 0.386250 Mean Accuracy: 0.271208
  Batch 6500  of  14303. Loss: 0.385525 Mean Accuracy: 0.274308
  Batch 7000  of  14303. Loss: 0.384977 Mean Accuracy: 0.276714
  Batch 7500  of  14303. Loss: 0.384476 Mean Accuracy: 0.278467
  Batch 8000  of  14303. Los



  Accuracy: 0.60

Training...
  Batch 500  of  14303. Loss: 0.367794 Mean Accuracy: 0.335000
  Batch 1000  of  14303. Loss: 0.367920 Mean Accuracy: 0.336500
  Batch 1500  of  14303. Loss: 0.367494 Mean Accuracy: 0.339833
  Batch 2000  of  14303. Loss: 0.367138 Mean Accuracy: 0.341625
  Batch 2500  of  14303. Loss: 0.366796 Mean Accuracy: 0.343400
  Batch 3000  of  14303. Loss: 0.366925 Mean Accuracy: 0.342083
  Batch 3500  of  14303. Loss: 0.366606 Mean Accuracy: 0.341429
  Batch 4000  of  14303. Loss: 0.365695 Mean Accuracy: 0.342000
  Batch 4500  of  14303. Loss: 0.365392 Mean Accuracy: 0.343389
  Batch 5000  of  14303. Loss: 0.365057 Mean Accuracy: 0.345400
  Batch 5500  of  14303. Loss: 0.364979 Mean Accuracy: 0.344682
  Batch 6000  of  14303. Loss: 0.365194 Mean Accuracy: 0.344708
  Batch 6500  of  14303. Loss: 0.364957 Mean Accuracy: 0.345154
  Batch 7000  of  14303. Loss: 0.364801 Mean Accuracy: 0.344250
  Batch 7500  of  14303. Loss: 0.364513 Mean Accuracy: 0.345633
  Batch 800

In [None]:
MODEL_FILE = '/content/drive/MyDrive/model_0'
checkpoint = torch.load(MODEL_FILE)
epoch_t = checkpoint['epoch']
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])
print('Loaded from checkpoint')

Loaded from checkpoint




In [None]:

val_dataloader = get_val_dataloader(VAL_DF_PATH, 1, 4)

  if self.run_code(code, result):


In [None]:
validate(model, val_dataloader)

  Accuracy: 0.94


In [None]:
ANSWERS_FILE = '/content/drive/MyDrive/answers_%d_'

In [None]:
write_answers(model, val_dataloader, ANSWERS_FILE, 0, 'val')

[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 

In [None]:
val_df = pd.read_csv(VAL_DF_PATH)

with open((ANSWERS_FILE % 0) + 'val') as f:
  ans = json.load(f)
ans[:10]

  interactivity=interactivity, compiler=compiler, result=result)


[1, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [None]:
val_df['betr_threshold_1'] = ans

In [None]:
VAL_DF_PATH_NEW = '/content/drive/MyDrive/val_with_bert.csv'

In [None]:
val_df.to_csv(VAL_DF_PATH_NEW)

In [None]:
val_df_t = val_df[['betr_threshold_1', 'retweet_count']]
val_df_t.head(50)

Unnamed: 0,betr_threshold_1,retweet_count
0,1,2.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,1.0
5,0,0.0
6,1,4.0
7,0,0.0
8,0,2.0
9,0,0.0


In [None]:
test_dataloader = get_test_dataloader(TEST_DF_PATH, 1, 4)

In [None]:
ANSWERS_FILE = '/content/drive/MyDrive/answers_%d_'
write_answers(model, test_dataloader, ANSWERS_FILE, 0, 'test')

1000/71334
2000/71334
3000/71334
4000/71334
5000/71334
6000/71334
7000/71334
8000/71334
9000/71334
10000/71334
11000/71334
12000/71334
13000/71334
14000/71334
15000/71334
16000/71334
17000/71334
18000/71334
19000/71334
20000/71334
21000/71334
22000/71334
23000/71334
24000/71334
25000/71334
26000/71334
27000/71334
28000/71334
29000/71334
30000/71334
31000/71334
32000/71334
33000/71334
34000/71334
35000/71334
36000/71334
37000/71334
38000/71334
39000/71334
40000/71334
41000/71334
42000/71334
43000/71334
44000/71334
45000/71334
46000/71334
47000/71334
48000/71334
49000/71334
50000/71334
51000/71334
52000/71334
53000/71334
54000/71334
55000/71334
56000/71334
57000/71334
58000/71334
59000/71334
60000/71334
61000/71334
62000/71334
63000/71334
64000/71334
65000/71334
66000/71334
67000/71334
68000/71334
69000/71334
70000/71334
71000/71334
[0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 

In [None]:
test_df = pd.read_csv(TEST_DF_PATH)


In [None]:
TEST_DF_PATH_NEW = '/content/drive/MyDrive/test_with_bert.csv'

In [None]:
with open((ANSWERS_FILE % 0) + 'test') as f:
  ans = json.load(f)
ans[:10]

[0, 0, 0, 1, 0, 0, 0, 1, 1, 0]

In [None]:
test_df['betr_threshold_1'] = ans

In [None]:
test_df.to_csv(TEST_DF_PATH_NEW)

In [None]:
import csv

with open("gbr_predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(ans):
        writer.writerow([str(test_df['id'].iloc[index]) , str(int(prediction))])