<a href="https://colab.research.google.com/github/LevWilliams/PoliticalClassificationPOSTBERT/blob/master/DifferentTransformerPolyDat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Packages & Data

In [None]:
## importing packages
import gc
import os
import random
!pip install transformers
import transformers
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K

from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel

# Report the library versions in use so the saved cell output records them.
print(f"TensorFlow version: {tf.__version__}")
print(f"Transformers version: {transformers.__version__}")

# NOTE(review): with no category/module arguments this adds a blanket "ignore"
# filter for everything issued through the `warnings` module from here on —
# confirm that hiding deprecation warnings as well is intended.
warnings.filterwarnings("ignore")

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 12.8MB/s eta 0:00:01[K     |▉                               | 20kB 1.8MB/s eta 0:00:01[K     |█▎                              | 30kB 2.2MB/s eta 0:00:01[K     |█▊                              | 40kB 2.4MB/s eta 0:00:01[K     |██▏                             | 51kB 2.0MB/s eta 0:00:01[K     |██▋                             | 61kB 2.3MB/s eta 0:00:01[K     |███                             | 71kB 2.4MB/s eta 0:00:01[K     |███▍                            | 81kB 2.6MB/s eta 0:00:01[K     |███▉                            | 92kB 2.8MB/s eta 0:00:01[K     |████▎                           | 102kB 2.8MB/s eta 0:00:01[K     |████▊                           | 112kB 2.8MB/s eta 0:00:01[K     |█████▏                          | 122kB 2.8M

In [None]:
from google.colab import auth
auth.authenticate_user()
DataDir = "gs://lgc_models_exp/data/ExtractedTweets.csv"
!mkdir -p Data
cmd = "gsutil cp " + DataDir + " " + "Data/PLData.csv"
!$cmd

Copying gs://lgc_models_exp/data/ExtractedTweets.csv...
/ [1 files][ 13.0 MiB/ 13.0 MiB]                                                
Operation completed over 1 objects/13.0 MiB.                                     


Configuration

In [None]:
## defining configuration
class Configuration():
    """
    All configuration for running an experiment.

    Collects the training-data path, the party/label mapping, the tokenizer,
    the training hyperparameters and the ``tf.distribute`` strategy in one
    object. Constructing an instance loads the tokenizer and initializes the
    accelerator as a side effect.

    Parameters
    ----------
    model_name : str
        Passed to ``AutoTokenizer.from_pretrained``; stored as ``MODEL_NAME``.
    max_length : int
        Stored as ``MAX_LENGTH``.
    padding : bool
        Stored as ``PAD_TO_MAX_LENGTH``.
    batch_size : int
        Stored as ``BATCH_SIZE``.
    epochs : int
        Stored as ``EPOCHS``.
    metrics : list
        Stored as ``METRICS``. A list is copied so that instances never share
        (or mutate) the default list object.
    verbose : int
        Stored as ``VERBOSE``.
    train_splits : int
        Stored as ``TRAIN_SPLITS``.
    accelerator : str
        ``"TPU"`` attempts to connect to a TPU and falls back to ``"GPU"`` on
        failure; any other value uses the default distribution strategy.
    myluckynumber : int
        Stored as ``SEED``.
    """
    def __init__(
        self,
        model_name,
        max_length = 64,
        padding = True,
        batch_size = 128,
        epochs = 5,
        metrics = ["sparse_categorical_accuracy"],
        verbose = 1,
        train_splits = 5,
        accelerator = "TPU",
        myluckynumber = 13
    ):
        # seed and accelerator
        self.SEED = myluckynumber
        self.ACCELERATOR = accelerator

        # paths
        self.PATH_TRAIN = Path("/content/Data/PLData.csv")
        #self.PATH_TEST  = Path("test.csv.zip")

        # splits
        self.TRAIN_SPLITS = train_splits

        # mapping of party label to class id (and back)
        self.PARTY_MAP = {
            "Republican"   : 0,
            "Democrat"   : 1,
        }

        self.INVERSE_PARTY_MAP = {v: k for k, v in self.PARTY_MAP.items()}

        # model configuration
        self.MODEL_NAME = model_name
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_NAME)

        # model hyperparameters
        self.MAX_LENGTH = max_length
        self.PAD_TO_MAX_LENGTH = padding
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epochs
        # copy the list: the default argument is a single shared object, so
        # storing it directly would let one instance alter every later one
        self.METRICS = list(metrics) if isinstance(metrics, list) else metrics
        self.VERBOSE = verbose

        # initializing accelerator
        self.initialize_accelerator()

    def initialize_accelerator(self):
        """
        Initializing accelerator.

        Sets ``self.strategy``, ``self.tpu`` (``None`` unless a TPU was
        initialized), ``self.AUTO`` and ``self.REPLICAS``. When
        ``self.ACCELERATOR`` is ``"TPU"`` but the TPU cannot be reached or
        initialized, ``self.ACCELERATOR`` is rewritten to ``"GPU"`` and the
        default strategy is used instead of raising.
        """
        # no TPU resolver is held until one has been initialized successfully
        self.tpu = None

        # checking TPU first
        if self.ACCELERATOR == "TPU":
            print("Connecting to TPU")
            try:
                tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
                print(f"Running on TPU {tpu.master()}")
            except ValueError:
                print("Could not connect to TPU")
                tpu = None

            if tpu:
                try:
                    print("Initializing TPU")
                    tf.config.experimental_connect_to_cluster(tpu)
                    tf.tpu.experimental.initialize_tpu_system(tpu)
                    self.strategy = tf.distribute.experimental.TPUStrategy(tpu)
                    self.tpu = tpu
                    print("TPU initialized")
                except Exception as error:
                    # report the cause and fall back, so self.strategy is
                    # always defined by the time REPLICAS is computed below
                    print("Failed to initialize TPU")
                    print(f"Reason: {error!r}")
                    self.ACCELERATOR = "GPU"
            else:
                print("Unable to initialize TPU")
                self.ACCELERATOR = "GPU"

        # default for CPU and GPU
        if self.ACCELERATOR != "TPU":
            print("Using default strategy for CPU and single GPU")
            self.strategy = tf.distribute.get_strategy()

        # checking GPUs
        if self.ACCELERATOR == "GPU":
            print(f"GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")

        # defining replicas
        self.AUTO = tf.data.experimental.AUTOTUNE
        self.REPLICAS = self.strategy.num_replicas_in_sync
        print(f"REPLICAS: {self.REPLICAS}")