In [None]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer, TFAutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import nltk                                         
from nltk.corpus import stopwords                   
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the sample submission that is later filled with predictions and saved.
train = pd.read_csv('../input/janatahack-independence-day-2020-ml-hackathon/train.csv')
test = pd.read_csv('../input/janatahack-independence-day-2020-ml-hackathon/test.csv')
ss = pd.read_csv('../input/janatahack-independence-day-2020-ml-hackathon/sample_submission_UVKGLZE.csv')

In [None]:
# Flatten every abstract onto one line (newline -> '. ') and lowercase it.
# pandas' vectorised string methods replace the per-row Python lambda; on
# string data the result is identical. `regex=False` keeps the replacement a
# literal substring match. Missing values propagate as NaN instead of raising
# inside the lambda.
train['ABSTRACT'] = train['ABSTRACT'].str.replace('\n', '. ', regex=False).str.lower()
test['ABSTRACT'] = test['ABSTRACT'].str.replace('\n', '. ', regex=False).str.lower()

In [None]:
# Prefix each abstract with its title, separated by the literal ' <join>'
# marker. The combined text overwrites the ABSTRACT column, so running this
# cell twice would prepend the title a second time.
train['ABSTRACT'] = train['TITLE'].str.cat(train['ABSTRACT'], sep=' <join>')
test['ABSTRACT'] = test['TITLE'].str.cat(test['ABSTRACT'], sep=' <join>')

In [None]:
# These imports repeat the ones in the first cell on purpose: the assignment
# below rebinds the name `stopwords` from the NLTK corpus reader to a plain
# list, so the reader has to be re-imported for this cell to be re-runnable.
import nltk
from nltk.corpus import stopwords
# De-duplicated English stop words; read as a module-level global by remStop.
# Element order is arbitrary because the values pass through a set.
stopwords = list(set(stopwords.words('english')))

In [None]:
def remStop(s):
    """Remove stop words from a space-delimited string.

    The input is split on single space characters. Every token that is not
    in the module-level ``stopwords`` collection is kept and followed by one
    space, so the result always ends with a trailing space. Matching is
    exact and case-sensitive; empty tokens produced by consecutive spaces
    are kept.

    Parameters
    ----------
    s : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens, each terminated by a single space.
    """
    kept = [token + ' ' for token in s.split(' ') if token not in stopwords]
    return ''.join(kept)

In [None]:
# Strip stop words from the combined title + abstract text, row by row.
train['ABSTRACT'] = train['ABSTRACT'].map(remStop)
test['ABSTRACT'] = test['ABSTRACT'].map(remStop)

In [None]:
# Preview the first rows of the training frame after preprocessing.
train.head()

In [None]:
# Count how often each value of the 'Quantitative Finance' label occurs.
train['Quantitative Finance'].value_counts()

In [None]:
# Inspect one fully preprocessed text (row label 10) as a sanity check.
train.loc[10, 'ABSTRACT']

In [None]:
# Create a resolver for the TPU attached to this session.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# The return value of master() is shown as the cell output
# (presumably the TPU endpoint address — confirm against the TF docs).
tpu.master()

In [None]:
# Connect the runtime to the resolved TPU cluster, then initialise the TPU
# system so a distribution strategy can be created on it.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
# Distribution strategy for the TPU; its replica count is used to size the
# global batch in the configuration cell.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Tokenizer for the same 'roberta-large-mnli' checkpoint that the training
# loop loads as its transformer backbone.
tokenizer = AutoTokenizer.from_pretrained('roberta-large-mnli')

In [None]:
def encode(text, tokenizer, max_len=200):
    """Tokenize a batch of strings into fixed-width rows of input ids.

    Parameters
    ----------
    text : sequence of str
        Texts to tokenize (list, tuple, or 1-D array of strings).
    tokenizer : object
        Tokenizer exposing ``batch_encode_plus`` (for example a Hugging Face
        tokenizer).
    max_len : int, default 200
        Each sequence is padded or truncated to exactly this many tokens.
        It must equal the length of the model's input layer.

    Returns
    -------
    numpy.ndarray
        ``encoded['input_ids']`` converted to an array, one row per input
        text.
    """
    encoded = tokenizer.batch_encode_plus(
        # Pass a real list: some tokenizers reject numpy arrays as a batch.
        list(text),
        return_attention_mask=False,   # only the input ids are returned
        return_token_type_ids=False,
        padding='max_length',          # pad short sequences up to max_len
        truncation=True,               # explicitly cut long sequences to max_len
        max_length=max_len,
    )
    return np.array(encoded['input_ids'])

In [None]:
# Target label columns. The training loop fits one binary classifier per
# entry and writes its predictions to the same-named submission column.
cols = ['Computer Science', 'Physics', 'Mathematics', 
        'Statistics', 'Quantitative Biology', 'Quantitative Finance']

In [None]:
# Training configuration shared by the cells below.
AUTO = tf.data.experimental.AUTOTUNE  # passed to Dataset.prefetch in the training loop
# NOTE(review): EPOCHS is not referenced by the visible training loop, which passes epochs = 8.
EPOCHS = 4
BATCH_SIZE = 32 * tpu_strategy.num_replicas_in_sync  # global batch: 32 examples per replica
MAX_LEN = 200  # length of the model's input layer; matches the max_length used in encode
# NOTE(review): MODEL is unused in the visible cells; the tokenizer and the
# transformer are both loaded from 'roberta-large-mnli'.
MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'

In [None]:
# Show the preprocessed test frame as the cell output.
test

In [None]:
# Show the preprocessed training frame as the cell output.
train

In [None]:
# NOTE(review): this instance is created outside any distribution-strategy
# scope, and the training loop rebinds `transformer` to a freshly loaded
# model on every iteration, so this load looks redundant — confirm before
# removing.
transformer = TFAutoModel.from_pretrained('roberta-large-mnli')

In [None]:
# Tokenisation is the same for every label, so encode once up front instead
# of repeating it on each pass through the loop.
X_train = encode(train.ABSTRACT.values, tokenizer)
X_test = encode(test.ABSTRACT.values, tokenizer)
n_steps = X_train.shape[0] // BATCH_SIZE  # full batches in one pass over the training rows

# Fine-tune one independent binary classifier per label column and write its
# test-set probabilities into the matching column of the submission frame.
for col in cols:
    # Reconnect to and re-initialise the TPU before building each model.
    # NOTE(review): presumably done to release the previous model's TPU
    # memory — confirm before hoisting this out of the loop.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tpu.master()

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

    y_train = train[col].values

    # Endless shuffled training stream; steps_per_epoch bounds each epoch.
    train_data = (
        tf.data.Dataset
        .from_tensor_slices((X_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    # Test rows are kept in file order so predictions line up with `ss`.
    test_data = (
        tf.data.Dataset
        .from_tensor_slices(X_test)
        .batch(BATCH_SIZE)
    )

    with tpu_strategy.scope():
        # The model and its variables must be created under the strategy scope.
        transformer = TFAutoModel.from_pretrained('roberta-large-mnli')
        input_word_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
        sequence_output = transformer(input_word_ids)[0]
        # Use the hidden state at the first token position as the sequence summary.
        cls_token = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(cls_token)
        model = Model(inputs=input_word_ids, outputs=out)
        # `learning_rate` replaces the deprecated `lr` alias.
        model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

        # NOTE(review): the epoch count is hard-coded to 8 here; the EPOCHS
        # constant defined earlier is not used — confirm which is intended.
        t_h = model.fit(train_data, steps_per_epoch=n_steps,
                        epochs=8)

    # The Dense(1) head makes predict() return one column per row; flatten it
    # to 1-D before assigning it to a single DataFrame column.
    ss[col] = model.predict(test_data, verbose=1).ravel()

In [None]:
# Show the submission frame now that every label column has been filled.
ss

In [None]:
# Rounding to hard 0/1 labels is left disabled, so the file contains the raw
# sigmoid outputs of each per-label model.
#ss.iloc[:,1:] = np.round(ss.iloc[:,1:])
# Write the submission without the DataFrame index column.
ss.to_csv('submission.csv',index=False)