In [1]:
from numpy.random import seed
import random
seed(1)
import tensorflow as tf
tf.random.set_seed(2)

In [2]:
import nltk
import re
import pandas as pd
import numpy as np
import os
from pathlib import Path
from scipy import stats
from tensorflow.keras.layers import Input, Dense, Lambda

In [3]:
import keras.backend as K
import numpy as np
import tensorflow as tf

def log10(x):
  numerator = K.log(x)
  denominator = K.log(K.constant(10, dtype=numerator.dtype))
  return numerator / denominator

def gaussian_nll(ytrue, ypreds):
    """Keras implmementation of multivariate Gaussian negative loglikelihood loss function. 
    This implementation implies diagonal covariance matrix.
    
    Parameters
    ----------
    ytrue: tf.tensor of shape [n_samples, n_dims]
        ground truth values
    ypreds: tf.tensor of shape [n_samples, n_dims*2]
        predicted mu and logsigma values (e.g. by your neural network)
        
    Returns
    -------
    neg_log_likelihood: float
        negative loglikelihood averaged over samples
        
    This loss can then be used as a target loss for any keras model, e.g.:
        model.compile(loss=gaussian_nll, optimizer='Adam') 
    
    """
    
    n_dims = int(int(ypreds.shape[1])/2)
    mu = ypreds[:, 0:n_dims]
    logsigma = ypreds[:, n_dims:]
    
    mse = 0.5*K.sum(K.square(ytrue-mu)/(logsigma),axis=1)
    sigma_trace = 0.5*K.sum(log10(logsigma), axis=1)
    log2pi = 0.5*n_dims*np.log(2*np.pi)
    
    log_likelihood = mse+sigma_trace+log2pi

    return K.mean(log_likelihood)

Using TensorFlow backend.


In [4]:
test_eval_data = "/Users/zxj/Desktop/study/semester3/MCS/humor/data/task-1/data/task-1/test_eval.csv"

test_eval=pd.read_csv(test_eval_data)
test_eval_label1=test_eval.meanGrade
test_eval_label2=test_eval.grades
#get orignial news
test_eval_news=test_eval.original
test_eval_funny_word=test_eval.edit

In [5]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
tt=nltk.tokenize.regexp.WordPunctTokenizer()
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()

punctuation = '!,;:?"\'.\'/<>'
def removePunctuation(text):
    text = re.sub(r'[{}]+'.format(punctuation),'',text)
    return text.strip()

#catch the changed word
def find_tag(news):
    p = re.compile(r'[<](.*?)/[>]', re.S)
    return re.findall(p, news)

def preprocess_news(sentence,funny_word):
    final=[]
    num=0
    bad_sign=["’","‘","-"]
    
    for event in sentence:
        lines=[]       
        event=str(event)
        changed_word=find_tag(event)   
        event=removePunctuation(event)
        event=tt.tokenize(event)
        #tokenize and remove Punctuation
        
        for i in event:
            if i not in bad_sign:
                
                if i in changed_word:
                    #add the funny behind the changed word
                    #lines.append(i)
                    lines.append(funny_word[num])
                else:
                    lines.append(i)
                    
        lines=' '.join(lines)
        lines='[CLS]'+' '+lines+' '+'[SEP]'
        final.append(lines)
        num+=1
    return final

In [17]:
#get mean and varience for each headline
test_eval_label=[]
test_eval_label3=[]
for i in range(0,len(test_eval_label1)):
    label1=test_eval_label1[i]
    label2=str(test_eval_label2[i])
    mean=label1
    
    def get_grade(label):
        grade=[]
        for i in label:
            grade.append(int(i))
        return grade
    
    grade=get_grade(label2)
    
    #get varience 
    def get_var(grade,mean):
        Var=0
        for i in grade:
            var=np.square(grade-mean)
            Var+=var
        return np.mean(Var)
    
    var=np.sqrt(get_var(grade,mean))
    test_eval_label3.append(float(var))
    test_eval_label.append([mean,var])

In [7]:
test_eval_news=preprocess_news(test_eval_news,test_eval_funny_word)

In [8]:
test_eval_news[0]

'[CLS] The Latest Election tally shows Cars turning right [SEP]'

In [9]:
from transformers import AlbertTokenizer, TFAlbertModel
#import tensorflow as tf
import numpy as np
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2',token_ids_1=None)
inputs_train = tokenizer(test_eval_news,padding=True,return_tensors="pt")
inputs_train=np.array(inputs_train['input_ids'])

In [30]:
input_layer=tf.keras.Input(shape=(37,),dtype='int64')
bert=TFAlbertModel.from_pretrained('albert-base-v2', return_dict=True)(input_layer)
layer1=bert[1]
regression=tf.keras.layers.Dense(1)(layer1)
#result=Lambda(negative_binomial_layer)(regression)
model=tf.keras.Model(inputs=input_layer,outputs=regression)
model.summary()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing TFAlbertModel: ['predictions']
- This IS expected if you are initializing TFAlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFAlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the model checkpoint at albert-base-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 37)]              0         
_________________________________________________________________
tf_albert_model_1 (TFAlbertM TFBaseModelOutputWithPool 11683584  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 769       
Total params: 11,684,353
Trainable params: 11,684,353
Non-trainable params: 0
_________________________________________________________________


In [39]:
opt=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
metric = tf.keras.metrics.MeanSquaredError()
model.compile(optimizer=opt, loss=tf.keras.losses.MeanSquaredError(),metrics=[metric])
model_fit = model.fit(inputs_train, np.array(test_eval_label3),
                      batch_size=1, epochs=10,
                      validation_data=(inputs_train[:100],np.array(test_eval_label3[:100]))
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
def predict_prob(model, x, batch_size=2048):
    """Make predictions given model and 2d data
    """

    ypred = model.predict(x, batch_size=batch_size, verbose=1)
    n_outs = int(ypred.shape[1] / 2)
    mean = ypred[:, 0:n_outs]
    sigma = np.exp(ypred[:, n_outs:])

    return mean, sigma

In [37]:
def evaluation(test,result):
    pre=model.predict(test)
    pres=[]
    for i in pre:
        #a=float(i)*3
        pres.append(i)
    print(pres)
    rmse = np.sqrt(np.mean((np.array(result) - np.array(pres))**2))

    
    return rmse

In [41]:
evaluation(inputs_train[:100],test_eval_label3[:100])

[array([1.259876], dtype=float32), array([1.2499784], dtype=float32), array([1.2339311], dtype=float32), array([1.2576215], dtype=float32), array([1.2453129], dtype=float32), array([1.2411587], dtype=float32), array([1.2606369], dtype=float32), array([1.2428229], dtype=float32), array([1.2592647], dtype=float32), array([1.2514399], dtype=float32), array([1.2540429], dtype=float32), array([1.2495706], dtype=float32), array([1.2440271], dtype=float32), array([1.2461654], dtype=float32), array([1.2555704], dtype=float32), array([1.2469335], dtype=float32), array([1.253432], dtype=float32), array([1.2503792], dtype=float32), array([1.245656], dtype=float32), array([1.2538016], dtype=float32), array([1.2523233], dtype=float32), array([1.2451394], dtype=float32), array([1.255726], dtype=float32), array([1.2542164], dtype=float32), array([1.2459944], dtype=float32), array([1.2579632], dtype=float32), array([1.2486812], dtype=float32), array([1.2513736], dtype=float32), array([1.2455356], dtyp

0.7194494330720006

In [40]:
np.array(test_eval_label1[:100])

array([1.2, 0.4, 1. , 1.6, 0.4, 0.6, 1.6, 1.2, 1. , 0.4, 1.6, 0.6, 0.2,
       0.2, 1.8, 0.2, 0.4, 1.4, 0. , 2.2, 1. , 0.8, 1.2, 0.6, 2.2, 0.8,
       0.6, 1. , 0.4, 0.6, 0.2, 0.8, 1.6, 1.2, 2. , 1. , 0.6, 0. , 0.4,
       0.8, 0.6, 0.8, 0.8, 0.6, 1. , 1. , 1.2, 0.8, 2. , 1. , 1.6, 1.6,
       1.2, 0.2, 1. , 0.4, 0.4, 0. , 0.8, 0.6, 1. , 1. , 1.8, 1.2, 0.2,
       0.6, 0.8, 0. , 1.4, 0.6, 0.6, 0.6, 1.2, 1.2, 1. , 0. , 1.6, 1.4,
       1.4, 0.2, 1.2, 0.4, 1.2, 0.4, 1. , 0.4, 0.2, 0. , 1.2, 0.4, 0.6,
       0.4, 2. , 0.6, 1.9, 1.2, 0.8, 1. , 1.8, 1.8])