In [1]:
import pandas as pd

# Embeddings Import
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tf_sentencepiece

# System Utils
import os
import sys
import re
from sklearn.metrics import recall_score,accuracy_score

# Reading Writing Utils
import pandas as pd
import tqdm

# Scientific Computing
import numpy as np

# LOADING THE DATASET

In [3]:
# Location of the tab-separated feature table. Set the FEATURES_CSV environment
# variable to run the notebook on another machine; the default is the path the
# notebook was originally written against.
FEATURES_CSV = os.environ.get('FEATURES_CSV', '/home/himaninegi/final_features_full.csv')

metadata2 = pd.read_csv(FEATURES_CSV, sep='\t')
metadata2.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,like_ratio,dislike_ratio,views_to_days,duration,comment_count,peaks_count,heightxwid,label,comments_sim_score
0,8,8,0Cu805aSMqY,0.053065,0.008097,12.11811,127.0,56,1.0,0.084689,1.0,0.646691
1,44,44,WzhoYJk0i-g,0.152995,0.242009,2.411111,181.0,26,2.0,0.0111,1.0,0.749086


In [4]:
from sklearn.model_selection import train_test_split

# Target vector: the `label` column.
y = metadata2['label']

# Feature matrix: drop the target, the video id, `views_to_days`, and the two
# `Unnamed: *` columns (presumably index columns left over from earlier CSV
# round-trips -- verify against the file that produced this table).
x = metadata2.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'id', 'label', 'views_to_days']
)

# Step 1: hold out 25% of all rows as the test set, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Step 2: carve 25% of the remaining rows out as a validation set,
# again stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=42,
)

In [5]:
# Sanity check: list the feature columns left in the training frame.
x_train.columns

Index(['like_ratio', 'dislike_ratio', 'duration', 'comment_count',
       'peaks_count', 'heightxwid', 'comments_sim_score'],
      dtype='object')

In [6]:
# (rows, columns) of the train, test and validation feature frames, in that order.
tuple(frame.shape for frame in (x_train, x_test, x_val))

((1384, 7), (616, 7), (462, 7))

# COLLATE MODEL

In [7]:
import keras
import numpy as np
from keras import initializers
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Flatten, Reshape
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras import regularizers
from keras import backend as K
from sklearn.metrics import classification_report

# Fix the random seeds before any model is built: NumPy's global RNG gets seed 1
# and TensorFlow gets seed 2.
# NOTE(review): `tensorflow.set_random_seed` looks like the TF 1.x API -- confirm
# the installed TensorFlow version still exposes it before upgrading.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


class ASL:
    """
        Autoencoder supervised learning class.

        Builds a small dense network whose 5-unit bottleneck feeds a softmax
        classifier and, in the "regularized" variant, also a decoder that
        reconstructs the input features.

        Initialize the class with the number of labeled and unlabeled training
        rows, then call simple_setup() before any training.
    """

    # L1 activity penalty applied to every hidden layer (10e-5 == 1e-4).
    L1_ACTIVITY_PENALTY = 10e-5

    def __init__(self, n_samples_train_labeled, n_samples_train_unlabeled, verbose=1, data=None):
        """
        Store the data splits and the labeled / unlabeled row counts.

        Parameters
        ----------
        n_samples_train_labeled : int
            Number of leading training rows treated as labeled.
        n_samples_train_unlabeled : int
            Number of training rows, directly after the labeled ones, treated
            as unlabeled.
        verbose : int
            Verbosity handed to ``model.fit``. Defaults to 1, the value every
            earlier run effectively used because simple_setup() used to
            overwrite this argument with 1; an explicit value is now honoured.
        data : tuple, optional
            ``(x_train, y_train, x_test, y_test)``. The x parts must support
            ``.iloc`` and ``.shape`` (e.g. pandas DataFrames); the y parts are
            1-D class labels. When omitted, the notebook-level variables
            ``x_train``, ``y_train``, ``x_test`` and ``y_test`` are used, as
            before.
        """
        if data is None:
            # Legacy behaviour: pick the splits up from the notebook globals.
            data = (x_train, y_train, x_test, y_test)
        self.x_train, self.y_train, self.x_test, self.y_test = data

        # Keep the original 1-D labels so simple_setup() can be re-run safely:
        # it always encodes from these, never from already-encoded labels.
        self._y_train_raw = self.y_train
        self._y_test_raw = self.y_test

        self.n_labeled = n_samples_train_labeled
        self.n_unlabeled = n_samples_train_unlabeled
        self.verbose = verbose

    def simple_setup(self):
        """
        Prepare hyper-parameters and labeled / unlabeled views of the data
        for the dense model built by create_simple_model().

        Safe to call more than once: labels are always one-hot encoded from
        the raw labels captured in __init__.
        """
        # Number of input features, taken from the data instead of hard-coding 7.
        self.input_shape = self.x_train.shape[1]
        self.num_classes = 2
        self.regularized_batch_size = 40
        self.basic_batch_size = 25
        self.epochs = 150

        # convert class vectors to binary class matrices
        self.y_train = keras.utils.to_categorical(self._y_train_raw, self.num_classes)
        self.y_test = keras.utils.to_categorical(self._y_test_raw, self.num_classes)

        # train on n labeled data, rest unlabeled:
        # rows past the labeled prefix get an all-zero target.
        y_train_pruned = np.copy(self.y_train)
        y_train_pruned[self.n_labeled:, :] = 0

        labeled = slice(0, self.n_labeled)
        unlabeled = slice(self.n_labeled, self.n_labeled + self.n_unlabeled)

        self.x_train_labeled = self.x_train.iloc[labeled]
        self.y_train_pruned_labeled = y_train_pruned[labeled]
        self.y_train_labeled = self.y_train[labeled]

        # Slices past the end of the data are simply empty, so asking for more
        # unlabeled rows than exist is not an error.
        self.x_train_unlabeled = self.x_train.iloc[unlabeled]
        self.y_train_pruned_unlabeled = y_train_pruned[unlabeled]

        self.model_creator = self.create_simple_model

    def create_simple_model(self, regularized):
        """
        Build the dense encoder / classifier, optionally with a decoder head.

        Every hidden Dense layer carries an L1 activity regularizer.
        Requires simple_setup() to have been called (reads self.input_shape
        and self.num_classes).

        Parameters
        ----------
        regularized : bool
            If truthy, return a two-output model ``[class, reconstruction]``;
            otherwise return the classifier-only model.
        """
        def hidden(units, tensor):
            # ReLU Dense layer with the shared L1 activity penalty.
            return Dense(units, activation='relu',
                         activity_regularizer=regularizers.l1(self.L1_ACTIVITY_PENALTY))(tensor)

        visible = Input(shape=(self.input_shape, ))
        encode = hidden(128, visible)
        encode = hidden(128, encode)
        encode = hidden(5, encode)

        output = Dense(self.num_classes, name='class', activation='softmax')(encode)

        decode = hidden(128, encode)
        decode = hidden(128, decode)
        # NOTE(review): a sigmoid head can only emit values in (0, 1), while raw
        # features such as `duration` are far larger -- confirm the inputs are
        # scaled before training the regularized variant.
        decode = Dense(self.input_shape, activation='sigmoid', name='reconstruction')(decode)

        if regularized:
            return Model(inputs=visible, outputs=[output, decode])
        return Model(inputs=visible, outputs=output)

    def train_basic_model(self, validation_data=None):
        """
        Train the classifier-only model on the labeled rows and report
        metrics on the test split.

        Parameters
        ----------
        validation_data : tuple, optional
            ``(x, y_one_hot)`` monitored during fitting. Defaults to the test
            split, which is what this method always used.
            NOTE(review): monitoring on the test split means the reported
            test metrics are not from untouched data if the curves are used
            for tuning -- consider passing a separate validation split.

        Returns
        -------
        tuple
            ``(true_class, pred_class, predictions)``: lists of class indices
            for the test rows, plus the raw softmax outputs from
            ``model.predict``.
        """
        if validation_data is None:
            validation_data = (self.x_test, self.y_test)

        model = self.model_creator(False)
        model.summary()

        model.compile(loss='categorical_crossentropy',
                      optimizer=keras.optimizers.Adadelta(),
                      metrics=['accuracy'])

        model.fit(np.copy(self.x_train_labeled),
                  np.copy(self.y_train_labeled),
                  batch_size=self.basic_batch_size,
                  epochs=self.epochs,
                  verbose=self.verbose,
                  validation_data=validation_data)

        predictions = model.predict(self.x_test)
        # Class index = position of the largest probability / the one-hot 1.
        pred_class = list(np.argmax(predictions, axis=1))
        true_class = list(np.argmax(self.y_test, axis=1))

        print("the length of the true_class is",len(true_class))
        print("the lenth of the pred class is",len(pred_class))

        print("the lenth of the pred-prob class is",len(predictions))

        print("\n\n==========================================================")
        print("basic model: " + str(self.n_labeled) + " samples")
        print(classification_report(true_class, pred_class, digits=4))

        print("recall-None",recall_score(true_class,pred_class,average=None))
        print("recall-macro ",recall_score(true_class,pred_class,average='macro'))
        print("recall-micro ",recall_score(true_class,pred_class,average='micro'))

        score = model.evaluate(self.x_test, self.y_test, verbose=0)
        print(score)
        for metric_name, value in zip(model.metrics_names, score):
            print(metric_name + ":", value)
        return true_class,pred_class,predictions

Using TensorFlow backend.


In [8]:
# Treat every training row as labeled. The row count is read from the split
# itself so it cannot drift out of sync with the data (it was hard-coded as 1384).
n_train_rows = len(x_train)

# The unlabeled count is kept equal to the labeled count, as before; because the
# labeled prefix already covers the whole training frame, the unlabeled slice
# taken in simple_setup() is empty.
asl = ASL(n_train_rows, n_train_rows)
asl.simple_setup()
y_t, y_p, probs = asl.train_basic_model()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 7)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1024      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 645       
_________________________________________________________________
class (Dense)                (None, 2)                 12        
Total params: 18,193
Trainable params: 18,193
Non-trainable params: 0
_________________________________________________________________

Train on 1384 samples, validate on 616 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
E