In [1]:
import tensorflow as tf
from keras.optimizers import adam
from keras.layers.normalization import BatchNormalization
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten,Conv2D, MaxPooling2D,LeakyReLU,GRU,LSTM
from keras.utils import to_categorical, plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from tensorflow.keras.initializers import he_uniform
from keras.constraints import max_norm

Using TensorFlow backend.


In [2]:

#Only for test porpouse.
from gem.utils import graph_util, plot_util
from gem.evaluation import visualize_embedding as viz
from gem.evaluation import evaluate_graph_reconstruction as gr


from gem.embedding.gf       import GraphFactorization
from gem.embedding.hope     import HOPE
from gem.embedding.lap      import LaplacianEigenmaps
from gem.embedding.lle      import LocallyLinearEmbedding
from gem.embedding.node2vec import node2vec
from gem.embedding.sdne     import SDNE

In [3]:
def txt_extract(names_data,path):
    data_list=[]
    for i in names_data:
        sentence = ""
        data_path = path + str(i)+".txt"
        text= open(data_path,'r')
        for x in text:
            sentence+=str(x)
        data_list.append(sentence)
        
        text.close()
    return data_list

In [4]:
def relation_extractor(data):
    full_extraction=[]
    for i in data:
        data_aux=""
        partial_extraction=[]
        data_aux=i.replace("'","")
        data_aux=data_aux.replace("[","")
        data_aux=data_aux.replace("]","")
        data_aux=data_aux.replace("->",",")
        data_aux=data_aux.replace(" ","")
        data_aux=data_aux.split("\n")
        for j in data_aux:
            if 'ROOT' in j:
                continue
            else:
                list_aux= j.split(",")
                partial_extraction.append(list_aux)
        full_extraction.append(partial_extraction)
    return full_extraction
def train_data_transform(data):
    for x in range(len(data)):
        for y in range(len(data[x])):
            for z in range(len(data[x][y])):
                if data[x][y][z]!='':
                    data[x][y][z] = float(data[x][y][z])
                else:
                    data[x][y][z]=0
    return data
def zeros_padding(data):
    largo_data = len(data)
    len_max = max_array(data)
    new_data = np.zeros((largo_data*len_max,6))
    x=0
    for i in range(largo_data):
        for j in range(len(data[i])):
            y=0
            for k in range(len(data[i][j])):
                new_data[x][y]=data[i][j][k]
                
                y+=1
            x+=1
    return new_data
def max_array(data):
    largo_max=0
    for i in data:
        if len(i)>largo_max:
            largo_max=len(i)
        else:
            continue
    return largo_max
def X_data_import(names_data,path):
    X = txt_extract(names_data,path)
    X = relation_extractor(X)
    X = train_data_transform(X)
    X = np.array(X)
    X=zeros_padding(X)
    return X

In [5]:
def txt_original_extract(name):
    original_list=[]
    data=open(name,'r')
    data_read=[line.rstrip('\n') for line in data]
    for x in data_read:
        aux_list=[]
        x_aux=x.split("\t",1)
        original_id=x_aux[0]
        x_aux.remove(original_id)
        x_aux = " ".join(str(w) for w in x_aux)
        aux_list.append(original_id)
        aux_list.append(x_aux)
        original_list.append(aux_list)
    data.close()
    return original_list

In [6]:
def cat_encode(data,cat):
    data=list(data)
    cat=list(cat)
    for i in range(len(data)):
        for j in range(len(cat)):
            if data[i]==cat[j]:
                data[i]=j
                break
            else:
                continue
    return keras.utils.to_categorical(data,len(cat))
    #return np.asarray(data)
def cat_decode(data,cat):
    #Y=data
    Y= [np.argmax(y, axis=None, out=None) for y in data]
    cat=list(cat)
    
    for i in range(len(Y)):
        Y[i]=cat[Y[i]]
    return Y

In [7]:
def auto_predict(data_x,data_set,cat,example_show):
    pred = model.predict(data_x)
    y_test = cat_decode(pred,cat)
    data_set['Expected']= y_test
    data_set.to_csv('sample_submission_1234.csv',columns=['Id','Expected'],index=False)
    if example_show==True:
        return print("Exito!\n",y_test[0:20])
    else:
        return print("Exito")

## Loading data

In [8]:
train_labels = pd.read_csv('./data/train_labels.csv')
test_labels = pd.read_csv('./data/sample_submission.csv')
train_message_path= "./data/train_source_tweets.txt"
train_path = './data/train/'
test_path = './data/test/'

###########        This creates original df with original message and label      ##################33
x_original = txt_original_extract(train_message_path)
df_original = pd.DataFrame(x_original,  columns =['id','original message'])
df_original['id']= df_original['id'].astype('int64')
df_train = pd.merge(df_original, train_labels, how='inner', left_on='id', right_on='id')
Y_train = df_train["label"]
df_train.head(5)

##############################################################################33

Unnamed: 0,id,original message,label
0,692735698349199360,north korea 'planning some kind of rocket laun...,non-rumor
1,525008463819464704,"meet kevin vickers, the hero who shot down the...",unverified
2,505611045897924608,"15 year old who ""swatted"" gamer convicted of d...",false
3,693466724822323200,audio recordings reveal cpr started 11 minutes...,non-rumor
4,510922415468449792,awful mt @scclemons:uk aid worker david haines...,true


### Extracting train data

In [9]:
train_message = df_train["original message"].values
train_relations = X_data_import(df_train["id"],train_path )
y_train = df_train["label"]
cat = Y_train.unique()
Y_train=cat_encode(Y_train,cat)
input_dim=train_relations.shape

In [10]:
train_relations[4]

array([5.40261200e+06, 6.92735698e+17, 0.00000000e+00, 7.87012000e+05,
       6.92735698e+17, 6.30000000e-01])

In [22]:
df_relations = pd.DataFrame(data=train_relations,columns=['P_uid', 'P_tweet_ID',"P_time", 'C_uid', 'C_tweet_ID',"C_time" ])
df_relations.head(5)

Unnamed: 0,P_uid,P_tweet_ID,P_time,C_uid,C_tweet_ID,C_time
0,5402612.0,6.927357e+17,0.0,14614290.0,6.927357e+17,0.63
1,5402612.0,6.927357e+17,0.0,154702800.0,6.927357e+17,0.63
2,5402612.0,6.927357e+17,0.0,1172824000.0,6.927357e+17,0.63
3,5402612.0,6.927357e+17,0.0,1591803000.0,6.927357e+17,0.63
4,5402612.0,6.927357e+17,0.0,787012.0,6.927357e+17,0.63


## Sample of train data

In [19]:
train_relations.shape

(6057270, 6)

# Model

In [25]:
model = Sequential()

# The output of GRU will be a 3D tensor of shape (batch_size, timesteps, 256)
model.add(GRU(128, return_sequences=True,input_shape=(None,7)))

# The output of SimpleRNN will be a 2D tensor of shape (batch_size, 128)
#model.add(SimpleRNN(128))

#model.add(Dense(4,activation='softmax'))

model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_3 (GRU)                  (None, None, 128)         52608     
Total params: 52,608
Trainable params: 52,608
Non-trainable params: 0
_________________________________________________________________


# Compile

In [26]:
####################################        NUEVO                    #######################################
#Nota: Para el tercer modelo (test) con data augmentation
    
    
#########################################           CALLBACKS           #########################################
early_stop = EarlyStopping(monitor='val_categorical_accuracy', patience=12)

check_point = ModelCheckpoint(
    filepath='/tmp/checkpoint',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,    
    verbose=1
)

reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.85, patience=7, verbose=0, mode='auto', cooldown=0, min_lr=0.00006
)

#########################################           OPTIMIZER           #########################################
opt=tf.keras.optimizers.Nadam()

########################################           LOSS FUCTION           #######################################

#########################################           Metrics           #########################################


In [27]:
model.compile(loss="categorical_crossentropy",optimizer=opt,metrics=["categorical_accuracy"])
train = model.fit(train_relations,Y_train,epochs=20,validation_split=0.3,batch_size=128,shuffle=True)

InvalidArgumentError: indices[1926] = 1926 is not in [0, 1926) [Op:GatherV2]

# Results

In [None]:
print(input_dim)

In [None]:
len(train_relations[3])

In [None]:
a = np.zeros((4,4))
len(a[0])