<a href="https://colab.research.google.com/github/Hadjadj-Benakmoum/MobilityPredictionUsingML/blob/main/LSTM(RNN)_Test_Scenario_(1%2C2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout,LSTM
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import model_to_dot
from keras.utils.vis_utils import plot_model

from sklearn.model_selection import train_test_split

import numpy as np
import time
import os


In [None]:
# Mount Google Drive in the Colab runtime; the dataset directory configured in `path` lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Copy the dataset into the runtime and unzip it

In [None]:
#Copy/Unzip files from Google Drive to accelerate the training
#!cp -r '/content/drive/My Drive/movement.tar.gz' '/content/movement.tar.gz'
#!tar xvzf '/content/movement.tar.gz'

# Script parameters

In [None]:
#Test configuration

# Number of consecutive past locations used as model input; the location that follows them is the prediction target.
window = 8

# Scenario 1: fraction of each user's samples used for training (the rest is the test set).
train_percentage=0.25

# Scenario 2: fraction of the users whose sequences are used for training (the rest are test users).
train_percentage_users = 0.50

# Users whose location sequence is shorter than this are skipped by the evaluation loops.
threshold_lenght = 1000

# Label of the method; only used to build the results file name.
method = 'RNN'

# Directory that contains one mobility file per user (file names ending in "mv").
path='/content/drive/My Drive/mob pf'

# If True the model starts with an Embedding layer and inputs stay integer-coded; if False inputs are one-hot encoded.
embedding = False
# Output dimension of the Embedding layer (only relevant when `embedding` is True).
embedding_size = 20
# Number of epochs passed to model.fit.
nb_epochs = 20

# Batch size passed to model.fit.
batch_size = 256
# If True a second LSTM layer is stacked on the first one.
hidden_lstm=True
# Number of units of each LSTM layer.
lstm_nb_units = 16
# If True a hidden Dense layer is inserted before the softmax output layer.
hidden_dense=True
# Size intended for the hidden Dense layer (default value of build_model's `dense_nb_unit` argument).
dense_nb_unit = 30

# Name of the CSV file the Scenario 1 results are written to; it encodes the main settings above.
test_configuration ='Scenario:{0} W:{1} Method:{2} embedding:{3} embedding_size:{4} nb_epochs:{5}.csv'.format(1,window,method,embedding,embedding_size,nb_epochs)
print('Test strategy : ', test_configuration)

Test strategy :  Scenario:1 W:8 Method:RNN embedding:False embedding_size:20 nb_epochs:20.csv


# Function to read data from a mobility file

In [None]:
def read_location_file(path):
  """Read one mobility file and return its sequence of locations.

  Every line is split on whitespace and its second field is taken as the
  location identifier.

  Args:
    path: Path of the mobility file to read.

  Returns:
    List of location identifiers (strings) in file order. Lines with fewer
    than two whitespace-separated fields (for example blank lines) are
    skipped instead of raising IndexError.
  """
  locations_seq = []
  # `with` closes the file even when reading fails part-way through.
  with open(path, 'r') as fichier:
    for line in fichier:
      fields = line.split()
      if len(fields) < 2:
        continue
      locations_seq.append(fields[1])
  return locations_seq

# Preprocessing of data (encode locations and prepare them for the model)

In [None]:
# Encode locations as integer sequence
def locations_to_integers(locations_seq,tokenizer=None):
  """Encode a sequence of location identifiers as a flat list of integers.

  Args:
    locations_seq: List of location identifiers (strings).
    tokenizer: Tokenizer to reuse for the encoding. When None, a new
      `Tokenizer` is created and fitted on `locations_seq`.

  Returns:
    Tuple `(encoded_seq, tokenizer)`: the flat list of integer codes and the
    tokenizer that produced them.
  """
  # Identity test: `== None` would be routed through a custom __eq__ if the object defined one.
  if tokenizer is None:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(locations_seq)
  # texts_to_sequences returns one list per input item; flatten them in order.
  # NOTE(review): an item that yields zero or several codes changes the output
  # length relative to the input - confirm every identifier maps to one token.
  encoded_seq = [code
                 for codes in tokenizer.texts_to_sequences(locations_seq)
                 for code in codes]
  return encoded_seq,tokenizer

In [None]:
def build_X_Y(encoded_seq,window=window):
  """Cut an integer-coded sequence into sliding windows and next-step targets.

  Sample `i` uses `encoded_seq[i:i+window]` as input and
  `encoded_seq[i+window]` as target.

  Args:
    encoded_seq: Flat sequence of integer location codes.
    window: Number of consecutive codes used as input for one prediction.

  Returns:
    Tuple `(X, y)` of int32 arrays with shapes `(n, window)` and `(n,)`, where
    `n = max(len(encoded_seq) - window, 0)`. Both are empty when the sequence
    is not longer than `window`; callers must handle that case.
  """
  # Allocate exactly one row per complete window. Allocating len(encoded_seq)
  # rows left the last `window` rows all-zero, i.e. fake samples with label 0.
  nb_samples = max(len(encoded_seq) - window, 0)
  temp_seqs = np.zeros((nb_samples, window+1), dtype='int32')
  for i in range(nb_samples):
    temp_seqs[i] = encoded_seq[i:(i+window+1)]
  X = temp_seqs[:,:-1]
  y = temp_seqs[:,-1]
  return X,y

In [None]:
def integer_to_vector(X_train, X_test, y_train, y_test,Embedding=embedding,num_classes=None):
  """One-hot encode the train/test splits of one user.

  Args:
    X_train, X_test: Integer-coded input windows.
    y_train, y_test: Integer-coded targets.
    Embedding: When truthy the inputs are returned unchanged (integer-coded);
      otherwise they are one-hot encoded as well. The targets are always
      one-hot encoded.
    num_classes: Number of classes of the one-hot encoding. When None, the
      notebook-level `vocabulary_size` is used, as before this parameter
      existed.

  Returns:
    Tuple `(X_train, X_test, y_train, y_test)` after encoding.
  """
  if num_classes is None:
    # Backward-compatible fallback on the global set by the calling cell.
    num_classes = vocabulary_size
  if(not Embedding):
    X_train = to_categorical(X_train,num_classes=num_classes)
    X_test = to_categorical(X_test,num_classes=num_classes)
  y_train = to_categorical(y_train,num_classes=num_classes)
  y_test = to_categorical(y_test,num_classes=num_classes)
  return X_train, X_test, y_train, y_test

# Function to build the graph of the RNN Model

In [None]:

def build_model(vocabulary_size,
                window = window,
                embedding = embedding,
                embedding_size = embedding_size,
                hidden_lstm = hidden_lstm,
                lstm_nb_units = lstm_nb_units,
                dense_nb_unit = dense_nb_unit,
                hidden_dense = hidden_dense):
  """Build and compile the next-location prediction network.

  Args:
    vocabulary_size: Number of classes; size of the softmax output and, without
      embedding, of each one-hot input vector.
    window: Number of time steps of every input sample.
    embedding: If truthy, start with an Embedding layer fed with integer codes;
      otherwise the first LSTM reads one-hot vectors directly.
    embedding_size: Output dimension of the Embedding layer.
    hidden_lstm: If truthy, stack a second LSTM layer on the first one.
    lstm_nb_units: Number of units of each LSTM layer.
    dense_nb_unit: Number of units of the hidden Dense layer.
    hidden_dense: If truthy, insert a hidden Dense layer before the output.

  Returns:
    A compiled `Sequential` model (categorical cross-entropy loss, adam
    optimizer, accuracy metric).
  """
  time_step = window

  # The first LSTM must return the whole sequence only when a second LSTM
  # consumes it. Returning sequences unconditionally made the model output 3-D
  # when hidden_lstm was False, which does not match the 2-D one-hot targets.
  return_sequences = bool(hidden_lstm)

  model = Sequential()
  if(embedding):
    model.add(Embedding(vocabulary_size, embedding_size, input_length=time_step))
    model.add(LSTM(lstm_nb_units,activation='relu',return_sequences=return_sequences))
  else:
    model.add(LSTM(lstm_nb_units,activation='relu',input_shape=(time_step,vocabulary_size),return_sequences=return_sequences))

  if(hidden_lstm):
    model.add(LSTM(lstm_nb_units,activation='relu'))

  if(hidden_dense):
    # Honour the dense_nb_unit argument; the size used to be hard-coded to 20.
    model.add(Dense(dense_nb_unit,activation='relu'))

  model.add(Dense(vocabulary_size, activation='softmax'))

  # compiling the network
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

# Function to save results

In [None]:
import csv
def save_results_csv(output_file,users_files,Corrects,Incorrects,Accuracies):
  """Write the per-user results to a CSV file.

  Args:
    output_file: Path of the CSV file to create (overwritten if it exists).
    users_files: User file names, one per result row.
    Corrects: Number of correct predictions per user.
    Incorrects: Number of incorrect predictions per user.
    Accuracies: Accuracy per user.

  The four sequences are combined with `zip`, so they must be index-aligned;
  rows beyond the shortest sequence are dropped.
  """
  header = ['users_files','Corrects','Incorrects','Accuracies']
  # newline='' is required by the csv module so that the writer's own line
  # terminator is not translated again (which yields blank rows on Windows).
  with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(zip(users_files,Corrects,Incorrects,Accuracies))

# **Test Scenario 1** : Train/Test on each user

In [None]:
# Scenario 1: for every user with a long enough sequence, train a fresh model on
# the first part of the sequence and evaluate it on the remaining part.
Accuracies=[]
Corrects=[]
Incorrects=[]
# Names of the users actually evaluated, index-aligned with the result lists above.
evaluated_users=[]

users_files =  os.listdir(path)
print('Number of files :',len(users_files))
for filename in users_files:
    # Only mobility files are processed.
    if not filename.endswith("mv"):
        continue

    #Read sequence
    locations = read_location_file(os.path.join(path,filename))
    sequence_lenght = len(locations)

    #Ignore sequence if the lenght is less than threshold_lenght
    if(sequence_lenght <threshold_lenght):
        continue

    #Preprocessing
    encoded_locations,tokenizer = locations_to_integers(locations)
    # +1 because index 0 is left free by the +1 offset applied to the number of distinct tokens.
    vocabulary_size = len(tokenizer.word_counts)+1

    #Split the sequence Train/Test (no shuffling: the test part follows the training part in time)
    X,y = build_X_Y(encoded_locations,window)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_percentage, shuffle = False)
    X_train, X_test, y_train, y_test = integer_to_vector(X_train, X_test, y_train, y_test,embedding)

    #Train/Test the RNN model
    model = build_model(vocabulary_size=vocabulary_size)
    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              epochs=nb_epochs,
              validation_data=(X_test,y_test ),
              verbose=0)

    #Test the Model
    user_accuracy = model.evaluate(X_test,y_test)[1]
    # The accuracy is measured on the test samples only, so the counts must be
    # derived from the number of test samples, not from the whole sequence length.
    nb_test_samples = len(y_test)
    user_correct = int(round(user_accuracy*nb_test_samples))
    user_incorrect = nb_test_samples - user_correct

    #Collect results
    evaluated_users.append(filename)
    Corrects.append(user_correct)
    Incorrects.append(user_incorrect)
    Accuracies.append(user_accuracy)

    #Display results
    print('User : ', filename)
    print('User lenght : ',len(locations))
    print('Test samples : ',nb_test_samples)
    print('Correct : ',user_correct )
    print('Incorrect : ',user_incorrect)
    print('Accuracy : ',user_accuracy)
    print("======================================================================================================")

#Save final results in csv file where the name contains information about the test (w,method ....). The name of file is : Configuration.csv
# Only the evaluated users are written: passing the raw directory listing would
# pair results with the wrong file names as soon as one file is skipped.
save_results_csv(test_configuration,evaluated_users,Corrects,Incorrects,Accuracies)

print('Global results : ')
print("======================================================================================================")
print('Corrects : ',sum(Corrects))
print('Incorrects : ',sum(Incorrects))
if Accuracies:
    print('Average Acc : ',sum(Accuracies)/len(Accuracies))
else:
    # No user reached threshold_lenght: avoid dividing by zero.
    print('Average Acc : ','n/a (no user evaluated)')

Number of files : 1001
User :  227.mv
User lenght :  1402
Correct :  47
Incorrect :  1355
Accuracy :  0.03423680365085602
User :  268.mv
User lenght :  1628
Correct :  234
Incorrect :  1394
Accuracy :  0.1437346488237381
User :  386.mv
User lenght :  1672
Correct :  713
Incorrect :  959
Accuracy :  0.4270334839820862
User :  313.mv
User lenght :  1682
Correct :  537
Incorrect :  1145
Accuracy :  0.31985729932785034
User :  465.mv
User lenght :  15647
Correct :  13271
Incorrect :  2376
Accuracy :  0.8481594920158386
User :  416.mv
User lenght :  1045
Correct :  1
Incorrect :  1044
Accuracy :  0.001912045874632895
User :  520.mv
User lenght :  1425
Correct :  605
Incorrect :  820
Accuracy :  0.42496493458747864
User :  47.mv
User lenght :  1774
Correct :  393
Incorrect :  1381
Accuracy :  0.22209694981575012
User :  115.mv
User lenght :  3264
Correct :  643
Incorrect :  2621
Accuracy :  0.1973039209842682
User :  427.mv
User lenght :  2322
Correct :  419
Incorrect :  1903
Accuracy :  0.1

# **Test Scenario 2** : Train on some users and test on other users

In [None]:
# Rebuild the results file name for Scenario 2 (same settings, scenario number 2); this overwrites the Scenario 1 value.
test_configuration ='Scenario:{0} W:{1} Method:{2} embedding:{3} embedding_size:{4} nb_epochs:{5}.csv'.format(2,window,method,embedding,embedding_size,nb_epochs)
print('Test strategy : ', test_configuration)

Test strategy :  Scenario:2 W:8 Method:RNN embedding:False embedding_size:20 nb_epochs:20.csv


In [None]:
def Split_sequence(sequence,train_percentage):
  """Split a sequence into a leading training part and a trailing test part.

  Args:
    sequence: Any sliceable sequence (list, string, ...).
    train_percentage: Fraction of the elements that goes to the training part;
      the cut index is `int(train_percentage * len(sequence))`.

  Returns:
    Tuple `(train_sequence, test_sequence)`.
  """
  cut = int(train_percentage * len(sequence))
  head, tail = sequence[:cut], sequence[cut:]
  return head, tail

In [None]:
def integer_to_vector_1(X_train, y_train,vocabulary_size,Embedding=embedding):
  """One-hot encode one set of inputs and targets.

  Args:
    X_train: Integer-coded input windows.
    y_train: Integer-coded targets.
    vocabulary_size: Number of classes of the one-hot encoding.
    Embedding: When truthy the inputs are returned unchanged (integer-coded);
      otherwise they are one-hot encoded. The targets are always encoded.

  Returns:
    Tuple `(X_train, y_train)` after encoding.
  """
  inputs = X_train if Embedding else to_categorical(X_train,num_classes=vocabulary_size)
  targets = to_categorical(y_train,num_classes=vocabulary_size)
  return inputs, targets

In [None]:
# Keep only mobility files so that file names and location sequences stay
# index-aligned when both lists are split below. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would make the
# train/test user split differ from one run to the next.
list_of_files=sorted(filename for filename in os.listdir(path) if filename.endswith("mv"))

#Prepare tokenizer used to code location as integers
all_locations=[read_location_file(os.path.join(path,filename)) for filename in list_of_files]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_locations)

#Split Data (same fraction applied to both lists, so names and sequences match index by index)
train_locations,test_locations =  Split_sequence(all_locations,train_percentage_users)
train_user_names,test_user_names =  Split_sequence(list_of_files,train_percentage_users)

#Training a Global Model
print('Start model training')

vocabulary_size = len(tokenizer.word_counts)+1

#Build a global Model
global_model = build_model(vocabulary_size=vocabulary_size)
# Number of users the global model has been trained on so far.
i=0
for user_locations in train_locations:

  #Preprocessing
  encoded_locations,_ = locations_to_integers(user_locations,tokenizer)
  # A sequence that is not longer than the window contains no complete
  # (input window, target) pair, so there is nothing to train on.
  if len(encoded_locations) <= window:
    continue
  X,y = build_X_Y(encoded_locations,window)
  X_train,y_train= integer_to_vector_1(X, y,vocabulary_size,embedding)

  #Train the RNN model; the same model keeps learning from one user to the next.
  # No validation_data: validating on the training samples themselves only
  # added a second pass per epoch whose result was never used.
  global_model.fit(X_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=nb_epochs,
                  verbose=0
                  )

  i=i+1
  print('Model trained on {0} users'.format(i))

print('End model training')

Start model training
Model trained on 1 users
Model trained on 2 users
Model trained on 3 users
Model trained on 4 users
Model trained on 5 users
Model trained on 6 users
Model trained on 7 users
Model trained on 8 users
Model trained on 9 users
Model trained on 10 users
Model trained on 11 users
Model trained on 12 users
Model trained on 13 users
Model trained on 14 users
Model trained on 15 users
Model trained on 16 users
Model trained on 17 users
Model trained on 18 users
Model trained on 19 users
Model trained on 20 users
Model trained on 21 users
Model trained on 22 users
Model trained on 23 users
Model trained on 24 users
Model trained on 25 users
Model trained on 26 users
Model trained on 27 users
Model trained on 28 users
Model trained on 29 users
Model trained on 30 users
Model trained on 31 users
Model trained on 32 users
Model trained on 33 users
Model trained on 34 users
Model trained on 35 users
Model trained on 36 users
Model trained on 37 users
Model trained on 38 users


In [None]:
# Scenario 2 evaluation: score the global model on every held-out user whose
# sequence is long enough.
Accuracies=[]
Corrects=[]
Incorrects=[]
Undefined=[]
Shortdays=[]
# Names of the users actually evaluated, index-aligned with the result lists above.
evaluated_users=[]

# The tokenizer does not change inside the loop, so compute this once.
vocabulary_size = len(tokenizer.word_counts)+1

# Iterate over names and sequences together: a separate counter that only
# advances for non-skipped users drifts away from the real user name.
for user_name,locations in zip(test_user_names,test_locations):

    sequence_lenght = len(locations)
    #Ignore sequence if the lenght is less than threshold_lenght
    if(sequence_lenght <threshold_lenght):
        continue

    #Preprocessing
    encoded_locations,_ = locations_to_integers(locations,tokenizer)

    #Build the test samples
    X,y = build_X_Y(encoded_locations,window)
    X_test, y_test = integer_to_vector_1(X, y,vocabulary_size,embedding)

    #Test the Model
    loss,user_accuracy = global_model.evaluate(X_test,y_test)
    print(loss)
    # Counts are derived from the number of evaluated samples, which is what
    # the accuracy refers to.
    nb_test_samples = len(y_test)
    user_correct = int(round(user_accuracy*nb_test_samples))
    user_incorrect = nb_test_samples - user_correct

    #Collect results
    evaluated_users.append(user_name)
    Corrects.append(user_correct)
    Incorrects.append(user_incorrect)
    Accuracies.append(user_accuracy)

    #Display results
    print('User : ',user_name)
    print('User lenght : ',len(locations))
    print('Test samples : ',nb_test_samples)
    print('Correct : ',user_correct )
    print('Incorrect : ',user_incorrect)
    print('Accuracy : ',user_accuracy)
    print("======================================================================================================")

#Save final results in csv file where the name contains information about the test (w,method ....). The name of file is : Configuration.csv
# Use the names collected in this cell rather than a variable left over from Scenario 1.
save_results_csv(test_configuration,evaluated_users,Corrects,Incorrects,Accuracies)

print('Global results : ')
print("======================================================================================================")
print('Corrects : ',sum(Corrects))
print('Incorrects : ',sum(Incorrects))
if Accuracies:
    print('Average Acc : ',sum(Accuracies)/len(Accuracies))
else:
    # No test user reached threshold_lenght: avoid dividing by zero.
    print('Average Acc : ','n/a (no user evaluated)')

4.337094783782959
User :  908.mv
User lenght :  2090
Correct :  192
Incorrect :  1898
Accuracy :  0.09186603128910065
5.050904273986816
User :  840.mv
User lenght :  3085
Correct :  391
Incorrect :  2694
Accuracy :  0.1270664483308792
3.987807035446167
User :  737.mv
User lenght :  5063
Correct :  426
Incorrect :  4637
Accuracy :  0.08413983881473541
3.5764760971069336
User :  792.mv
User lenght :  3874
Correct :  57
Incorrect :  3817
Accuracy :  0.014713474549353123
4.070503234863281
User :  852.mv
User lenght :  3551
Correct :  763
Incorrect :  2788
Accuracy :  0.21486905217170715
11.255861282348633
User :  870.mv
User lenght :  1103
Correct :  23
Incorrect :  1080
Accuracy :  0.02175883948802948
5.749634742736816
User :  922.mv
User lenght :  1975
Correct :  430
Incorrect :  1545
Accuracy :  0.21772152185440063
2.315044641494751
User :  828.mv
User lenght :  2159
Correct :  895
Incorrect :  1264
Accuracy :  0.41500693559646606
5.122743129730225
User :  974.mv
User lenght :  1623
Cor

In [None]:
# Sanity check: number of user sequences in the test split.
print(len(test_locations))

500


In [None]:
# Debug: evaluate the global model on whatever X_test / y_test currently hold in the kernel; the result is [loss, accuracy].
print(global_model.evaluate(X_test,y_test))

[4.478689670562744, 0.15000000596046448]


In [None]:
# Debug: inspect the shape of the current X_test array.
X_test.shape

(2020, 8, 556)

In [None]:
# Debug experiment: set every value at index [0, 5] of X_test to 1 (a whole vector, since X_test is 3-D in the recorded run).
# NOTE(review): this modifies X_test in place, so any cell run afterwards no longer sees the original test data.
X_test[0,5]=1

In [None]:
# Debug: evaluate again on the current X_test / y_test to compare with the earlier evaluation.
print(global_model.evaluate(X_test,y_test))

[4.744957447052002, 0.15000000596046448]


In [None]:
# Debug: print the first five entries along the first two axes of X_test.
print(X_test[0:5,0:5])

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [None]:
# Debug: print the first five positions of the first five integer-coded windows in X.
print(X[0:5,0:5])

[[24 36  7 24 36]
 [36  7 24 36  7]
 [ 7 24 36  7 24]
 [24 36  7 24 36]
 [36  7 24 36 24]]


In [None]:
# Debug experiment: evaluate the model on a constant input filled with 100.
# NOTE(review): this raised ValueError in the recorded run. The array is 2-D with shape (4257, 8), whereas without
# embedding the model is built with input_shape=(window, vocabulary_size); its row count may also differ from
# y_test - confirm which mismatch triggers the error, or remove this cell so the notebook runs top to bottom.
print(global_model.evaluate(np.zeros((4257,8))+100,y_test))

ValueError: ignored

In [None]:
# Debug: print the model object; in the recorded run this shows only its default repr, not the layer structure.
print(global_model)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7f55fc0f8a20>
