<a href="https://colab.research.google.com/github/Naii-the-Baf/colab/blob/main/Proy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import copy
import numpy as np
import pandas as pd
import random
import string

from sklearn.model_selection import train_test_split

In [None]:
#Custom hash function for testing.
def XORHash(key : str):
  """Return a toy hexadecimal digest of `key`.

  The key is encoded as UTF-8. For every byte position `pos` the byte is
  XOR-ed with the byte at index `-pos` and the byte at index `pos - 1`
  (Python negative indexing wraps to the end of the buffer). Each result is
  reduced modulo 16 and rendered as one lowercase hex digit, so the digest
  has one character per encoded byte. An empty key yields an empty string.
  """
  data = key.encode("UTF-8")
  hex_digits = string.digits + "abcdef"
  base = len(hex_digits)
  mixed = [data[pos] ^ data[-pos] ^ data[pos - 1] for pos in range(len(data))]
  return "".join(hex_digits[value % base] for value in mixed)

In [None]:
#Hexadecimal alphabet shared by the encoding/decoding cells further down.
alnum_str = [symbol for symbol in string.digits + "abcdef"]
print(alnum_str)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']


In [None]:
#Collision in the hash function: the two different keys "e3" and "ec"
#produce the same digest, so the hash cannot be inverted uniquely.
print(XORHash("e3"), XORHash("ec"))

35 35


In [None]:
def getRandomString (length = 1, alphabet = string.ascii_lowercase):
  """Return a string of `length` symbols drawn from `alphabet`.

  Symbols are picked independently with `random.choice`, one call per
  position, so the result depends on the state of the global `random`
  generator. A `length` of zero (or less) yields an empty string.
  """
  picked = []
  for _ in range(length):
    picked.append(random.choice(alphabet))
  return "".join(picked)

In [None]:
#This creates (or overwrites) a file containing a hash table.
def writeDataset(filename = "./hashdb.csv", length = 1000, key_length = 5):
  """Write `length` random "hash,key" rows to `filename`.

  Each row holds the XORHash digest of a random hexadecimal key followed by
  the key itself, separated by a comma. No header row is written and an
  existing file is overwritten.

  Parameters:
    filename: path of the CSV file to create.
    length: number of rows to generate.
    key_length: number of characters in every random key (default 5).
  """
  alnum_str = list(string.digits + "abcdef")

  # The context manager closes the file even if hashing or writing raises.
  with open(filename, "w") as csvfile:
    for _ in range(length):
      key = getRandomString(key_length, alnum_str)
      csvfile.write(f"{XORHash(key)},{key}\n")

In [None]:
#Seed the generator so the same dataset is produced on every fresh run.
random.seed(42)
writeDataset(length=300000)

In [None]:
#Converts a string into an array of numerical values.
def textToFeatures(string, alphabet):
  """Map every character of `string` to its position in `alphabet`.

  Returns a one-dimensional int32 NumPy array with one entry per character.
  A character that is absent from `alphabet` makes `alphabet.index` raise
  ValueError.
  """
  positions = [alphabet.index(symbol) for symbol in string]
  return np.array(positions, dtype=np.int32)

In [None]:
#Converts an array of numerical values into a string.
def valuesToText(array, alphabet):
  """Decode a sequence of alphabet positions back into text.

  Every element of `array` is used as an index into `alphabet`; the selected
  symbols are concatenated in order. An empty `array` gives an empty string.
  """
  pieces = []
  for position in array:
    pieces.append(alphabet[position])
  return "".join(pieces)

In [None]:
#Looks for a dataset and loads it. Returns a pandas DataFrame.
#The read file is expected to be a hash table with two columns.
def readDataset(filename = "./hashdb.csv"):
  """Load a header-less CSV hash table into a DataFrame.

  All values are read as strings, so digests and keys keep their leading
  zeros. Because the file has no header row, the columns are labelled with
  the integers 0, 1, ...
  """
  table = pd.read_csv(filename, dtype=str, header=None)
  return table

In [None]:
#Load the generated hash table: column 0 holds the hashes, column 1 the keys.
dataset = readDataset()
#Length of the first hash; later cells use it as the number of feature columns.
string_len = len(dataset[0][0])

In [None]:
#Remove duplicates: keep only the first row for each distinct hash (column 0)
#so every hash maps to a single key, then renumber the index from 0.
dataset = dataset[[0, 1]].drop_duplicates(subset=[0]).reset_index(drop=True)

In [None]:
#Display the deduplicated hash table.
dataset

Unnamed: 0,0,1
0,52454,cde5e
1,8943e,deb38
2,8c74a,bf548
3,8b14e,ab748
4,8c76c,fb368
...,...,...
83836,9367c,8b379
83837,8fb3a,6a938
83838,4a003,9770d
83839,44904,0090d


In [None]:
#Transforms every row to number and splits them into two dataframes.
def datasetTextToFeatures(df):
  """Encode the hash and key columns of `df` as numeric feature frames.

  Column 0 of `df` becomes `x` and column 1 becomes `y`. Every character is
  replaced by its position in the module-level `alnum_str` alphabet (via
  `textToFeatures`).

  Returns:
    A tuple `(x, y)` of float64 DataFrames that share `df.index` and have the
    columns `0 .. string_len - 1`, where `string_len` is a module-level value.
  """
  def encodeColumn(column):
    # Encode all rows first and build the frame in one call; assigning rows
    # one at a time through `.loc` is far slower on large tables.
    rows = [textToFeatures(text, alnum_str) for text in column]
    frame = pd.DataFrame(rows, index=df.index, dtype=float)
    # Pin the column labels to the expected feature width.
    return frame.reindex(columns=range(string_len))

  return encodeColumn(df[0]), encodeColumn(df[1])

In [None]:
#Hold out 20% of the rows; the fixed random_state makes the split reproducible.
feat_data, feat_test = train_test_split(dataset, test_size = 0.2, random_state = 42)

In [None]:
#Encode the training split: data_x comes from the hash column, data_y from the key column.
data_x, data_y = datasetTextToFeatures(feat_data)

In [None]:
#Encode the held-out split the same way: test_x from hashes, test_y from keys.
test_x, test_y = datasetTextToFeatures(feat_test)

In [None]:
import tensorflow as tf
from tensorflow.keras import models, layers, Input

In [None]:
#Loss function. Euclidean (L2) distance between target and predicted indices.
def lossFunc(y_true, y_pred):
  """Return the per-sample Euclidean distance between `y_true` and `y_pred`.

  The norm is taken over the last axis, so the result has one value per
  sample. This is not a Hamming distance: predictions are compared as
  continuous values and are not rounded to character indices first.

  Parameters:
    y_true: target values, same shape as `y_pred`.
    y_pred: model outputs.
  """
  y_pred = tf.convert_to_tensor(y_pred)
  # Subtraction requires both operands to share a dtype; labels may arrive
  # as integers or float64 while the predictions are float32.
  y_true = tf.cast(y_true, y_pred.dtype)
  return tf.norm(y_true - y_pred, axis=-1)

In [82]:
#Fully connected network mapping an encoded hash to an encoded key.
#Input and output widths follow string_len (one value per character).
model = models.Sequential([
    #shape must be a tuple; "(5)" without a trailing comma is just the integer 5.
    Input(shape=(string_len,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(string_len, activation='relu'),
])

In [83]:
#Compile with the custom loss defined above and the Adam optimizer,
#then print the layer/parameter summary.
model.compile(loss=lossFunc, optimizer='adam')
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 64)                384       
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 5)                 325       
                                                                 
Total params: 4869 (19.02 KB)
Trainable params: 4869 (19.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [84]:
#Train for up to 300 epochs; the held-out split is passed as validation data.
model.fit(x=data_x, y=data_y, epochs=300, validation_data=(test_x, test_y))

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.src.callbacks.History at 0x7b103fe7b790>

In [85]:
#Sanity check on a single sample: encode one hash, predict its key and print
#both the input and the decoded prediction.
#NOTE(review): despite its name, rand_hash is the first row of the dataset, not a random one.
rand_hash = pd.Series(textToFeatures(dataset[0][0], alnum_str))
pred = model.predict(x=[list(rand_hash)])
print(list(rand_hash), dataset[0][0])
#Round the outputs to the nearest alphabet index before decoding them to text.
#NOTE(review): a rounded value outside the alphabet range would raise IndexError in valuesToText.
print(np.round(pred), valuesToText(np.round(pred)[0].astype(int), alnum_str))

[5, 2, 4, 5, 4] 52454
[[ 6. 10.  7. 10.  9.]] 6a7a9
