In [None]:
import numpy as np

# Fixed-width unicode ('U10'): every element reserves room for 10 code points,
# so the 13-character second word is silently cut down to 10 characters.
str_data = np.array(["apple", "banadfsd324na", "cherry"], dtype='U10')

fruit_words = ["apple", "banana", "cherry"]

# Fixed-width byte strings ('S10'): one byte per character, 10 bytes per element.
bytes_data = np.array(fruit_words, dtype='S10')

# Object dtype: the array stores references to ordinary Python str objects.
obj_data = np.array(fruit_words, dtype=object)

# Per-element storage size (in bytes) for each of the three representations.
str_itemsize, bytes_itemsize, obj_itemsize = (
    column.dtype.itemsize for column in (str_data, bytes_data, obj_data)
)

print(str_data)
for size in (str_itemsize, bytes_itemsize, obj_itemsize):
    print(size)

['apple' 'banadfsd32' 'cherry']
40
10
8


In [None]:
import pandas as pd


raw_data = ["vector", "matrix", None, "tensor"]

# Pin the "legacy" column to object dtype explicitly instead of relying on
# inference: newer pandas releases infer a dedicated string dtype for lists of
# strings and store the missing entry as NaN, which would hide the None that
# this comparison is meant to show.
df = pd.DataFrame({"legacy": pd.Series(raw_data, dtype=object)})

# The nullable "string" extension dtype represents the missing entry as pd.NA.
df["modern"] = pd.Series(raw_data, dtype="string")

# Row 2 is the missing entry in both columns.
legacy_null = df.loc[2, "legacy"]
modern_null = df.loc[2, "modern"]

legacy_type = type(legacy_null)
modern_type = type(modern_null)

print(df.dtypes)
print(legacy_type)
print(modern_type)

legacy            object
modern    string[python]
dtype: object
<class 'NoneType'>
<class 'pandas._libs.missing.NAType'>


In [5]:
import numpy as np
import pandas as pd

arr = np.array(["  vectorization  ", "ALGORITHM", "Machine Learning"], dtype="U")

# NumPy: element-wise clean-up, one step at a time
# (trim whitespace -> lower-case -> spaces to underscores).
stripped = np.char.strip(arr)
lowered = np.char.lower(stripped)
replaced = np.char.replace(lowered, " ", "_")

# pandas: the same kind of vectorized work through the .str accessor.
s = pd.Series(["user_123", "admin_456", "guest_789"], dtype="string")

contains_admin = s.str.contains("admin")
split_data = s.str.split(pat="_", expand=True)  # one column per piece
joined_data = s.str.cat(sep=" | ")              # whole Series -> single string

for result in (
    replaced,
    contains_admin.to_numpy(),
    split_data.to_numpy(),
    joined_data,
):
    print(result)

['vectorization' 'algorithm' 'machine_learning']
[False  True False]
[['user' '123']
 ['admin' '456']
 ['guest' '789']]
user_123 | admin_456 | guest_789


# Date formatting

In [None]:
import numpy as np
import pandas as pd

today = pd.Timestamp.today()

# ISO date strings for one year ago, 100 days ago, and today.
day_offsets = (365, 100, 0)
date_strings = [
    (today - pd.Timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in day_offsets
]

# Same strings parsed twice: NumPy at day resolution, pandas as a DatetimeIndex.
np_dates = np.array(date_strings, dtype="datetime64[D]")
pd_dates = pd.to_datetime(date_strings)

# Each element uses a different layout; format="mixed" parses them one by one,
# and dayfirst=True makes "31-12-2025" read as day-month-year.
mixed_inputs = ["2025/12/31", "31-12-2025", "2025.12.31"]
mixed_dates = pd.to_datetime(mixed_inputs, format="mixed", dayfirst=True)

# Calendar components are attributes of the DatetimeIndex itself.
years, months, days = pd_dates.year, pd_dates.month, pd_dates.day

# strftime is called directly on the DatetimeIndex (no .dt accessor here).
formatted_strings = pd_dates.strftime("%B %d, %Y")

for result in (np_dates, pd_dates, years.to_numpy(), formatted_strings.to_numpy()):
    print(result)

['2024-12-31' '2025-09-22' '2025-12-31']
DatetimeIndex(['2024-12-31', '2025-09-22', '2025-12-31'], dtype='datetime64[ns]', freq=None)
[2024 2025 2025]
['December 31, 2024' 'September 22, 2025' 'December 31, 2025']


# pandas categorical


In [None]:
import pandas as pd
import numpy as np

raw_data = ["Medium", "Small", "Large", "Medium", "Small"]

# Unordered categorical: categories are derived from the data.
cat_series = pd.Series(raw_data, dtype="category")

# Ordered categorical: the explicit category list defines Small < Medium < Large.
size_order = ["Small", "Medium", "Large"]
ordered_cat = pd.Categorical(raw_data, categories=size_order, ordered=True)
df = pd.DataFrame({"Size": ordered_cat})

# Integer code per row, and the category labels those codes index into.
size_accessor = df["Size"].cat
codes = size_accessor.codes
categories = size_accessor.categories

# Sorting follows the declared category order rather than alphabetical order.
sorted_df = df.sort_values(by="Size")

print(df,"\n")
print(codes.to_numpy(),"\n")
print(categories.to_numpy(),"\n")
print(sorted_df)

     Size
0  Medium
1   Small
2   Large
3  Medium
4   Small 

[1 0 2 1 0] 

['Small' 'Medium' 'Large'] 

     Size
1   Small
4   Small
0  Medium
3  Medium
2   Large


# Label Encoders

In [11]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

train_labels = ["cat", "dog", "bird", "dog", "cat"]
test_labels = ["dog", "cat", "dog"]

# Learn the label vocabulary from the training labels only.
encoder = LabelEncoder()
encoder.fit(train_labels)

# Encode both splits with the same fitted vocabulary.
train_encoded, test_encoded = (
    encoder.transform(labels) for labels in (train_labels, test_labels)
)

# Decode integer codes back into their string labels.
original_labels = encoder.inverse_transform([0, 1, 2])

# Label -> integer code lookup built from the fitted classes.
known_classes = encoder.classes_
mapping = {
    label: code
    for label, code in zip(known_classes, encoder.transform(known_classes))
}

for result in (known_classes, train_encoded, test_encoded, original_labels, mapping):
    print(result)

['bird' 'cat' 'dog']
[1 2 0 2 1]
[2 1 2]
['bird' 'cat' 'dog']
{np.str_('bird'): np.int64(0), np.str_('cat'): np.int64(1), np.str_('dog'): np.int64(2)}


# Factorization Machine (FM) using a vectorized approach.

In [13]:
import numpy as np

def factorization_machine_layer(x, w0, w, v):
    """Evaluate a second-order factorization machine on dense inputs.

    Computes ``w0 + x @ w + 0.5 * sum_f((x @ v)_f**2 - (x**2 @ v**2)_f)``.
    The last term equals the sum over feature pairs ``i < j`` of
    ``<v_i, v_j> * x_i * x_j`` without building the pairwise matrix.

    Parameters
    ----------
    x : array_like, shape (..., n_features)
        Feature values. A single 1-D sample or any number of leading batch
        dimensions is accepted.
    w0 : float
        Global bias.
    w : array_like, shape (n_features, 1)
        Linear weights.
    v : array_like, shape (n_features, k_factors)
        Latent factor vectors, one row per feature.

    Returns
    -------
    numpy.ndarray, shape (..., 1)
        One prediction per sample; ``(n_batch, 1)`` for 2-D ``x`` and
        ``(1,)`` for 1-D ``x``.
    """
    linear_terms = w0 + np.matmul(x, w)

    square_of_sum = np.square(np.matmul(x, v))
    sum_of_square = np.matmul(np.square(x), np.square(v))

    # Reduce over the factor axis, which is always the last one. For 2-D input
    # this is the same as axis=1; it also covers 1-D and N-D input.
    interaction_terms = 0.5 * np.sum(
        square_of_sum - sum_of_square, axis=-1, keepdims=True
    )

    return linear_terms + interaction_terms

# Problem size: samples per batch, input features, latent factors per feature.
n_batch, n_features, k_factors = 2, 5, 3

# Random inputs and parameters, drawn in the order x, w, v.
x = np.random.rand(n_batch, n_features)
w0 = 0.5
w = np.random.rand(n_features, 1)
v = np.random.rand(n_features, k_factors)

output = factorization_machine_layer(x=x, w0=w0, w=w, v=v)

for result in (output.shape, output):
    print(result)

(2, 1)
[[5.18194409]
 [3.25882855]]


# Regex

In [14]:
import pandas as pd
import re

data = pd.Series(
    [
        "Contact: admin@site.com, Cell: 555-123-4567",
        "Reach me at user.name123@domain.org",
        "Invalid email: test@@com, Phone: 123-456-7890",
        "Price is $150.50 for 2 items",
    ],
    dtype="string",
)

# Patterns used below: an e-mail address, a dollar amount (digits captured),
# and a dash-separated 3-3-4 digit phone number.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
price_pattern = r'\$(\d+\.\d+)'
phone_pattern = r'\d{3}-\d{3}-\d{4}'

# Boolean flag per row: does the text contain something shaped like an e-mail?
has_valid_email = data.str.contains(email_pattern, regex=True)

# One column per capture group; rows without a match are missing.
prices = data.str.extract(price_pattern)

# Mask every phone number with a fixed placeholder.
anonymized_data = data.str.replace(
    phone_pattern,
    "XXX-XXX-XXXX",
    regex=True,
)

for result in (has_valid_email, prices, anonymized_data):
    print(result.to_numpy())

[ True  True False False]
[[<NA>]
 [<NA>]
 [<NA>]
 ['150.50']]
['Contact: admin@site.com, Cell: XXX-XXX-XXXX'
 'Reach me at user.name123@domain.org'
 'Invalid email: test@@com, Phone: XXX-XXX-XXXX'
 'Price is $150.50 for 2 items']


# Serialize and deserialize

In [16]:
import json
import pickle

records = [
    {"id": 101, "name": "vector_processor", "active": True},
    {"id": 102, "name": "matrix_engine", "active": False},
    {"id": 103, "name": "tensor_core", "active": True},
]

# JSON round trip: objects -> text -> UTF-8 bytes, then back again.
json_string = json.dumps(records)
json_bytes = bytes(json_string, "utf-8")

deserialized_json_string = str(json_bytes, "utf-8")
reconstructed_from_json = json.loads(deserialized_json_string)

# pickle round trip: objects -> binary blob -> objects.
# Only unpickle data from a trusted source; loading can execute arbitrary code.
pickle_bytes = pickle.dumps(records)
reconstructed_from_pickle = pickle.loads(pickle_bytes)

print(json_bytes)
print(reconstructed_from_json == records)
print(pickle_bytes[:20])  # first 20 bytes of the binary payload
print(reconstructed_from_pickle == records)

b'[{"id": 101, "name": "vector_processor", "active": true}, {"id": 102, "name": "matrix_engine", "active": false}, {"id": 103, "name": "tensor_core", "active": true}]'
True
b'\x80\x05\x95l\x00\x00\x00\x00\x00\x00\x00]\x94(}\x94(\x8c\x02i'
True
