In [2]:
import pandas as pd

# Viewing log: each row is one (user, movie) viewing and its duration in hours.
df = pd.DataFrame(
    {
        "User": [1, 3, 2, 1, 2, 3, 1],
        "Movie": [3, 2, 3, 2, 1, 3, 1],
        "Viewing Hours": [0.6, 1.3, 0.6, 0.7, 0.1, 0.5, 0.9],
    }
)
print(df.to_string(index=False))

# Per-user totals ("sum") and number of logged viewings ("count").
hours_by_user = df.groupby("User")["Viewing Hours"]
user_stats = pd.DataFrame(
    {"sum": hours_by_user.sum(), "count": hours_by_user.count()}
)

# Averages taken across users (every user carries equal weight).
avg_total_hours = user_stats.loc[:, "sum"].mean()
avg_movies_watched = user_stats.loc[:, "count"].mean()

# Mean hours per viewing for each user: total hours over number of rows.
user_stats["avg_per_movie"] = user_stats["sum"].div(user_stats["count"])

for result in (avg_total_hours, avg_movies_watched, user_stats["avg_per_movie"]):
    print(result)

 User  Movie  Viewing Hours
    1      3            0.6
    3      2            1.3
    2      3            0.6
    1      2            0.7
    2      1            0.1
    3      3            0.5
    1      1            0.9
1.5666666666666667
2.3333333333333335
User
1    0.733333
2    0.350000
3    0.900000
Name: avg_per_movie, dtype: float64


# 9.1 Pandas Groupwise Operations
# 9.2 Reshaping and Windowing of Dense Tensors

In [3]:
import numpy as np

data = np.arange(10)
window_size = 3

item_size = data.itemsize

# as_strided performs no bounds or layout checks: the (item_size, item_size)
# strides below are only valid when neighbouring elements of `data` sit exactly
# one item apart, i.e. `data` is a contiguous 1-D array.
assert data.strides == (item_size,), "expected a contiguous 1-D array"

# Row i starts i items into the buffer and column j is j items further on, so
# row i aliases data[i : i + window_size] without copying anything.
# The rows overlap in memory, so the view is made read-only: writing through
# it would silently change several "windows" at once.
windowed_view = np.lib.stride_tricks.as_strided(
    data,
    shape=(len(data) - window_size + 1, window_size),
    strides=(item_size, item_size),
    writeable=False,
)

print("Original Data:", data)
print("Windowed View (Stride-based):\n", windowed_view)

# Mutating the base array is still allowed and is visible through the view.
data[0] = 99
print("\nUpdated View (Proves it is a view, not a copy):\n", windowed_view[0])

Original Data: [0 1 2 3 4 5 6 7 8 9]
Windowed View (Stride-based):
 [[0 1 2]
 [1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 6]
 [5 6 7]
 [6 7 8]
 [7 8 9]]

Updated View (Proves it is a view, not a copy):
 [99  1  2]


# 9.4 Bucketizing Values
# 9.5 Segment-wise Aggregation
# 9.6 Case Study: EmbeddingBag

In [4]:
import numpy as np
import pandas as pd

# --- 9.4 Bucketizing Values ---
ages = np.array([12, 25, 38, 45, 18, 62])
bins = [0, 18, 35, 60, 100]
# Assign each age to a bucket index. With the default right=False, index i
# means bins[i-1] <= age < bins[i], so an age equal to an edge (18) lands in
# the bucket that starts at that edge.
bucket_indices = np.digitize(ages, bins)

# --- 9.5 Segment-wise Aggregation ---
values = np.array([10, 20, 30, 40, 50])
segments = np.array([0, 0, 1, 1, 1])  # 2 in group 0, 3 in group 1
# Sum values belonging to each segment: values[i] is accumulated into slot
# segments[i]; passing weights makes the result floating point.
segment_sums = np.bincount(segments, weights=values)

# --- 9.6 EmbeddingBag Simulation ---
# 3 samples, variable lengths, flattened into one array
flat_indices = np.array([1, 4, 2, 0, 5, 3])
offsets = np.array([0, 2, 5])  # Start of each "bag"
# Seeded generator so the embedding table is identical on every
# Restart Kernel -> Run All.
rng = np.random.default_rng(42)
weights = rng.random((10, 4))  # 10 possible items, 4-dim embedding

def embedding_bag_sim(indices, offsets, weight_table):
    """Mean-pool embedding rows per bag, in the style of an EmbeddingBag.

    Parameters
    ----------
    indices : array-like of int
        Row ids into ``weight_table`` for all bags, concatenated.
    offsets : array-like of int
        Start position of each bag inside ``indices``. Only ``offsets[1:]``
        is used as split points, so the first bag always begins at position 0.
    weight_table : array-like
        Embedding table; the first axis is indexed by ``indices``.

    Returns
    -------
    numpy.ndarray
        One row per bag holding the mean of that bag's embedding rows.
        A bag with no indices yields a row of zeros instead of NaN.
    """
    weight_table = np.asarray(weight_table)
    # Split flat indices into bags based on offsets
    bags = np.split(np.asarray(indices), np.asarray(offsets)[1:])

    # np.mean keeps floating/complex dtypes and promotes everything else to
    # float64; the zero filler uses the same dtype so the stacked result's
    # dtype does not depend on whether an empty bag is present.
    if np.issubdtype(weight_table.dtype, np.inexact):
        out_dtype = weight_table.dtype
    else:
        out_dtype = np.float64
    empty_bag_value = np.zeros(weight_table.shape[1:], dtype=out_dtype)

    # For each bag, lookup embeddings and mean-reduce. Empty bags are filled
    # explicitly: the mean of an empty slice would be NaN plus a RuntimeWarning.
    output = [
        weight_table[bag].mean(axis=0) if bag.size else empty_bag_value
        for bag in bags
    ]
    return np.array(output)

# Pool the three bags defined above into one embedding vector each.
bag_result = embedding_bag_sim(
    indices=flat_indices,
    offsets=offsets,
    weight_table=weights,
)

# Report one line per section; only the shape of the pooled embeddings is shown.
for label, shown in (
    ("9.4 Bucket Indices:", bucket_indices),
    ("9.5 Segment Sums:", segment_sums),
    ("9.6 EmbeddingBag Shape:", bag_result.shape),
):
    print(label, shown)

9.4 Bucket Indices: [1 2 3 3 2 4]
9.5 Segment Sums: [ 30. 120.]
9.6 EmbeddingBag Shape: (3, 4)


In [5]:
import numpy as np

# Demo inputs: a small integer array with a duplicate, plus two parallel
# string arrays used as lexsort keys.
arr = np.array([40, 10, 30, 10, 20])
first_names = np.array(['Bob', 'Alice', 'Bob'])
last_names = np.array(['Smith', 'Jones', 'Williams'])

# Sorted copy of the values, and the permutation that sorts them.
sorted_arr = arr.copy()
sorted_arr.sort()
sort_indices = arr.argsort()

# lexsort treats the LAST key as the primary one: order by last name,
# breaking ties with first name.
name_keys = (first_names, last_names)
lex_indices = np.lexsort(name_keys)

# Distinct values together with the position of each first occurrence.
unique_vals, unique_indices = np.unique(arr, return_index=True)

# Extremes: locate the position first, then read the value at it.
max_idx = arr.argmax()
max_val = arr[max_idx]

min_idx = arr.argmin()
min_val = arr[min_idx]

# Top-k without a full sort: after argpartition the k largest values occupy
# the last k slots (their order within those slots is unspecified).
k = 2
top_k_idx = arr.argpartition(-k)[-k:]
top_k_vals = arr.take(top_k_idx)

# Partition around position 2: the element there is in its sorted place,
# smaller-or-equal values come before it, larger-or-equal values after.
partitioned_arr = arr.copy()
partitioned_arr.partition(2)
part_indices = arr.argpartition(2)

report = [
    ("Sort:", sorted_arr),
    ("Argsort:", sort_indices),
    ("Lexsort:", lex_indices),
    ("Unique:", unique_vals),
    ("Max/Argmax:", max_val, "@", max_idx),
    ("Min/Argmin:", min_val, "@", min_idx),
    ("Top-K:", top_k_vals),
    ("Partition:", partitioned_arr),
]
for row in report:
    print(*row)

Sort: [10 10 20 30 40]
Argsort: [1 3 4 2 0]
Lexsort: [1 0 2]
Unique: [10 20 30 40]
Max/Argmax: 40 @ 0
Min/Argmin: 10 @ 1
Top-K: [30 40]
Partition: [10 10 20 40 30]
