In [1]:
import numpy as np

# ----------------------------
# Step 0: Build a synthetic dataset
# ----------------------------
# Layout: 1000 rows x 5 columns
#   column 0    -> integer category code drawn from {0, 1, 2}
#   columns 1-4 -> standard-normal noise scaled by 100, with NaNs sprinkled in
# The legacy global RNG is seeded once; the draw order below (randint, randn,
# uniform) must stay fixed for the values to be reproducible.
np.random.seed(42)

categories = np.random.randint(0, 3, size=(1000, 1))
numeric_data = 100 * np.random.randn(1000, 4)

# Each numeric entry is independently blanked out when its uniform draw is < 0.1.
mask = np.random.random_sample(numeric_data.shape) < 0.1
numeric_data = np.where(mask, np.nan, numeric_data)

# Glue the category column onto the left of the numeric block; the integer
# codes are promoted to float in the combined array.
data = np.concatenate([categories, numeric_data], axis=1)

In [4]:
# ----------------------------
# Step 1: Inspect data
# ----------------------------
# Report the array dimensions, then preview the leading five rows.
print("Shape:", np.shape(data))
print("First 5 rows:\n", data[:5, :])

Shape: (1000, 5)
First 5 rows:
 [[   2.            4.85216279  -83.09501164   27.04568258   -5.02381094]
 [   0.          -23.89480469  -90.7563662   -57.67713306   75.53912258]
 [   2.                   nan  -97.75552448    9.93323054   75.13871234]
 [   2.         -166.94052811   54.33601924  -66.26237589   57.05986686]
 [   0.          -76.32591565           nan -162.75424379    4.80849467]]


In [5]:

# ----------------------------
# Step 2: Handle missing values
# ----------------------------
# Mean imputation: every NaN is replaced by the mean of its own column,
# where the mean is computed over that column's non-NaN entries.
col_means = np.nanmean(data, axis=0)

# Work on a copy so the raw `data` array is left untouched.
data_no_nan = data.copy()
nan_rows, nan_cols = np.nonzero(np.isnan(data))
data_no_nan[nan_rows, nan_cols] = col_means[nan_cols]


In [6]:

# ----------------------------
# Step 3: Filter rows where column 2 > 50
# ----------------------------
# Build the boolean row selector first, then apply it; rows whose column-2
# value is exactly 50 or lower are dropped.
keep_rows = data_no_nan[:, 2] > 50
filtered = data_no_nan[keep_rows]


In [7]:

# ----------------------------
# Step 4: Add normalized column (column 1)
# ----------------------------
# Min-max scale column 1 into [0, 1] and append it as a new last column.
col1 = data_no_nan[:, 1]
col1_min = np.min(col1)
col1_range = np.max(col1) - col1_min

if col1_range == 0:
    # Constant column: (x - min) / 0 would yield NaN (0/0) plus a
    # RuntimeWarning, so map every entry to 0.0 instead.
    normalized_col = np.zeros_like(col1, dtype=float)
else:
    normalized_col = (col1 - col1_min) / col1_range

# data_aug has one more column than data_no_nan; the original columns are unchanged.
data_aug = np.column_stack((data_no_nan, normalized_col))


In [8]:

# ----------------------------
# Step 5: Convert numeric columns to integers
# ----------------------------
# The whole augmented array is cast to 32-bit integers, so fractional parts
# are discarded. NOTE: this includes the normalized column appended in the
# previous step -- its values lie in [0, 1], so after the cast it holds only
# 0s and 1s.
data_int = np.array(data_aug, dtype=np.int32)


In [9]:

# ----------------------------
# Step 6: Detect outliers (> 3 std from mean)
# ----------------------------
# Per-column statistics are taken over the imputed array (all columns,
# including the category column 0).
mean = data_no_nan.mean(axis=0)
std = data_no_nan.std(axis=0)

# Boolean array, same shape as data_no_nan: True where an entry lies more
# than three standard deviations away from its column mean.
deviation = np.abs(data_no_nan - mean)
outliers = deviation > 3 * std


In [10]:

# ----------------------------
# Step 7: Group by category and compute mean of column 1
# ----------------------------
# Column 0 holds the group key, column 1 the values being averaged.
cats, vals = data_no_nan[:, 0], data_no_nan[:, 1]

# np.unique yields each distinct category once, in ascending order.
for c in np.unique(cats):
    group_mean = np.mean(vals[cats == c])
    print(f"Category {c}: mean of col1 = {group_mean}")


Category 0.0: mean of col1 = 6.710573130294333
Category 1.0: mean of col1 = 0.17554729035816466
Category 2.0: mean of col1 = -1.2556649251435215


In [11]:

# ----------------------------
# Step 8: Remove duplicate rows
# ----------------------------
# With axis=0 each row is treated as one item. Note that np.unique returns
# its result sorted, so the original row order is not preserved.
unique_rows = np.unique(
    data_no_nan,
    return_index=False,
    return_inverse=False,
    return_counts=False,
    axis=0,
)


In [13]:
# ----------------------------
# Step 9: Reshape data into batches of 100 rows
# ----------------------------
batch_size = 100
n_rows, n_cols = data_no_nan.shape

# Only whole batches are kept: any trailing rows that do not fill a complete
# batch are left out of `batches`.
num_batches = n_rows // batch_size
full_batch_rows = data_no_nan[: num_batches * batch_size]

# Resulting shape: (num_batches, batch_size, n_cols).
batches = full_batch_rows.reshape((num_batches, batch_size, n_cols))


In [14]:
# ----------------------------
# Step 10: Vectorized row-wise sum
# ----------------------------
# Summing along axis 1 collapses the columns, giving one total per row
# (the category code in column 0 is part of each total).
row_sums = data_no_nan.sum(axis=1)

print("Example row sums:", row_sums[0:5])

Example row sums: [ -54.22097721  -96.78918137   -8.64465683 -119.80701791 -234.39756965]
