In [None]:
from pathlib import Path
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [15]:
# NumPy Operations

# Create a 1-D array with np.arange (the stop value, 6, is exclusive)
arr = np.arange(1, 6)  # [1 2 3 4 5]

# Elementwise operations: each expression builds a new array; arr itself is not modified
print("arr:", arr)
print("arr + 5:", arr + 5)            # add scalar
print("arr * 2:", arr * 2)            # multiply scalar
print("arr ** 2:", arr ** 2)          # square
print("np.sin(arr):", np.sin(arr))    # elementwise ufunc

# Elementwise addition with a second array of the same length.
# Both operands already have the same shape here; the scalar examples above
# are the ones that rely on broadcasting.
other = np.array([10, 20, 30, 40, 50])
print("arr + other:", arr + other)

# Compare loop vs vectorized execution.
# Seeded generator so the benchmark input is identical on every run.
rng = np.random.default_rng(0)
n = 5_000_000  # number of random samples used as benchmark input
x = rng.random(n)  # array of n random floats

def python_loop(x_list):
    """Sum ``v*v + 2*v + 1`` over ``x_list`` with a plain Python ``for`` loop.

    The accumulator starts at ``0.0``, so the result is a float and an empty
    input yields ``0.0``. Terms are added one at a time, in iteration order.
    """
    acc = 0.0
    for value in x_list:
        term = value*value + 2*value + 1
        acc = acc + term
    return acc

def numpy_vectorized(x):
    """Evaluate ``x*x + 2*x + 1`` elementwise and return its total via ``np.sum``."""
    return np.sum(x*x + 2*x + 1)

# Convert to a Python list before starting the clock, so the loop timing
# measures only the loop itself and not the ndarray -> list conversion.
x_list = x.tolist()

# Time loop
t0 = perf_counter()
sum_loop = python_loop(x_list)
t1 = perf_counter()

# Time NumPy vectorized
t2 = perf_counter()
sum_vec = numpy_vectorized(x)
t3 = perf_counter()

# Sanity check: both implementations compute the same quantity. The results
# are compared with a tolerance because the two may add the terms in a
# different order, which can change the last few floating-point bits.
assert np.isclose(sum_loop, sum_vec), (sum_loop, sum_vec)

print(f"loop: {t1 - t0:.3f}s")
print(f"NumPy vectorized: {t3 - t2:.3f}s")


arr: [1 2 3 4 5]
arr + 5: [ 6  7  8  9 10]
arr * 2: [ 2  4  6  8 10]
arr ** 2: [ 1  4  9 16 25]
np.sin(arr): [ 0.84147098  0.90929743  0.14112001 -0.7568025  -0.95892427]
arr + other: [11 22 33 44 55]
loop: 0.228s
NumPy vectorized: 0.010s


In [16]:
# Dataset Loading

df = pd.read_csv("data/starter_data.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

# Preview the first rows
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes
None
  category  value        date
0        A     10  2025-08-01
1        B     15  2025-08-02
2        A     12  2025-08-03
3        B     18  2025-08-04
4        C     25  2025-08-05


In [23]:
# Summary Statistics

# Descriptive statistics as produced by DataFrame.describe()
summary_df = df.describe()
print(summary_df)

# Mean of the numeric columns within each category
group_df = (
    df
    .groupby("category")
    .mean(numeric_only=True)
)
print(group_df)

           value
count  10.000000
mean   17.600000
std     7.381659
min    10.000000
25%    12.250000
50%    14.500000
75%    23.250000
max    30.000000
              value
category           
A         11.500000
B         15.666667
C         27.666667


In [25]:
# Save Outputs

# Make sure the output directory exists so this cell also works on a fresh checkout
Path("data/processed").mkdir(parents=True, exist_ok=True)

# Save summary stats to CSV.
# The index of the describe() result holds the statistic names (count, mean,
# std, ...), so it has to be written out; with index=False the file would
# contain only unlabeled rows of numbers.
summary_df.to_csv("data/processed/summary.csv", index_label="statistic")

# Bonus: histogram of the "value" column, drawn on an explicit figure/axes
# so nothing depends on pyplot's implicit "current figure".
fig, ax = plt.subplots()
df["value"].hist(ax=ax)
ax.set_title("Histogram")
fig.savefig("data/processed/histogram.png")
plt.close(fig)  # release the figure once it has been written to disk


In [26]:
# Reusable Functions

def get_summary_stats(df):
    """Compute descriptive statistics for ``df``.

    Thin wrapper around ``df.describe()``; its result is returned unchanged.
    """
    stats = df.describe()
    return stats
