# Loading and Exploring Data

In [1]:
import numpy as np

In [2]:
# Sales figures: one row per product, one column per quarter
sales = np.array((
    (250, 320, 410, 290),
    (120, 150, 205, 230),
    (315, 290, 285, 320),
))

In [3]:
np.shape(sales)   # (number of rows, number of columns)

(3, 4)

In [4]:
np.ndim(sales)    # How many axes the array has

2

In [5]:
np.size(sales)    # Element count across all axes

12

In [6]:
sales.dtype       # Data type of the array's elements

dtype('int64')

In [7]:
sales.mean()      # Average over every element of the array

np.float64(265.4166666666667)

In [8]:
sales.min()       # Smallest single value in the array

np.int64(120)

In [9]:
sales.max()       # Largest single value in the array

np.int64(410)

In [10]:
np.median(sales, axis=None)  # Median of all values (array treated as flattened)

np.float64(287.5)

# Calculating Statistics by Row/Column

In [11]:
# Total per product: add up each row, collapsing the quarter axis (axis=1)
product_totals = sales.sum(axis=1)
product_totals

array([1270,  705, 1210])

In [12]:
# Mean per product: average each row across its quarters (axis=1)
product_means = sales.mean(axis=1)
product_means

array([317.5 , 176.25, 302.5 ])

In [13]:
# Total per quarter: add up each column, collapsing the product axis (axis=0)
quarter_totals = sales.sum(axis=0)
quarter_totals

array([685, 760, 900, 840])

In [14]:
# Mean per quarter: average each column across the products (axis=0)
quarter_means = sales.mean(axis=0)
quarter_means

array([228.33333333, 253.33333333, 300.        , 280.        ])

In [15]:
# Row index of the product with the largest total
best_product = product_totals.argmax()
best_product

np.int64(0)

In [16]:
# Column index of the quarter with the largest total
best_quarter = quarter_totals.argmax()
best_quarter

np.int64(2)

# Data Filtering and Selection

In [17]:
# Boolean mask, same shape as sales: True wherever a value exceeds 300
high_sales = np.greater(sales, 300)
high_sales

array([[False,  True,  True, False],
       [False, False, False, False],
       [ True, False, False,  True]])

In [18]:
# Count high sales: the number of True entries in the high_sales mask
np.count_nonzero(high_sales)

4

In [19]:
# Pull out the values flagged by the mask, as a flat 1-D array in row-major order
np.extract(high_sales, sales)

array([320, 410, 315, 320])

In [20]:
# Per product (row): did any quarter exceed 300?
products_with_high = (sales > 300).any(axis=1)
products_with_high

array([ True, False,  True])

In [21]:
# Per quarter (column): did every product sell more than 200?
good_quarters = (sales > 200).all(axis=0)
good_quarters


array([False, False,  True,  True])

# Data Transformation

In [22]:
# Express sales in thousands: divide every value by 1000 (result is floating point)
sales_k = np.true_divide(sales, 1000)
sales_k

array([[0.25 , 0.32 , 0.41 , 0.29 ],
       [0.12 , 0.15 , 0.205, 0.23 ],
       [0.315, 0.29 , 0.285, 0.32 ]])

In [23]:
# Each value's share of the grand total, expressed as a percentage
sales_total = sales.sum()
pct_of_total = np.multiply(sales / sales_total, 100)
pct_of_total

array([[ 7.84929356, 10.04709576, 12.87284144,  9.10518053],
       [ 3.76766091,  4.70957614,  6.43642072,  7.22135008],
       [ 9.89010989,  9.10518053,  8.94819466, 10.04709576]])

In [24]:
# Calculate percent of product total for each quarter.
# keepdims=True keeps the row sums as a (3, 1) column so they broadcast
# across the quarter columns of `sales`.
# The column-shaped totals get their own name so that the 1-D `product_totals`
# computed earlier is not overwritten by an array of a different shape, which
# would change what earlier cells see if they are re-run.
product_totals_col = np.sum(sales, axis=1, keepdims=True)
pct_of_product = sales / product_totals_col * 100
pct_of_product


array([[19.68503937, 25.19685039, 32.28346457, 22.83464567],
       [17.0212766 , 21.27659574, 29.07801418, 32.62411348],
       [26.03305785, 23.96694215, 23.55371901, 26.44628099]])

In [25]:
# Quarter-to-quarter change for every product.
# Each column is subtracted from the column that follows it, so the result
# has one column fewer than `sales`.
quarter_growth = sales[:, 1:] - sales[:, :-1]
quarter_growth

array([[  70,   90, -120],
       [  30,   55,   25],
       [ -25,   -5,   35]])

In [26]:
# Express each change as a percentage of the preceding quarter's sales
quarter_growth_pct = np.multiply(quarter_growth / sales[:, :-1], 100)
quarter_growth_pct

array([[ 28.        ,  28.125     , -29.26829268],
       [ 25.        ,  36.66666667,  12.19512195],
       [ -7.93650794,  -1.72413793,  12.28070175]])

# Working with Missing Data

In [27]:
# Sales table with one missing (NaN) entry in each row
sales_with_missing = np.array((
    (250, 320, np.nan, 290),
    (120, np.nan, 205, 230),
    (315, 290, 285, np.nan),
))

In [28]:
# Check for missing values: boolean mask that is True wherever an entry is NaN
has_missing = np.isnan(sales_with_missing)
has_missing

array([[False, False,  True, False],
       [False,  True, False, False],
       [False, False, False,  True]])

In [29]:
# Number of NaN entries in the whole table, as a plain Python int
missing_count = int(np.isnan(sales_with_missing).sum())
missing_count

3

In [30]:
# Impute each gap with the mean of the non-missing values in the same row.
# keepdims=True keeps the means as a column so they broadcast along each row.
row_means = np.nanmean(sales_with_missing, axis=1, keepdims=True)
filled_by_product = np.where(
    ~np.isnan(sales_with_missing), sales_with_missing, row_means
)
filled_by_product

array([[250.        , 320.        , 286.66666667, 290.        ],
       [120.        , 185.        , 205.        , 230.        ],
       [315.        , 290.        , 285.        , 296.66666667]])

In [31]:
# Impute each gap with the mean of the non-missing values in the same column.
# The 1-D vector of column means broadcasts across the rows of the table.
col_means = np.nanmean(sales_with_missing, axis=0)
filled_by_quarter = np.where(
    ~np.isnan(sales_with_missing), sales_with_missing, col_means
)
filled_by_quarter

array([[250., 320., 245., 290.],
       [120., 305., 205., 230.],
       [315., 290., 285., 260.]])

# Time Series Analysis

In [32]:
# Day numbers 1..30 and the sales figure recorded for each day
days = np.arange(30) + 1
daily_sales = np.array([
    120, 135, 145, 125, 115, 90, 105,
    130, 145, 155, 140, 130, 110, 95,
    125, 140, 150, 160, 140, 120, 100,
    95, 130, 150, 165, 155, 140, 130,
    115, 110,
])

In [33]:
# Calculate moving average (7-day window)
def moving_average(data, window):
    """Return the simple (equally weighted) moving average of ``data``.

    Parameters
    ----------
    data : array_like
        One-dimensional sequence of numeric observations.
    window : int
        Number of consecutive observations averaged per output value.
        Must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(data) - window + 1`` whose ``i``-th entry is the
        mean of ``data[i:i + window]``. Empty when ``window`` is larger than
        the number of observations, because no complete window exists.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window!r}")

    values = np.asarray(data)
    # np.convolve swaps its operands when the kernel is longer than the data,
    # so mode='valid' would return window - len(data) + 1 meaningless numbers
    # instead of signalling that no full window fits.
    if window > values.size:
        return np.empty(0, dtype=float)

    # Equal weights summing to 1 turn the convolution into a running mean;
    # mode='valid' keeps only positions where the window fully overlaps.
    weights = np.ones(window) / window
    return np.convolve(values, weights, mode='valid')

# 7-day moving average of the daily figures
sales_ma7 = moving_average(daily_sales, window=7)
sales_ma7

array([119.28571429, 120.71428571, 122.14285714, 123.57142857,
       125.71428571, 127.85714286, 130.71428571, 129.28571429,
       128.57142857, 127.85714286, 127.14285714, 130.        ,
       131.42857143, 132.85714286, 133.57142857, 129.28571429,
       127.85714286, 127.85714286, 128.57142857, 130.71428571,
       133.57142857, 137.85714286, 140.71428571, 137.85714286])

In [34]:
# Flag the days whose sales exceed the 7-day trend.
# sales_ma7[i] averages daily_sales[i:i + 7], so dropping the first 6 days
# pairs each average with the last day of its own window.
above_trend = np.greater(daily_sales[6:], sales_ma7)
above_trend

array([False,  True,  True,  True,  True,  True, False, False, False,
        True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False, False, False])

In [35]:
# Running total of sales up to and including each day
cumulative_sales = daily_sales.cumsum()
cumulative_sales

array([ 120,  255,  400,  525,  640,  730,  835,  965, 1110, 1265, 1405,
       1535, 1645, 1740, 1865, 2005, 2155, 2315, 2455, 2575, 2675, 2770,
       2900, 3050, 3215, 3370, 3510, 3640, 3755, 3865])

In [36]:
# Arrange the first 28 days as 4 weeks (rows) of 7 days (columns);
# the remaining 2 days do not fill a week and are left out
weekly_sales = np.reshape(daily_sales[:28], (4, 7))
weekly_sales

array([[120, 135, 145, 125, 115,  90, 105],
       [130, 145, 155, 140, 130, 110,  95],
       [125, 140, 150, 160, 140, 120, 100],
       [ 95, 130, 150, 165, 155, 140, 130]])

In [37]:
# Total per week: add up the 7 daily values in each row
weekly_totals = weekly_sales.sum(axis=1)
weekly_totals

array([835, 905, 935, 965])

# Customer Segmentation with NumPy

In [38]:
# One row per customer; columns are:
#   0: days since last purchase, 1: number of purchases, 2: average purchase ($)
customers = np.array((
    (5, 10, 200),    # Customer 1
    (20, 3, 120),    # Customer 2
    (14, 15, 80),    # Customer 3
    (3, 18, 350),    # Customer 4
    (9, 5, 190),     # Customer 5
    (40, 2, 50),     # Customer 6
    (7, 10, 170),    # Customer 7
))

In [39]:
# Z-score each column: subtract the column mean, then divide by the
# column standard deviation (np.std default, i.e. ddof=0)
customers_mean = customers.mean(axis=0)
customers_std = customers.std(axis=0)
customers_z = np.divide(customers - customers_mean, customers_std)
customers_z

array([[-0.75755306,  0.17837652,  0.37481703],
       [ 0.50503537, -1.0702591 , -0.49975604],
       [ 0.        ,  1.0702591 , -0.93704257],
       [-0.92589819,  1.60538865,  2.01464153],
       [-0.42086281, -0.71350607,  0.2654954 ],
       [ 2.18848662, -1.24863562, -1.26500747],
       [-0.58920794,  0.17837652,  0.04685213]])

In [40]:
# Overall customer score: the average of the three z-scores, with the
# recency column (0) subtracted instead of added because lower recency is better
customer_score = (customers_z[:, 1] - customers_z[:, 0] + customers_z[:, 2]) / 3
customer_score


array([ 0.43691554, -0.6916835 ,  0.04440551,  1.51530946, -0.00904929,
       -1.56737657,  0.27147886])

In [41]:
# Find top 3 customers.
# argsort orders scores ascending, so the last three indices are the highest
# scorers; reversing them puts the best customer first.
top_indices = np.argsort(customer_score)[-3:][::-1]
# Show the original (unstandardized) rows for those customers. Only the last
# expression of a cell is displayed, so nothing else is evaluated bare here.
customers[top_indices]

array([[  3,  18, 350],
       [  5,  10, 200],
       [  7,  10, 170]])