In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
# Load cleaned data
DATA_PATH = Path("../data/processed/draws_clean.csv")
df = pd.read_csv(DATA_PATH, parse_dates=["draw_date"])
df.head()

Unnamed: 0,draw_no,first_prize,second_prize_1,second_prize_2,second_prize_3,city,draw_date
0,1,367030,211403,527674,676693,Islamabad,2000-01-15
1,2,752300,38121,338558,367788,Peshawar,2000-04-17
2,3,941694,248225,267064,609209,Hyderabad,2000-07-15
3,4,548378,301073,472750,578505,Lahore,2000-10-16
4,5,677216,195498,404323,677256,Muzaffarabad,2001-01-15


In [3]:
PRIZE_COLUMNS = [
    "first_prize",
    "second_prize_1",
    "second_prize_2",
    "second_prize_3"
]

all_numbers = df[PRIZE_COLUMNS].values.flatten()

len(all_numbers), all_numbers[:10]

(400,
 array([367030, 211403, 527674, 676693, 752300,  38121, 338558, 367788,
        941694, 248225]))

In [4]:
print("Min number:", all_numbers.min())
print("Max number:", all_numbers.max())
print("Unique numbers:", len(np.unique(all_numbers)))

Min number: 831
Max number: 991249
Unique numbers: 400


In [5]:
from scipy.stats import chisquare

# Count frequency of each winning number
values, counts = np.unique(all_numbers, return_counts=True)

# Expected frequency under uniform randomness
expected = np.full_like(counts, counts.mean(), dtype=float)

chi_stat, p_value = chisquare(counts, expected)

chi_stat, p_value

(np.float64(0.0), np.float64(1.0))

In [6]:
last_digits = all_numbers % 10

digit_counts = np.bincount(last_digits, minlength=10)

expected_digits = np.full(10, digit_counts.mean())

chi_stat_digits, p_value_digits = chisquare(digit_counts, expected_digits)

chi_stat_digits, p_value_digits


(np.float64(8.8), np.float64(0.4559371952206619))

In [7]:
# Use first prize only for temporal testing
first_prize_series = df["first_prize"].values

# Lag-1 autocorrelation
autocorr = np.corrcoef(first_prize_series[:-1], first_prize_series[1:])[0, 1]

autocorr

np.float64(-0.03579925161519945)

In [8]:
from statsmodels.sandbox.stats.runs import runstest_1samp

median = np.median(all_numbers)
runs_stat, runs_p = runstest_1samp(all_numbers, cutoff=median)

runs_stat, runs_p


(np.float64(-2.302887634533312), np.float64(0.021285166071197768))