# A meta-analysis of overfitting in machine learning
 Computation of the expected value for overfitting on private test datasets.

In [2]:
import pandas as pd
import math
# Toy dataset, one row per sample as (f1, f2, class). The feature pairs
# (0, 1) and (1, 1) each appear with conflicting class labels, which gives
# 2 error (ambiguous) samples.
data = pd.DataFrame(
    [
        (0, 0, 0),
        (0, 1, 1),
        (1, 0, 1),
        (1, 1, 1),
        (0, 0, 0),
        (0, 1, 0),
        (0, 1, 1),
        (1, 0, 1),
        (1, 1, 1),
        (1, 1, 0),
    ],
    columns=['f1', 'f2', 'class'],
)
data

Unnamed: 0,f1,f2,class
0,0,0,0
1,0,1,1
2,1,0,1
3,1,1,1
4,0,0,0
5,0,1,0
6,0,1,1
7,1,0,1
8,1,1,1
9,1,1,0


In [3]:
# depending on the number of error samples, calculate the number of combinations where i error samples are included in the split
def calculate_weights(num_samples, split_size, num_error_sample):
    """Count the ``split_size``-sized subsets that contain exactly ``i`` error samples.

    Of the ``num_samples`` samples, ``num_error_sample`` are error samples.
    For every feasible ``i`` the number of subsets is
    ``C(num_error_sample, i) * C(num_samples - num_error_sample, split_size - i)``.

    Parameters
    ----------
    num_samples : int
        Total number of samples in the dataset.
    split_size : int
        Number of samples drawn into the split.
    num_error_sample : int
        Number of error samples in the dataset.

    Returns
    -------
    tuple[list[int], int]
        ``(combinations, total)``: ``combinations[i]`` is the number of subsets
        with exactly ``i`` error samples for
        ``i = 0 .. min(split_size, num_error_sample)``, and ``total`` is their
        sum. For ``0 <= split_size <= num_samples`` and
        ``0 <= num_error_sample <= num_samples`` the total equals
        ``C(num_samples, split_size)`` (Vandermonde's identity).

    Raises
    ------
    ValueError
        If ``split_size`` or ``num_error_sample`` exceeds ``num_samples``.
    """
    # Fail loudly: the previous version only printed a message and then crashed
    # with UnboundLocalError on the return statement.
    if split_size > num_samples:
        raise ValueError("split_size cannot be larger than num_samples")
    if num_error_sample > num_samples:
        raise ValueError("num_error_sample cannot be larger than num_samples")

    num_clean = num_samples - num_error_sample
    # A split can hold at most as many errors as exist, and at most its own size.
    max_errors = min(split_size, num_error_sample)
    combinations = [
        math.comb(num_clean, split_size - i) * math.comb(num_error_sample, i)
        for i in range(max_errors + 1)
    ]
    return combinations, sum(combinations)

In [4]:
calculate_weights(10, 4, 2)

([70, 112, 28], 210)

### Explanation 

Given:
- num_samples = N
- split_size = k
- num_error_sample = E

It returns:
- combinations: counts of k-size subsets with exactly i errors, for i = 0..min(E, k)
- total: sum of those counts (equals C(N, k) for valid inputs)

Each term:
- combinations[i] = C(E, i) × C(N − E, k − i)

Example:
- calculate_weights(10, 4, 2) → ([70, 112, 28], 210)
  - 70 combinations with 0 errors, 112 combinations with 1 error, 28 combinations with 2 errors; total 210 = C(10, 4)


In [5]:
# Expected test error: weight each per-split error rate (0, 1 or 2 errors in a
# split of 4, i.e. 0, .25, .5) by the number of subsets producing it
# (70, 112, 28), then divide by the total number of subsets (210).
test_error = sum(
    rate * count for rate, count in zip((0, .25, .5), (70, 112, 28))
) / 210
test_error

0.2

In [6]:
# Index of each private split, and the error calculated for that split.
split_no = list(range(1, 6))
error = [0.2] * len(split_no)

In [7]:
# another dataset, in which every ambiguous (error) sample has the feature combination (f1, f2) = (0, 1)
import os

# Move up to the parent directory so that the `src` package can be imported
# (presumably the notebook lives one level below the project root — verify).
# The guard keeps the cell idempotent: an unconditional os.chdir("..") climbs
# one more directory every time the cell is re-run and breaks the import.
if not os.path.isdir("src"):
    os.chdir("..")
from  src.discrete import calculate_discrete_ambiguity, calculate_discrete_error
# One row per sample as (f1, f2, class); only the (0, 1) feature pair occurs
# with both class labels.
data = pd.DataFrame(
    [
        (0, 0, 0),
        (0, 1, 1),
        (1, 0, 1),
        (1, 1, 1),
        (0, 0, 0),
        (0, 1, 0),
        (0, 1, 1),
        (1, 0, 1),
        (1, 1, 1),
        (0, 1, 0),
        (0, 1, 1),
    ],
    columns=['f1', 'f2', 'class'],
)
data

Unnamed: 0,f1,f2,class
0,0,0,0
1,0,1,1
2,1,0,1
3,1,1,1
4,0,0,0
5,0,1,0
6,0,1,1
7,1,0,1
8,1,1,1
9,0,1,0


In [8]:
# Run the project's discrete ambiguity calculation on the dataset defined above.
# 'class' is passed as the second argument (presumably the name of the label
# column — verify against src/discrete.py).
# NOTE(review): the saved outputs (a 10-row table and the value 0.2) look stale
# relative to the 11-row frame built in the previous cell — re-run to confirm.
ambiguity = calculate_discrete_ambiguity(data, 'class')
ambiguity

0.2