# Predicting T-shirt size using the ANSUR II dataset
We will here try to predict a persons t-shirt size given the weight and height of the person. We will use the ANSUR II dataset which contains a lot of information about the physical attributes of a large number of people.
 
We will first try to map the persons in the dataset to a t-shirt size. It is hard to find a concise size chart for t-shirt so we will create our own, initial chart, based on these assumptions:
 
We will only look at two measurements, Shoulder Width and Chest Circumference.
 
Our first problem is that Shoulder Width is not one of the measurements taken in the dataset. But we have Biacromial Breadth which is the distance between the two acromion processes. We will assume that this is the same as Shoulder Width.
 
We will then have these initial rules:
 
| Size | Percentile |
|------|------------|
| XS   | 0-5        |
| S    | 5-25       |
| M    | 25-50      |
| L    | 50-75      |
| XL   | 75-90      |
| XXL  | 90-97      |
| XXXL | 97-100     |

### Inspect the data

In [1]:
import pandas as pd

In [2]:
female = pd.read_csv('./data/female.csv')
male = pd.read_csv('./data/male.csv')

In [3]:
print(f'For women we have (rows, columns) {female.shape}')
print(f'For men we have (rows, columns) {male.shape}')

For women we have (rows, columns) (1986, 108)
For men we have (rows, columns) (4082, 108)


### Checking the percentiles - Let us decide percentile of data

In [4]:
def compute_percentile_ranges(column):
    ranges = [(0,5),(5,25),(25,50),(50,75),(75,90),(90,97),(97,100)]

    percentiles = { (low,high): (column.quantile(low/100), column.quantile(high/100)) for low, high in ranges}
    
    print(percentiles)

    counts = {}

    for r,(low, high) in percentiles.items():
        counts[r] = ((column >= low)& (column < high)).sum()
    return counts
print(compute_percentile_ranges(female['chestcircumference']))
print(compute_percentile_ranges(female['biacromialbreadth']))
print(compute_percentile_ranges(male['chestcircumference']))
print(compute_percentile_ranges(male['biacromialbreadth']))

{(0, 5): (695.0, 824.25), (5, 25): (824.25, 889.0), (25, 50): (889.0, 940.0), (50, 75): (940.0, 999.0), (75, 90): (999.0, 1057.0), (90, 97): (1057.0, 1117.45), (97, 100): (1117.45, 1266.0)}
{(0, 5): 100, (5, 25): 396, (25, 50): 492, (50, 75): 499, (75, 90): 299, (90, 97): 140, (97, 100): 59}
{(0, 5): (283.0, 335.0), (5, 25): (335.0, 353.0), (25, 50): (353.0, 365.0), (50, 75): (365.0, 378.0), (75, 90): (378.0, 389.0), (90, 97): (389.0, 400.0), (97, 100): (400.0, 422.0)}
{(0, 5): 93, (5, 25): 377, (25, 50): 477, (50, 75): 541, (75, 90): 297, (90, 97): 139, (97, 100): 61}
{(0, 5): (774.0, 922.0), (5, 25): (922.0, 996.0), (25, 50): (996.0, 1056.0), (50, 75): (1056.0, 1117.0), (75, 90): (1117.0, 1172.0), (90, 97): (1172.0, 1233.0), (97, 100): (1233.0, 1469.0)}
{(0, 5): 199, (5, 25): 810, (25, 50): 1025, (50, 75): 1012, (75, 90): 616, (90, 97): 295, (97, 100): 124}
{(0, 5): (337.0, 384.0), (5, 25): (384.0, 403.0), (25, 50): (403.0, 415.0), (50, 75): (415.0, 428.0), (75, 90): (428.0, 441.0), 

### Generate the tshirt size chart

In [5]:
def compute_size_percentile_measurements(data, chest_column, shoulder_column):
    sizes = ['XS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']
    ranges = [0,5,25,50,75,90,97]

    # Compute the values for each percentile for chest and shoulder
    chest_percentiles = {p:data[chest_column].quantile(p/100) for p in ranges}
    shoulder_percentiles = {p:data[shoulder_column].quantile(p/100) for p in ranges}
    #print(chest_percentiles)
    #print(shoulder_percentiles)

    # Map the tshrit sizes to the corresponding chest and shoulder measurements
    size_mappings = {}
    for i, size in enumerate(sizes):
        size_mappings[size] = {
            'Chest': int(chest_percentiles[ranges[i]]),
            'Shoulder': int(shoulder_percentiles[ranges[i]])
        }
    return size_mappings

print(compute_size_percentile_measurements(female,'chestcircumference','biacromialbreadth'))
print(compute_size_percentile_measurements(male,'chestcircumference','biacromialbreadth'))

{'XS': {'Chest': 695, 'Shoulder': 283}, 'S': {'Chest': 824, 'Shoulder': 335}, 'M': {'Chest': 889, 'Shoulder': 353}, 'L': {'Chest': 940, 'Shoulder': 365}, 'XL': {'Chest': 999, 'Shoulder': 378}, 'XXL': {'Chest': 1057, 'Shoulder': 389}, 'XXXL': {'Chest': 1117, 'Shoulder': 400}}
{'XS': {'Chest': 774, 'Shoulder': 337}, 'S': {'Chest': 922, 'Shoulder': 384}, 'M': {'Chest': 996, 'Shoulder': 403}, 'L': {'Chest': 1056, 'Shoulder': 415}, 'XL': {'Chest': 1117, 'Shoulder': 428}, 'XXL': {'Chest': 1172, 'Shoulder': 441}, 'XXXL': {'Chest': 1233, 'Shoulder': 452}}


In [6]:
female_sizes = {
    'XS': {'Chest': 695, 'Shoulder': 283}, 
    'S': {'Chest': 824, 'Shoulder': 335}, 
    'M': {'Chest': 889, 'Shoulder': 353}, 
    'L': {'Chest': 940, 'Shoulder': 365}, 
    'XL': {'Chest': 999, 'Shoulder': 378}, 
    'XXL': {'Chest': 1057, 'Shoulder': 389}, 
    'XXXL': {'Chest': 1117, 'Shoulder': 400}
    }

male_sizes = {
    'XS': {'Chest': 774, 'Shoulder': 337}, 
    'S': {'Chest': 922, 'Shoulder': 384}, 
    'M': {'Chest': 996, 'Shoulder': 403}, 
    'L': {'Chest': 1056, 'Shoulder': 415}, 
    'XL': {'Chest': 1117, 'Shoulder': 428}, 
    'XXL': {'Chest': 1172, 'Shoulder': 441}, 
    'XXXL': {'Chest': 1233, 'Shoulder': 452}
    }

In [7]:
def get_size(data, size_chart):
    matches = {size: 0 for size in size_chart.keys()}
    ties = 0

    for _, row in data.iterrows():
        possible_sizes = []

        for size, measurements in size_chart.items():
            if (row['biacromialbreadth'] <= measurements['Shoulder'] and
                row['chestcircumference'] <= measurements['Chest']):
                possible_sizes.append(size)
        
        if len(possible_sizes) == 1:
            matches[possible_sizes[0]] += 1
        elif len(possible_sizes) > 1:
            ties += 1
    
    return matches, ties

In [8]:
female_matches, female_ties = get_size(female, female_sizes)
male_matches, male_ties = get_size(male, male_sizes)

print('Female matches: ', female_matches)
print('Female ties: ', female_ties)
print('Male matches: ', male_matches)
print('Male ties: ', male_ties)

Female matches:  {'XS': 0, 'S': 0, 'M': 0, 'L': 0, 'XL': 0, 'XXL': 0, 'XXXL': 236}
Female ties:  1642
Male matches:  {'XS': 0, 'S': 0, 'M': 0, 'L': 0, 'XL': 0, 'XXL': 0, 'XXXL': 434}
Male ties:  3437


### Overlapping Function

In [9]:
def create_overlapping_size_chart(original_chart):
    overlapping_chart = {}

    sizes = list(original_chart.keys())

    for i, size in enumerate(sizes):
        overlapping_chart[size] = {}
        if i == 0:
            overlapping_chart[size]['Chest'] = [original_chart[size]['Chest'], original_chart[sizes[i+1]]['Chest']+5]
            overlapping_chart[size]['Shoulder'] = [original_chart[size]['Shoulder'], original_chart[sizes[i+1]]['Shoulder']+5]

        elif i ==len(sizes)-1:
            overlapping_chart[size]['Chest'] = [original_chart[size]['Chest']-5, original_chart[size]['Chest']+1000]
            overlapping_chart[size]['Shoulder'] = [original_chart[size]['Shoulder']-5, original_chart[size]['Shoulder']+1000]

        else:
            overlapping_chart[size]['Chest'] = [original_chart[size]['Chest']-5, original_chart[sizes[i+1]]['Chest']+5]
            overlapping_chart[size]['Shoulder'] = [original_chart[size]['Shoulder']-5, original_chart[sizes[i+1]]['Shoulder']+5]
        
    return overlapping_chart

In [10]:
new_female_sizes = create_overlapping_size_chart(female_sizes)
new_male_sizes = create_overlapping_size_chart(male_sizes)

for k, v in new_female_sizes.items():
    print(f"'{k}' : {v}, ")

print()

for k, v in new_male_sizes.items():
    print(f"'{k}' : {v}, ")

'XS' : {'Chest': [695, 829], 'Shoulder': [283, 340]}, 
'S' : {'Chest': [819, 894], 'Shoulder': [330, 358]}, 
'M' : {'Chest': [884, 945], 'Shoulder': [348, 370]}, 
'L' : {'Chest': [935, 1004], 'Shoulder': [360, 383]}, 
'XL' : {'Chest': [994, 1062], 'Shoulder': [373, 394]}, 
'XXL' : {'Chest': [1052, 1122], 'Shoulder': [384, 405]}, 
'XXXL' : {'Chest': [1112, 2117], 'Shoulder': [395, 1400]}, 

'XS' : {'Chest': [774, 927], 'Shoulder': [337, 389]}, 
'S' : {'Chest': [917, 1001], 'Shoulder': [379, 408]}, 
'M' : {'Chest': [991, 1061], 'Shoulder': [398, 420]}, 
'L' : {'Chest': [1051, 1122], 'Shoulder': [410, 433]}, 
'XL' : {'Chest': [1112, 1177], 'Shoulder': [423, 446]}, 
'XXL' : {'Chest': [1167, 1238], 'Shoulder': [436, 457]}, 
'XXXL' : {'Chest': [1228, 2233], 'Shoulder': [447, 1452]}, 


### Analyse the data and using new size charts to find out number of matches using two measurements 

In [11]:
def get_size_with_ranges(data, size_chart):
    matches = {size: 0 for size in size_chart.keys()}
    ties = 0

    for _, row in data.iterrows():
        possible_sizes = []

        for size, measurements in size_chart.items():
            chest_min, chest_max = measurements['Chest']
            shoulder_min, shoulder_max = measurements['Shoulder']

            if (chest_min <= row['chestcircumference'] <= chest_max and
                shoulder_min <= row['biacromialbreadth'] <= shoulder_max):
                possible_sizes.append(size)

        if len(possible_sizes) == 1:
            matches[possible_sizes[0]] += 1
        elif len(possible_sizes) > 1:
            ties += 1

    return matches, ties

In [12]:
female_matches, female_ties = get_size_with_ranges(female, new_female_sizes)
male_matches, male_ties = get_size_with_ranges(male, new_male_sizes)

print("Female matches:", female_matches)
print("Female ties:", female_ties)

print("Male matches:", male_matches)
print("Male ties:", male_ties)

Female matches: {'XS': 23, 'S': 180, 'M': 230, 'L': 248, 'XL': 108, 'XXL': 30, 'XXXL': 11}
Female ties: 67
Male matches: {'XS': 63, 'S': 419, 'M': 542, 'L': 532, 'XL': 287, 'XXL': 88, 'XXXL': 47}
Male ties: 166


### Handle ties by choosing the larger adjacent size

In [None]:
def get_size_with_ranges_and_tie_break(data, size_chart):
    size_order = list(size_chart.keys())
    matches = {size: 0 for size in size_order}
    ties = 0

    for _, row in data.iterrows():
        possible_sizes = []

        for size in size_order:
            chest_min, chest_max = size_chart[size]['Chest']
            shoulder_min, shoulder_max = size_chart[size]['Shoulder']

            if (chest_min <= row['chestcircumference'] <= chest_max and
                shoulder_min <= row['biacromialbreadth'] <= shoulder_max):
                possible_sizes.append(size)

        if len(possible_sizes) == 1:
            matches[possible_sizes[0]] += 1

        elif len(possible_sizes) > 1:
            chosen_size = possible_sizes[-1]
            matches[chosen_size] += 1
            ties += 1

    return matches, ties


In [14]:
female_matches, female_ties = get_size_with_ranges_and_tie_break(
    female, new_female_sizes
)

male_matches, male_ties = get_size_with_ranges_and_tie_break(
    male, new_male_sizes
)

print("Female matches:", female_matches)
print("Female ties resolved:", female_ties)

print("Male matches:", male_matches)
print("Male ties resolved:", male_ties)


Female matches: {'XS': 23, 'S': 185, 'M': 247, 'L': 276, 'XL': 118, 'XXL': 35, 'XXXL': 13}
Female ties resolved: 67
Male matches: {'XS': 63, 'S': 428, 'M': 578, 'L': 593, 'XL': 331, 'XXL': 101, 'XXXL': 50}
Male ties resolved: 166
