In [138]:
import pandas as pd
import numpy as np
import re

In [139]:
# Load the training set into a DataFrame; later cells read its 'features' and 'labels' columns
train = pd.read_csv('train.csv')

In [159]:
'''
This cell constructs a list of dictionaries. Each dictionary represents the features column of one of
the 15539 examples in the dataset: it maps a feature id to its weight, and both are kept as the
strings captured from the raw text.
'''
feat_dicts = []
for line in train['features']:
    # Capture each id together with its weight in a single match. Using two separate
    # regexes (one for the ids, one for the weights) lets the two lists drift out of
    # step whenever an id is followed by something that is not a digits.digits number,
    # which either pairs ids with the wrong weights or raises an IndexError.
    pairs = re.findall(r'(\d+):(\d+\.\d+)', line)
    # dict() keeps the last weight if an id is repeated, like the original assignment loop
    feat_dicts.append(dict(pairs))

In [160]:
# One dictionary was appended per row of `train`, so this should equal len(train)
len(feat_dicts)

15539

In [161]:
'''
As an example, here is the dictionary for the example at index 0. 
Every key represents an integer tokenization of a word from the document that the example represents
Every associated value is the corresponding TFIDF weight (https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
'''
# Both keys and values are the raw regex captures, i.e. strings rather than int/float
feat_dicts[0]

{'0': '0.084556',
 '1': '0.138594',
 '2': '0.094304',
 '3': '0.195764',
 '4': '0.612552',
 '5': '0.106491',
 '6': '0.137765',
 '7': '0.145839',
 '8': '0.304610',
 '10': '0.465152',
 '11': '0.182930',
 '12': '0.193397',
 '19': '2.174160',
 '20': '0.728684',
 '24': '0.401482',
 '26': '0.432680',
 '27': '2.667136',
 '29': '0.493819',
 '32': '0.573815',
 '33': '0.599316',
 '34': '0.601095',
 '38': '1.235996',
 '42': '0.664670',
 '44': '0.686222',
 '60': '0.798616',
 '64': '0.805275',
 '74': '0.866743',
 '75': '0.869891',
 '90': '0.956603',
 '91': '0.957650',
 '108': '3.150229',
 '115': '2.177111',
 '141': '1.209520',
 '163': '2.573456',
 '174': '2.645868',
 '190': '1.374023',
 '291': '1.677025',
 '300': '3.382515',
 '323': '1.752954',
 '425': '5.926553',
 '450': '2.026049',
 '484': '2.081569',
 '490': '2.087109',
 '583': '2.259480',
 '652': '2.376026',
 '684': '4.861286',
 '766': '2.563801',
 '853': '2.686631',
 '928': '5.575382',
 '1173': '6.221640',
 '1184': '3.122486',
 '1269': '9.68484

In [164]:
'''
Build a dictionary keyed by the position of each example in train.
The associated value is the set of labels attached to that example (a set is used for ease of
membership testing later on); the labels are kept as strings of digits.
'''
label_dict = {
    row: set(re.findall(r'(\d+)', raw_labels))
    for row, raw_labels in enumerate(train['labels'])
}

In [165]:
# NOTE(review): this displays every entry of label_dict (one per row of train);
# consider showing only a small slice to keep the notebook output readable
label_dict

{0: {'1149', '1249', '1265', '1482', '446', '521'},
 1: {'78', '80', '85', '86'},
 2: {'1158', '457', '577', '579', '640', '939'},
 3: {'1704', '172', '654', '693'},
 4: {'1017', '1052', '1731', '3183', '403', '508'},
 5: {'1328', '174', '379', '380', '381', '449', '493', '677'},
 6: {'592', '595', '617', '694', '923'},
 7: {'1210', '446', '499', '500', '671', '804'},
 8: {'180', '611', '64'},
 9: {'1098', '1310', '1818', '3263', '505'},
 10: {'1396', '1540', '1699', '529', '531'},
 11: {'208', '210', '211', '418'},
 12: {'1022', '2149', '25', '374', '693', '878'},
 13: {'1021', '141', '499', '500', '548'},
 14: {'2256', '426', '955'},
 15: {'106', '113', '114', '1824', '2034'},
 16: {'1534', '1556', '2244', '2785'},
 17: {'1135', '297', '3', '430'},
 18: {'1543', '242', '295', '36', '64', '792'},
 19: {'1518', '160', '797', '798', '996'},
 20: {'1085', '592', '595', '609', '678'},
 21: {'1190', '2418', '299', '731', '733'},
 22: {'1983', '2090', '2537', '324', '906'},
 23: {'1983', '2

In [173]:
# Label cardinality: the mean number of labels attached to an example
N = len(label_dict)
cardinality = sum(len(label_set) for label_set in label_dict.values()) / N
print('Label Cardinality: ', cardinality)

Label Cardinality:  5.314820773537551


In [180]:
# Calculate label density: the label cardinality expressed as a fraction of the label space
# L is the total number of labels, which is given to us
L = 3993
# The density is simply the cardinality divided by L
density = cardinality/L
# print() separates its two arguments with a space, hence the double space in the output
print('Label Density: ', density)

Label Density:  0.0013310345037659782


We observe that the density is very low. This is not surprising given the large number of labels that we have in this data.