In [26]:
import pandas as pd
import numpy as np
import re

In [27]:
# Load the training data from a CSV file located one directory above the working directory.
train = pd.read_csv('../train.csv')

In [28]:
def data_format(train):
    '''
    Parse the raw ``features`` and ``labels`` text columns of ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide a ``features`` column of whitespace-separated
        ``index:weight`` tokens (e.g. ``'0:0.084556 1:0.138594'``) and a
        ``labels`` column whose cells hold the integer label ids as text.
        Rows are read by position, so the frame's index does not have to be
        0..n-1. A missing cell (None / NaN) is treated as an empty string.

    Returns
    -------
    feat_dicts : list of dict
        One dictionary per row, in row order, mapping each integer feature
        index to its float weight.
    label_dict : dict
        Maps the row position (0 .. len(train) - 1) to the list of integer
        labels found in that row. The values are lists, not sets; wrap them
        in ``set(...)`` if fast membership testing is needed.
    '''
    # Index and weight are captured by ONE pattern so they can never get out
    # of step. Two separate findall calls pair up incorrectly (and eventually
    # raise IndexError) as soon as a weight is written without a decimal
    # point, e.g. '3:2'.
    pair_pattern = re.compile(r'(\d+):([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)')

    def as_text(cell):
        # Empty cells come back from pandas as None/NaN, which re cannot scan.
        return '' if pd.isna(cell) else cell

    # Iterate over the column values themselves rather than train[col][i]:
    # the latter is a label lookup and breaks on a filtered/shuffled frame.
    feat_dicts = []
    for line in train['features'].tolist():
        pairs = pair_pattern.findall(as_text(line))
        feat_dicts.append({int(key): float(weight) for key, weight in pairs})

    label_dict = {}
    for position, labels in enumerate(train['labels'].tolist()):
        found = re.findall(r'(\d+)', as_text(labels))
        label_dict[position] = list(np.array(found).astype('int'))

    return feat_dicts, label_dict

In [29]:
# Parse the text columns of train into per-example feature dictionaries and label lists.
feat_dicts, label_dict = data_format(train)

In [4]:
train.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,1,78808586,0:0.050734 1:0.762265 2:0.754431 3:0.065255 4:...
2,2,4575775796409391158,0:0.101468 1:0.138594 2:0.377215 3:0.130509 4:...
3,3,1726546931704,0:0.186024 1:0.346484 2:0.141456 3:0.195764 4:...
4,4,4035081017105217313183,0:0.135290 1:0.277187 2:0.141456 3:0.065255 4:...


In [5]:
# Demonstration of how regular expressions pull the pieces we need out of
# a single raw feature string (the first example in train).
ex_line = train['features'][0]
ex_keys = re.compile(r'(\d+):').findall(ex_line)
ex_vals = re.compile(r'\d+:(\d+\.\d+)').findall(ex_line)

# Print the raw string, then the extracted keys and values, one item per line.
for item in (
    'Original Example',
    ex_line,
    'Extracted Keys: These represent the words themselves. Even tho these are numbers, we will be treating them as if they are text',
    ex_keys,
    '',
    'Extracted Values(TFIDF weights)',
    ex_vals,
):
    print(item)

Original Example
0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:0.612552 5:0.106491 6:0.137765 7:0.145839 8:0.304610 10:0.465152 11:0.182930 12:0.193397 19:2.174160 20:0.728684 24:0.401482 26:0.432680 27:2.667136 29:0.493819 32:0.573815 33:0.599316 34:0.601095 38:1.235996 42:0.664670 44:0.686222 60:0.798616 64:0.805275 74:0.866743 75:0.869891 90:0.956603 91:0.957650 108:3.150229 115:2.177111 141:1.209520 163:2.573456 174:2.645868 190:1.374023 291:1.677025 300:3.382515 323:1.752954 425:5.926553 450:2.026049 484:2.081569 490:2.087109 583:2.259480 652:2.376026 684:4.861286 766:2.563801 853:2.686631 928:5.575382 1173:6.221640 1184:3.122486 1269:9.684842 4277:5.232373

Extracted Keys: These represent the words themselves. Even tho these are numbers, we will be treating them as if they are text
['0', '1', '2', '3', '4', '5', '6', '7', '8', '10', '11', '12', '19', '20', '24', '26', '27', '29', '32', '33', '34', '38', '42', '44', '60', '64', '74', '75', '90', '91', '108', '115', '141', '163', '

In [9]:
# Build a list of dictionaries, one per row of train (in row order). Each
# dictionary maps an integer feature index to its float weight, as parsed
# from that row's 'features' string.
#
# Index and weight are captured by ONE pattern so the two can never get out
# of step: separate findall calls for keys and values pair up incorrectly
# (and eventually raise IndexError) when a weight has no decimal point.
pair_pattern = re.compile(r'(\d+):([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)')

# Iterate over the column values rather than train['features'][i], which is
# a label lookup and fails when the index is not 0..n-1. A missing cell
# (None / NaN) yields an empty dictionary.
feat_dicts = [
    {
        int(key): float(weight)
        for key, weight in pair_pattern.findall('' if pd.isna(line) else line)
    }
    for line in train['features'].tolist()
]

In [10]:
len(feat_dicts)

15539

What we need next is a function that turns each of these feature dictionaries into a sparse vector representation. All we need to do is, for each dictionary, take every key that is not already present and set its value to 0.

We can probably do that by modifying the code above.


In [11]:
'''
As an example, here is the dictionary for the example at index 0. 
Every key represents an integer tokenization of a word from the document that the example represents
Every associated value is the corresponding TFIDF weight (https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
'''
feat_dicts[0]

{0: 0.084556,
 1: 0.138594,
 2: 0.094304,
 3: 0.195764,
 4: 0.612552,
 5: 0.106491,
 6: 0.137765,
 7: 0.145839,
 8: 0.30461,
 10: 0.465152,
 11: 0.18293,
 12: 0.193397,
 19: 2.17416,
 20: 0.728684,
 24: 0.401482,
 26: 0.43268,
 27: 2.667136,
 29: 0.493819,
 32: 0.573815,
 33: 0.599316,
 34: 0.601095,
 38: 1.235996,
 42: 0.66467,
 44: 0.686222,
 60: 0.798616,
 64: 0.805275,
 74: 0.866743,
 75: 0.869891,
 90: 0.956603,
 91: 0.95765,
 108: 3.150229,
 115: 2.177111,
 141: 1.20952,
 163: 2.573456,
 174: 2.645868,
 190: 1.374023,
 291: 1.677025,
 300: 3.382515,
 323: 1.752954,
 425: 5.926553,
 450: 2.026049,
 484: 2.081569,
 490: 2.087109,
 583: 2.25948,
 652: 2.376026,
 684: 4.861286,
 766: 2.563801,
 853: 2.686631,
 928: 5.575382,
 1173: 6.22164,
 1184: 3.122486,
 1269: 9.684842,
 4277: 5.232373}

In [24]:
# Build a dictionary keyed by the position of each example in train.
# The associated value is the LIST of integer labels found in that example's
# 'labels' cell (wrap it in set(...) if fast membership testing is needed).
#
# The column values are iterated directly instead of train['labels'][i],
# which is a label lookup and fails when the index is not 0..n-1. A missing
# cell (None / NaN) yields an empty list.
label_dict = {}
for position, labels in enumerate(train['labels'].tolist()):
    found = re.findall(r'(\d+)', '' if pd.isna(labels) else labels)
    label_dict[position] = list(np.array(found).astype('int'))

In [25]:
label_dict

{0: [446, 521, 1149, 1249, 1265, 1482],
 1: [78, 80, 85, 86],
 2: [457, 577, 579, 640, 939, 1158],
 3: [172, 654, 693, 1704],
 4: [403, 508, 1017, 1052, 1731, 3183],
 5: [174, 379, 380, 381, 449, 493, 677, 1328],
 6: [592, 595, 617, 694, 923],
 7: [446, 499, 500, 671, 804, 1210],
 8: [64, 180, 611],
 9: [505, 1098, 1310, 1818, 3263],
 10: [529, 531, 1396, 1540, 1699],
 11: [208, 210, 211, 418],
 12: [25, 374, 693, 878, 1022, 2149],
 13: [141, 499, 500, 548, 1021],
 14: [426, 955, 2256],
 15: [106, 113, 114, 1824, 2034],
 16: [1534, 1556, 2244, 2785],
 17: [3, 297, 430, 1135],
 18: [36, 64, 242, 295, 792, 1543],
 19: [160, 797, 798, 996, 1518],
 20: [592, 595, 609, 678, 1085],
 21: [299, 731, 733, 1190, 2418],
 22: [324, 906, 1983, 2090, 2537],
 23: [41, 731, 1983, 2036],
 24: [393, 954, 1591, 3532],
 25: [103, 575, 591, 640],
 26: [413, 962, 964, 1022, 1458],
 27: [145, 250, 285, 427, 1461, 1462],
 28: [26, 59, 191, 592, 1019, 1085, 1109],
 29: [338, 624, 659, 1970],
 30: [89, 90, 418,

In [173]:
# Label cardinality: the mean number of labels attached to an example.
N = len(label_dict)
label_counts = (len(label_dict[idx]) for idx in range(N))
cardinality = sum(label_counts) / N
print('Label Cardinality: ', cardinality)

Label Cardinality:  5.314820773537551


In [180]:
# Label density: the label cardinality normalised by the size of the label space.
L = 3993  # total number of labels, which is given to us
density = cardinality / L
print('Label Density: ', density)

Label Density:  0.0013310345037659782


We observe that the density is very low. This is not surprising given the large number of labels in this dataset.