### Parsing and Baseline

**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from scipy import sparse
from skmultilearn.adapt import MLkNN

**Importing the Data**

In [2]:
data = pd.read_csv('../train.csv')
data.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,1,78808586,0:0.050734 1:0.762265 2:0.754431 3:0.065255 4:...
2,2,4575775796409391158,0:0.101468 1:0.138594 2:0.377215 3:0.130509 4:...
3,3,1726546931704,0:0.186024 1:0.346484 2:0.141456 3:0.195764 4:...
4,4,4035081017105217313183,0:0.135290 1:0.277187 2:0.141456 3:0.065255 4:...


**Functions**

In [3]:
def data_format(train):
    '''
    This code block constructs a list of dictionaries. Each dictionary represents the 
    features column of one of the 15539 examples in the dataset
    '''
    feat_dicts = []
    for i in range(len(train)):
        line_dict = {}
        line = train['features'][i]
        keys = re.findall(r'(\d+):', line)
        values = re.findall(r'\d+:(\d+\.\d+)', line)
        for i in range(len(keys)):
            line_dict[int(keys[i])] = float(values[i])
        feat_dicts.append(line_dict)
        
    '''
    This code block constructs a dictionary
    Each key represents the index of an example in train
    The associatec value is a set (we have chosen set for ease of membership testing later on) which contains 
    all the labels that are associated with the corresponding example
    '''
    label_dict = {}
    for i in range(len(train)):
        labels = train['labels'][i]
        label_dict[i] = list(np.array(re.findall(r'(\d+)', labels)).astype('int'))
        
    return feat_dicts, label_dict

#Calculate label Cardinality
#this is the average number of labels for each example
N = len(label_dict)
cardinality = 0
for i in range(N):
    cardinality+=len(label_dict[i])
cardinality/=N
print('Label Cardinality: ', cardinality)

In [None]:
#Calculate Label Density
#L is the total number of labels, which is given to us
L = 3993
#The density is simply the cardinality divided by L
density = cardinality/L
print('Label Density: ', density)

We observe that the densoity is very low. This is not surprising given the large number of labels that we have in this data. 

In [4]:
data_copy = data.copy()
train_df = data_copy.sample(frac=0.8, random_state=0)
val_df = data_copy.drop(train_df.index)

In [5]:
train_df.head()

Unnamed: 0,ex_id,labels,features
6848,6848,27261182299213851416,0:0.050734 1:0.242539 2:0.141456 3:0.261019 4:...
1283,1283,5522422879823412529,0:0.050734 1:0.103945 2:0.094304 3:0.130509 4:...
10198,10198,4992008207921942195,0:0.050734 1:0.519726 2:0.801583 3:0.391528 4:...
11060,11060,114488658675694809834,0:1.488192 1:0.970155 2:2.923419 3:0.326274 4:...
5590,5590,21021122112212,0:0.033823 1:0.242539 2:0.094304 3:0.130509 4:...


In [6]:
val_df.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
13,13,1414995005481021,0:0.338225 1:1.247342 2:1.650317 3:0.717802 4:...
21,21,29973173311902418,0:0.033823 1:0.554374 2:0.094304 3:0.326274 4:...
25,25,103575591640,0:0.050734 1:0.034648 2:1.084494 4:0.262522 5:...
26,26,41396296410221458,0:0.067645 1:0.242539 2:0.094304 3:0.065255 4:...


In [7]:
train = train_df.reset_index(drop = True)
val = val_df.reset_index(drop = True)

In [8]:
train.head()

Unnamed: 0,ex_id,labels,features
0,6848,27261182299213851416,0:0.050734 1:0.242539 2:0.141456 3:0.261019 4:...
1,1283,5522422879823412529,0:0.050734 1:0.103945 2:0.094304 3:0.130509 4:...
2,10198,4992008207921942195,0:0.050734 1:0.519726 2:0.801583 3:0.391528 4:...
3,11060,114488658675694809834,0:1.488192 1:0.970155 2:2.923419 3:0.326274 4:...
4,5590,21021122112212,0:0.033823 1:0.242539 2:0.094304 3:0.130509 4:...


In [9]:
val.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,13,1414995005481021,0:0.338225 1:1.247342 2:1.650317 3:0.717802 4:...
2,21,29973173311902418,0:0.033823 1:0.554374 2:0.094304 3:0.326274 4:...
3,25,103575591640,0:0.050734 1:0.034648 2:1.084494 4:0.262522 5:...
4,26,41396296410221458,0:0.067645 1:0.242539 2:0.094304 3:0.065255 4:...


In [10]:
x_train, y_train = data_format(train)

In [11]:
len(x_train)

12431

In [12]:
x_val, y_val = data_format(val)

In [13]:
len(x_val)

3108

In [14]:
y_train

{0: [272, 611, 822, 992, 1385, 1416],
 1: [55, 224, 228, 798, 2341, 2529],
 2: [499, 2008, 2079, 2194, 2195],
 3: [114, 488, 658, 675, 694, 809, 834],
 4: [210, 211, 2211, 2212],
 5: [32, 105, 207],
 6: [26, 87, 191, 194, 292, 958],
 7: [363, 499, 500, 980, 1030, 3255],
 8: [1521, 1987, 2726, 2798, 2799],
 9: [128, 507, 591, 885, 1061, 1062],
 10: [308, 430, 480, 540, 3293],
 11: [20, 25, 26, 27, 125, 148],
 12: [358, 446, 637, 822, 887, 992],
 13: [1050, 1308, 1338, 1339, 1340],
 14: [116, 342, 463, 2809, 2903, 2904],
 15: [271, 343, 955],
 16: [531, 596, 1396, 1398, 1399],
 17: [224, 228, 445, 797, 798, 996, 2341],
 18: [25, 446, 499, 500, 804, 1044],
 19: [64, 169, 180, 250, 941],
 20: [691, 1224, 1226, 1262, 1339, 2064],
 21: [96, 351, 427, 941, 1817],
 22: [306, 660],
 23: [120, 1750, 1952],
 24: [342, 465, 691, 822, 1779, 2809],
 25: [299, 731, 733],
 26: [28, 62, 69, 99],
 27: [9, 15, 17, 32, 156, 447],
 28: [224, 228, 285, 797, 798, 2341],
 29: [46, 198, 814, 2635],
 30: [297, 

In [15]:
x_train_s =sparse.lil_matrix((len(x_train), 5000))
for i in range(len(x_train)):
    for j in list(x_train[i].keys()):
        x_train_s[i,j] = x_train[i][j]

In [16]:
x_val_s =sparse.lil_matrix((len(x_val), 5000))
for i in range(len(x_val)):
    for j in list(x_val[i].keys()):
        x_val_s[i,j] = x_val[i][j]

In [31]:
for i in y_train:
    for j in y_train[i]:
        if j > 3993:
            print("{0} {1}".format(i, j))

712 50734
908 101468
1350 33823
1425 338225
1928 372048
2160 101468
2565 439693
2642 507338
3906 135290
5397 33823
5605 50734
5700 118379
5758 50734
6345 202935
6780 33823
7374 50734
7848 33823
7945 16911
8162 152201
9394 33823
10910 169113


In [30]:
for i in y_val:
    for j in y_val[i]:
        if j > 3993:
            print("{0} {1}".format(i, j))

24 84556
310 219847
1374 33823
1448 50734
2118 84556
2627 236758
2828 69297


In [33]:
y_train_s=sparse.lil_matrix((len(y_train),3993))
for i in y_train:
    for j in y_train[i]:
        if j > 3993:
            continue
        else:
            y_train_s[i,j] = 1

In [34]:
pd.DataFrame.sparse.from_spmatrix(y_train_s)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3983,3984,3985,3986,3987,3988,3989,3990,3991,3992
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
y_val_s=sparse.lil_matrix((len(y_train),3993))
for i in y_val:
    for j in y_val[i]:
        if j > 3993:
            continue
        else:
            y_val_s[i,j] = 1

In [36]:
pd.DataFrame.sparse.from_spmatrix(y_val_s)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3983,3984,3985,3986,3987,3988,3989,3990,3991,3992
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
classifier = MLkNN()
classifier.fit(x_train_s, y_train_s)
predictions = classifier.predict(x_val_s)
classifier.score(x_val_s, y_val_s)

KeyboardInterrupt: 