### Parsing and Baseline

**Importing Libraries**

In [0]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from scipy import sparse
from skmultilearn.adapt import MLkNN

**Importing the Data**

In [71]:
# mount your Google Drive
from google.colab import drive
drive.mount('/drive/')

Drive already mounted at /drive/; to attempt to forcibly remount, call drive.mount("/drive/", force_remount=True).


In [72]:
path = '/drive/My Drive/ML Project/DATA'
data = pd.read_csv(path + '/train.csv')
data.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,1,78808586,0:0.050734 1:0.762265 2:0.754431 3:0.065255 4:...
2,2,4575775796409391158,0:0.101468 1:0.138594 2:0.377215 3:0.130509 4:...
3,3,1726546931704,0:0.186024 1:0.346484 2:0.141456 3:0.195764 4:...
4,4,4035081017105217313183,0:0.135290 1:0.277187 2:0.141456 3:0.065255 4:...


**Functions**

In [0]:
def data_format(train):
    '''
    This code block constructs a list of dictionaries. Each dictionary represents the 
    features column of one of the 15539 examples in the dataset
    '''
    feat_dicts = []
    for i in range(len(train)):
        line_dict = {}
        line = train['features'][i]
        keys = re.findall(r'(\d+):', line)
        values = re.findall(r'\d+:(\d+\.\d+)', line)
        for i in range(len(keys)):
            line_dict[int(keys[i])] = float(values[i])
        feat_dicts.append(line_dict)
        
    '''
    This code block constructs a dictionary
    Each key represents the index of an example in train
    The associatec value is a set (we have chosen set for ease of membership testing later on) which contains 
    all the labels that are associated with the corresponding example
    '''
    label_dict = {}
    for i in range(len(train)):
        labels = train['labels'][i]
        label_dict[i] = list(np.array(re.findall(r'(\d+)', labels)).astype('int'))
        
    return feat_dicts, label_dict

In [0]:
'''
This function will take a dictionary as input, where each key is an instance 
and each dictionary[key] corresponds to a list of labels for each instance. 
This dictionary will later be tranformed into a sparce matrix, where each 
column represents a label value so that there exists a column for each integer 
between 0 and max(label). This sparce representation is particularly 
inconvenient and inefficient for our data, given that there are hundreds of 
thousands of integers that don't appear in our label set. To limit the size of 
our sparce matrix and to preserve interpretation of our model, we will recode 
abnormally large labels.
'''
def recodeLargeLabels(dictn):
   
  for key in dictn:

    labelList = []

    for label in dictn[key]:
      
      if label > 3993:

        if label == 16911: label = 3994
        if label == 33823: label = 3995
        if label == 50734: label = 3996
        if label == 69297: label = 3997
        if label == 84556: label = 3998
        if label == 101468: label = 3999
        if label == 118379: label = 4000
        if label == 135290: label = 4001
        if label == 152201: label = 4002
        if label == 169113: label = 4003
        if label == 202935: label = 4004
        if label == 219847: label = 4005
        if label == 236758: label = 4006
        if label == 338225: label = 4007
        if label == 372048: label = 4008
        if label == 439693: label = 4009
        if label == 507338: label = 4010

    labelList.append(label)

    # if any labels have been updated
    if dictn[key] != labelList:
      # update dictionary key
      change = {key: labelList}
      dictn.update(change)

  return dictn

#Calculate label Cardinality
#this is the average number of labels for each example
N = len(label_dict)
cardinality = 0
for i in range(N):
    cardinality+=len(label_dict[i])
cardinality/=N
print('Label Cardinality: ', cardinality)

In [0]:
#Calculate Label Density
#L is the total number of labels, which is given to us
L = 3993
#The density is simply the cardinality divided by L
density = cardinality/L
print('Label Density: ', density)

NameError: ignored

We observe that the densoity is very low. This is not surprising given the large number of labels that we have in this data. 

In [0]:
data_copy = data.copy()
train_df = data_copy.sample(frac=0.8, random_state=0)
val_df = data_copy.drop(train_df.index)

In [76]:
train_df.head()

Unnamed: 0,ex_id,labels,features
6848,6848,27261182299213851416,0:0.050734 1:0.242539 2:0.141456 3:0.261019 4:...
1283,1283,5522422879823412529,0:0.050734 1:0.103945 2:0.094304 3:0.130509 4:...
10198,10198,4992008207921942195,0:0.050734 1:0.519726 2:0.801583 3:0.391528 4:...
11060,11060,114488658675694809834,0:1.488192 1:0.970155 2:2.923419 3:0.326274 4:...
5590,5590,21021122112212,0:0.033823 1:0.242539 2:0.094304 3:0.130509 4:...


In [77]:
val_df.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
13,13,1414995005481021,0:0.338225 1:1.247342 2:1.650317 3:0.717802 4:...
21,21,29973173311902418,0:0.033823 1:0.554374 2:0.094304 3:0.326274 4:...
25,25,103575591640,0:0.050734 1:0.034648 2:1.084494 4:0.262522 5:...
26,26,41396296410221458,0:0.067645 1:0.242539 2:0.094304 3:0.065255 4:...


In [0]:
train = train_df.reset_index(drop = True)
val = val_df.reset_index(drop = True)

In [79]:
train.head()

Unnamed: 0,ex_id,labels,features
0,6848,27261182299213851416,0:0.050734 1:0.242539 2:0.141456 3:0.261019 4:...
1,1283,5522422879823412529,0:0.050734 1:0.103945 2:0.094304 3:0.130509 4:...
2,10198,4992008207921942195,0:0.050734 1:0.519726 2:0.801583 3:0.391528 4:...
3,11060,114488658675694809834,0:1.488192 1:0.970155 2:2.923419 3:0.326274 4:...
4,5590,21021122112212,0:0.033823 1:0.242539 2:0.094304 3:0.130509 4:...


In [80]:
val.head()

Unnamed: 0,ex_id,labels,features
0,0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,13,1414995005481021,0:0.338225 1:1.247342 2:1.650317 3:0.717802 4:...
2,21,29973173311902418,0:0.033823 1:0.554374 2:0.094304 3:0.326274 4:...
3,25,103575591640,0:0.050734 1:0.034648 2:1.084494 4:0.262522 5:...
4,26,41396296410221458,0:0.067645 1:0.242539 2:0.094304 3:0.065255 4:...


In [0]:
x_train, y_train = data_format(train)

In [82]:
len(x_train)

12431

In [0]:
x_val, y_val = data_format(val)

In [84]:
len(x_val)

3108

In [0]:
y_train

In [0]:
x_train_s =sparse.lil_matrix((len(x_train), 5000))
for i in range(len(x_train)):
    for j in list(x_train[i].keys()):
        x_train_s[i,j] = x_train[i][j]

In [0]:
pd.DataFrame.sparse.from_spmatrix(x_train_s)

In [0]:
x_val_s =sparse.lil_matrix((len(x_val), 5000))
for i in range(len(x_val)):
    for j in list(x_val[i].keys()):
        x_val_s[i,j] = x_val[i][j]

In [0]:
pd.DataFrame.sparse.from_spmatrix(x_val_s)

In [92]:
for i in y_train:
    for j in y_train[i]:
        if j > 3993:
            print("{0} {1}".format(i, j))

712 50734
908 101468
1350 33823
1425 338225
1928 372048
2160 101468
2565 439693
2642 507338
3906 135290
5397 33823
5605 50734
5700 118379
5758 50734
6345 202935
6780 33823
7374 50734
7848 33823
7945 16911
8162 152201
9394 33823
10910 169113


In [93]:
for i in y_val:
    for j in y_val[i]:
        if j > 3993:
            print("{0} {1}".format(i, j))

24 84556
310 219847
1374 33823
1448 50734
2118 84556
2627 236758
2828 69297


In [94]:
# recode large labels
y_train = recodeLargeLabels(y_train)
y_val = recodeLargeLabels(y_val)

print('Training Updates')
for i in y_train:
    for j in y_train[i]:
        if j > 3993:
            print("{0} {1}".format(i, j))

print('\nValidation Updates')
for i in y_val:
    for j in y_val[i]:
        if j > 3993:
            print("{0} {1}".format(i, j))

Training Updates
712 3996
908 3999
1350 3995
1425 4007
1928 4008
2160 3999
2565 4009
2642 4010
3906 4001
5397 3995
5605 3996
5700 4000
5758 3996
6345 4004
6780 3995
7374 3996
7848 3995
7945 3994
8162 4002
9394 3995
10910 4003

Validation Updates
24 3998
310 4005
1374 3995
1448 3996
2118 3998
2627 4006
2828 3997


In [0]:
y_train_s=sparse.lil_matrix((len(y_train),4011))
for i in y_train:
    for j in y_train[i]:
        y_train_s[i,j] = 1

In [0]:
pd.DataFrame.sparse.from_spmatrix(y_train_s)

In [0]:
y_val_s=sparse.lil_matrix((len(y_train),4011))
for i in y_val:
    for j in y_val[i]:
          y_val_s[i,j] = 1

In [99]:
pd.DataFrame.sparse.from_spmatrix(y_val_s)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,3971,3972,3973,3974,3975,3976,3977,3978,3979,3980,3981,3982,3983,3984,3985,3986,3987,3988,3989,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999,4000,4001,4002,4003,4004,4005,4006,4007,4008,4009,4010
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
classifier = MLkNN()
classifier.fit(x_train_s, y_train_s)
predictions = classifier.predict(x_val_s)
classifier.score(x_val_s, y_val_s)

KeyboardInterrupt: 