In [21]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Data cleaning: row numbers of training examples that have no labels. The list
# is handed to pd.read_csv(skiprows=...) so these rows are never loaded. Per the
# original note they are only a handful of cases out of the whole data set, so
# they are dropped instead of being handled in a more elaborate way.
train_skiprows = [
    94, 253, 510, 1528, 1940, 1954, 4030,
    4427, 4644, 4728, 5232, 5763, 6296, 6334,
    6704, 7084, 9478, 9676, 10000, 10489, 10737,
    10911, 11675, 12281, 12600, 13148, 14168, 14723,
]


In [22]:
# Load the raw training CSV (comma-separated), skipping the rows listed in
# train_skiprows, i.e. the examples that have no labels.
raw_data = pd.read_csv(
    "../data/raw/train.csv",
    sep=",",
    skiprows=train_skiprows,
)

In [23]:
# Each entry of the labels column is one comma-separated string; split it and
# convert every piece to int so each row holds a list of integer labels.
raw_data["labels"] = raw_data["labels"].str.split(",").apply(
    lambda parts: [int(part) for part in parts]
)

In [24]:
# Preview the first rows of raw_data.
raw_data.head()

Unnamed: 0,ex_id,labels,features
0,0,"[446, 521, 1149, 1249, 1265, 1482]",0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,1,"[78, 80, 85, 86]",0:0.050734 1:0.762265 2:0.754431 3:0.065255 4:...
2,2,"[457, 577, 579, 640, 939, 1158]",0:0.101468 1:0.138594 2:0.377215 3:0.130509 4:...
3,3,"[172, 654, 693, 1704]",0:0.186024 1:0.346484 2:0.141456 3:0.195764 4:...
4,4,"[403, 508, 1017, 1052, 1731, 3183]",0:0.135290 1:0.277187 2:0.141456 3:0.065255 4:...


In [25]:
# Parse the features column. Each entry is a space-separated string of
# "index:weight" tokens. Every token is split on ":" once; the first piece
# becomes an int key and the second a float value, which yields one
# {feature index: weight} dictionary per row.
raw_data["features"] = raw_data["features"].str.split(" ").apply(
    lambda tokens: {
        int(pair[0]): float(pair[1])
        for pair in (token.split(":") for token in tokens)
    }
)

In [7]:
# Preview the first rows of raw_data.
raw_data.head()

Unnamed: 0,ex_id,labels,features
0,0,"[446, 521, 1149, 1249, 1265, 1482]","{0: 0.084556, 1: 0.138594, 2: 0.094304, 3: 0.1..."
1,1,"[78, 80, 85, 86]","{0: 0.050734, 1: 0.762265, 2: 0.754431, 3: 0.0..."
2,2,"[457, 577, 579, 640, 939, 1158]","{0: 0.101468, 1: 0.138594, 2: 0.377215, 3: 0.1..."
3,3,"[172, 654, 693, 1704]","{0: 0.186024, 1: 0.346484, 2: 0.141456, 3: 0.1..."
4,4,"[403, 508, 1017, 1052, 1731, 3183]","{0: 0.13529, 1: 0.277187, 2: 0.141456, 3: 0.06..."


In [26]:
# Replace each feature dictionary with a two-element list:
# [list of feature keys, list of feature values]. Both inner lists follow the
# dictionary's iteration order, so position i in one matches position i in the other.
raw_data["features"] = raw_data["features"].apply(
    lambda mapping: [list(mapping), list(mapping.values())]
)

In [30]:
# Display the labels column of raw_data.
raw_data["labels"]

0               [446, 521, 1149, 1249, 1265, 1482]
1                                 [78, 80, 85, 86]
2                  [457, 577, 579, 640, 939, 1158]
3                            [172, 654, 693, 1704]
4               [403, 508, 1017, 1052, 1731, 3183]
                           ...                    
15506              [592, 595, 617, 694, 724, 1099]
15507    [64, 180, 342, 418, 903, 941, 1997, 2345]
15508                              [284, 314, 343]
15509             [165, 260, 317, 683, 1218, 1501]
15510                         [203, 204, 681, 686]
Name: labels, Length: 15511, dtype: object

In [31]:

#test = MultiLabelBinarizer().fit_transform(raw_data["labels"])

In [9]:
NUM_FEATURES = 5000
NUM_CLASSES = 3993

# Pre-allocate two zero-filled frames with one row per row of raw_data:
# `features` gets NUM_FEATURES columns and `labels` gets NUM_CLASSES columns.
# NOTE: np.zeros produces dense arrays, so these frames are dense rather than
# sparse; only the values written later are non-zero.
temp = np.zeros(shape=(len(raw_data), NUM_FEATURES))
features = pd.DataFrame(data=temp)

temp = np.zeros(shape=(len(raw_data), NUM_CLASSES))
labels = pd.DataFrame(data=temp)

In [10]:
# Preview the first rows of raw_data.
raw_data.head()

Unnamed: 0,ex_id,labels,features
0,0,"[446, 521, 1149, 1249, 1265, 1482]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 19, 2..."
1,1,"[78, 80, 85, 86]","[[0, 1, 2, 3, 4, 6, 10, 11, 13, 15, 16, 18, 21..."
2,2,"[457, 577, 579, 640, 939, 1158]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
3,3,"[172, 654, 693, 1704]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
4,4,"[403, 508, 1017, 1052, 1731, 3183]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14..."


In [11]:
#Puts the weight of each feature per example into the empty sparse matrix.
def create_features(row, target=None):
    """Write the feature weights of one example into a feature frame.

    Args:
        row: One row of raw_data, as passed by ``DataFrame.apply(..., axis=1)``.
            ``row["features"][0]`` holds the column labels to write to,
            ``row["features"][1]`` the values to write, and ``row.name`` is
            used as the row label in the destination frame.
        target: Frame to write into. When omitted, the module-level
            ``features`` frame is used, which keeps the original behaviour
            while allowing the helper to fill any other frame as well.

    Returns:
        None. The destination frame is modified in place.
    """
    if target is None:
        # Fall back to the notebook-level frame for existing callers.
        target = features
    feature_cols = row["features"][0]
    weights = row["features"][1]
    target.loc[row.name, feature_cols] = weights

#Puts a 1 on each row of the matrix that has label at that column.
def create_labels(row, target=None):
    """Set the label columns of one example to 1 in a label frame.

    Args:
        row: One row of raw_data, as passed by ``DataFrame.apply(..., axis=1)``.
            ``row["labels"]`` holds the column labels to set and ``row.name``
            is used as the row label in the destination frame.
        target: Frame to write into. When omitted, the module-level ``labels``
            frame is used, which keeps the original behaviour while allowing
            the helper to fill any other frame as well.

    Returns:
        None. The destination frame is modified in place.
    """
    if target is None:
        # Fall back to the notebook-level frame for existing callers.
        target = labels
    target.loc[row.name, row["labels"]] = 1

In [12]:
# Fill the feature frame row by row; create_features works through its side
# effect, so the returned Series only contains None values.
raw_data.apply(create_features, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
15506    None
15507    None
15508    None
15509    None
15510    None
Length: 15511, dtype: object

In [13]:
# Fill the label frame row by row; create_labels works through its side
# effect, so the returned Series only contains None values.
raw_data.apply(create_labels, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
15506    None
15507    None
15508    None
15509    None
15510    None
Length: 15511, dtype: object

In [14]:
# Create the output directory (and any missing parents); do nothing if it already exists.
Path("../data/expanded").mkdir(parents=True, exist_ok=True)

In [15]:
# Write both frames to ../data/expanded as CSV files without header row and
# without index column; file_prefix selects the file name prefix.
file_prefix = "train"
features.to_csv(
    f"../data/expanded/{file_prefix}_features.csv",
    header=False,
    index=False,
)
labels.to_csv(
    f"../data/expanded/{file_prefix}_labels.csv",
    header=False,
    index=False,
)
print(f"Saved dataframes as {file_prefix}_features.csv and {file_prefix}_labels.csv")

Saved dataframes as train_features.csv and train_labels.csv


In [20]:
# Preview the first rows of the filled feature frame.
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.084556,0.138594,0.094304,0.195764,0.612552,0.106491,0.137765,0.145839,0.30461,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.050734,0.762265,0.754431,0.065255,0.35003,0.0,0.137765,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.101468,0.138594,0.377215,0.130509,0.175015,0.745434,0.137765,1.020873,1.218441,0.309297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.186024,0.346484,0.141456,0.195764,0.262522,0.425962,0.551062,0.437517,0.60922,0.618594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.13529,0.277187,0.141456,0.065255,1.137597,0.106491,0.137765,0.583356,0.152305,0.154649,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# NOTE(review): scratch cell, apparently unrelated to the preprocessing above;
# candidate for removal before sharing the notebook.
test = 'five'

In [13]:
# NOTE(review): scratch experiment, apparently unrelated to the preprocessing above.
# Repeats until `test` is exactly of type int; the body prints "ss" and then
# assigns 5, so the loop stops after that pass.
while type(test) is not int:
    print("ss")
    test = 5

ss


In [14]:
# NOTE(review): scratch cell, apparently unrelated to the preprocessing above;
# candidate for removal before sharing the notebook.
s = '5'

In [15]:
# Scratch check: str.isdigit() on the string assigned to `s`.
s.isdigit()

True