In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

root_dir = 'e:/mlprojectdata/'
train_labels = pd.read_csv(root_dir + 'train.csv')
train_labels.head()

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [2]:
label_names = {
    0:  "Nucleoplasm",  
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center",   
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",   
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",   
    8:  "Peroxisomes",   
    9:  "Endosomes",   
    10:  "Lysosomes",   
    11:  "Intermediate filaments",   
    12:  "Actin filaments",   
    13:  "Focal adhesion sites",   
    14:  "Microtubules",   
    15:  "Microtubule ends",   
    16:  "Cytokinetic bridge",   
    17:  "Mitotic spindle",   
    18:  "Microtubule organizing center",   
    19:  "Centrosome",   
    20:  "Lipid droplets",   
    21:  "Plasma membrane",   
    22:  "Cell junctions",   
    23:  "Mitochondria",   
    24:  "Aggresome",   
    25:  "Cytosol",   
    26:  "Cytoplasmic bodies",   
    27:  "Rods & rings"
}

reverse_train_labels = dict((v,k) for k,v in label_names.items())

def fill_targets(row):
    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    for num in row.Target:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row

In [3]:
for key in label_names.keys():
    train_labels[label_names[key]] = 0
train_labels = train_labels.apply(fill_targets, axis=1)
train_labels.head()

Unnamed: 0,Id,Target,Nucleoplasm,Nuclear membrane,Nucleoli,Nucleoli fibrillar center,Nuclear speckles,Nuclear bodies,Endoplasmic reticulum,Golgi apparatus,...,Microtubule organizing center,Centrosome,Lipid droplets,Plasma membrane,Cell junctions,Mitochondria,Aggresome,Cytosol,Cytoplasmic bodies,Rods & rings
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,"[16, 0]",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,"[7, 1, 2, 0]",1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,[5],0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,[1],0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,[18],0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [91]:
train_labels[3106:3107]

Unnamed: 0,Id,Target,Nucleoplasm,Nuclear membrane,Nucleoli,Nucleoli fibrillar center,Nuclear speckles,Nuclear bodies,Endoplasmic reticulum,Golgi apparatus,...,Microtubule organizing center,Centrosome,Lipid droplets,Plasma membrane,Cell junctions,Mitochondria,Aggresome,Cytosol,Cytoplasmic bodies,Rods & rings
3106,19e9765e-bbc3-11e8-b2bc-ac1f6b6435d0,"[16, 14, 17]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
target_counts = train_labels.drop(["Id", "Target"],axis=1).sum(axis=0).sort_values(ascending=False)
target_counts

Nucleoplasm                      12885
Cytosol                           8228
Plasma membrane                   3777
Nucleoli                          3621
Mitochondria                      2965
Golgi apparatus                   2822
Nuclear bodies                    2513
Nuclear speckles                  1858
Nucleoli fibrillar center         1561
Centrosome                        1482
Nuclear membrane                  1254
Intermediate filaments            1093
Microtubules                      1066
Endoplasmic reticulum             1008
Microtubule organizing center      902
Cell junctions                     802
Actin filaments                    688
Focal adhesion sites               537
Cytokinetic bridge                 530
Cytoplasmic bodies                 328
Aggresome                          322
Mitotic spindle                    210
Lipid droplets                     172
Peroxisomes                         53
Endosomes                           45
Lysosomes                

In [134]:
def data_csv_generator(data_csv, save_path, least_num=100, train_rate=2/3, total_rate = 0.1):
    counts = data_csv.drop(["Id", "Target"],axis=1).sum(axis=0).sort_values(ascending=True)
    train_targets = pd.DataFrame(columns = data_csv.columns)
    test_targets = pd.DataFrame(columns = data_csv.columns)
    
    for i in range(len(counts)):
        if counts.values[i] > least_num:
            break
        group = data_csv[data_csv[counts.index[i]] > 0]
        #group = data_csv.groupby[counts.index[i]].get_group(1)
        train_target = group.sample(frac = train_rate)
        test_target = group.drop(train_target.index)
        
        train_targets = pd.concat([train_targets,train_target])
        test_targets = pd.concat([test_targets,test_target])
        print('column:'+str(counts.index[i])+' proceeded.') 
        
    # delete duplicated index
    all_targets = pd.concat([train_targets,test_targets])
    all_targets = all_targets.loc[~all_targets.index.duplicated(keep='first')]

    rest = data_csv.drop(all_targets.index).sample(frac = total_rate)
    rest_train = rest.sample(frac = train_rate)
    rest_test = rest.drop(rest_train.index)
    
    train_targets = pd.concat([train_targets,rest_train])
    test_targets = pd.concat([test_targets,rest_test])
    # delete duplicated index
    train_targets = train_targets.loc[~train_targets.index.duplicated(keep='first')]
    test_targets = test_targets.loc[~test_targets.index.duplicated(keep='first')]
    # suffle the output data
    train_targets = train_targets.sample(frac = 1).reset_index(drop = True)
    test_targets = test_targets.sample(frac = 1).reset_index(drop = True)
    
    
    train_targets.to_csv(save_path + 'random_trainset.csv',index = False)
    test_targets.to_csv(save_path + 'random_testset.csv',index = False)
    
    return train_targets,test_targets


In [135]:
save_path = 'e:/mlprojectdata/samples/'
sample_trainset,sample_testset = data_csv_generator(train_labels, save_path)

column:Rods & rings proceeded.
column:Microtubule ends proceeded.
column:Lysosomes proceeded.
column:Endosomes proceeded.
column:Peroxisomes proceeded.


In [141]:
train_counts = sample_trainset.drop(["Id", "Target"],axis=1).sum(axis=0).sort_values(ascending=False)
print(sample_trainset.shape)
train_counts

(2156, 30)


Nucleoplasm                      855.0
Cytosol                          553.0
Nucleoli                         257.0
Plasma membrane                  231.0
Mitochondria                     198.0
Golgi apparatus                  195.0
Nuclear bodies                   183.0
Nuclear speckles                 121.0
Centrosome                        98.0
Nucleoli fibrillar center         91.0
Endoplasmic reticulum             88.0
Nuclear membrane                  86.0
Microtubules                      77.0
Intermediate filaments            72.0
Cell junctions                    64.0
Actin filaments                   50.0
Microtubule organizing center     47.0
Endosomes                         37.0
Cytokinetic bridge                35.0
Peroxisomes                       35.0
Focal adhesion sites              31.0
Lysosomes                         27.0
Cytoplasmic bodies                20.0
Aggresome                         16.0
Microtubule ends                  14.0
Mitotic spindle          

In [140]:
test_counts = sample_testset.drop(["Id", "Target"],axis=1).sum(axis=0).sort_values(ascending=False)
print(sample_testset.shape)
test_counts

(1083, 30)


Nucleoplasm                      436.0
Cytosol                          278.0
Plasma membrane                  129.0
Nucleoli                         118.0
Nuclear bodies                    97.0
Mitochondria                      96.0
Golgi apparatus                   92.0
Nuclear speckles                  63.0
Nucleoli fibrillar center         59.0
Centrosome                        51.0
Endoplasmic reticulum             42.0
Nuclear membrane                  41.0
Microtubule organizing center     30.0
Microtubules                      30.0
Cell junctions                    28.0
Actin filaments                   28.0
Intermediate filaments            28.0
Endosomes                         23.0
Cytokinetic bridge                18.0
Peroxisomes                       18.0
Focal adhesion sites              17.0
Lysosomes                         16.0
Aggresome                         10.0
Cytoplasmic bodies                 9.0
Microtubule ends                   7.0
Lipid droplets           