In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
from IPython.display import display

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.base import SamplerMixin

print(__doc__)


Automatically created module for IPython interactive environment


In [2]:
def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,
                   class_sep=0.8, n_clusters=1):
    return make_classification(n_samples=n_samples, n_features=2,
                               n_informative=2, n_redundant=0, n_repeated=0,
                               n_classes=n_classes,
                               n_clusters_per_class=n_clusters,
                               weights=list(weights),
                               class_sep=class_sep, random_state=0)

def plot_resampling(X, y, sampling, ax):
    X_res, y_res = sampling.fit_sample(X, y)
    ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
    # make nice plotting
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
#     return Counter(y_res)


def plot_decision_function(X, y, clf, ax):
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')

In [3]:
df = pd.read_csv('data/train.csv').transpose()
df = df.reset_index()
# display(raw)
new_header = df.iloc[0] #grab the first row for the header
df = df[1:] #take the data less the header row
df.columns = new_header #set the header row as the df header
df['label'] = [0,1,1,1,2,2,3,3,3,3]
display(df)

Unnamed: 0,label_name,0610005C13Rik,0610007P14Rik,0610009B22Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,0610010K14Rik,0610011F06Rik,...,Zwilch,Zwint,Zxda,Zxdb,Zxdc,Zyg11b,Zyx,Zzef1,Zzz3,label
1,dermal,0,188,30,55,1221,23,0,1593,0,...,115,334,176,540,229,274,1446,183,292,0
2,neuralTube,1,250,137,13,761,41,0,485,0,...,368,2482,147,235,334,462,305,215,588,1
3,neuralTube.1,9,418,149,22,1298,101,0,769,0,...,604,3528,234,387,461,638,509,353,1066,1
4,neuralTube.2,1,602,478,24,882,165,1215,797,0,...,365,4956,118,87,265,597,511,96,486,1
5,BAT,26,2704,211,32,2090,184,0,2188,0,...,1207,9029,349,1170,719,1412,4542,906,1397,2
6,BAT.1,18,1877,206,44,1895,118,0,1942,0,...,1041,7509,296,941,577,1476,2994,748,947,2
7,muscle,26,1178,243,41,3447,280,0,2204,0,...,2125,4322,401,1453,1215,1347,6908,567,1680,3
8,muscle.1,21,1145,355,33,2603,204,0,1899,0,...,2022,4321,437,828,742,1203,3478,288,1386,3
9,muscle.2,0,650,1102,113,955,0,1132,1118,0,...,0,4173,23,809,13,32,1358,0,0,3
10,muscle.3,11,377,797,317,1571,36,302,693,0,...,125,2530,111,39,299,284,1479,102,185,3


In [4]:
ros = RandomOverSampler(random_state=0)
smote = SMOTE(random_state=0)

predictors = [x for x in df.columns if x not in ['label_name', 'label']]
# X = df[predictors]
X = df.drop(['label_name','label'], axis=1)
y = df.label
X_resampled, y_resampled = ros.fit_sample(X.values, y)

# print(sorted(Counter(y_resampled).items()))
print 'X_resampled shape'
print type(X_resampled)
# y_resampled = y_resampled.reshape(16,1)
print y_resampled
print y_resampled.shape


X_resampled shape
<type 'numpy.ndarray'>
[0 1 1 1 2 2 3 3 3 3 0 0 0 1 2 2]
(16,)


In [10]:
print df.columns

df_sampled = pd.DataFrame(data=X_resampled, columns=predictors)

df_sampled['label'] = y_resampled
display(df_sampled)

Index([u'label_name', u'0610005C13Rik', u'0610007P14Rik', u'0610009B22Rik',
       u'0610009L18Rik', u'0610009O20Rik', u'0610010B08Rik', u'0610010F05Rik',
       u'0610010K14Rik', u'0610011F06Rik',
       ...
       u'Zwilch', u'Zwint', u'Zxda', u'Zxdb', u'Zxdc', u'Zyg11b', u'Zyx',
       u'Zzef1', u'Zzz3', u'label'],
      dtype='object', name=0, length=20715)


Unnamed: 0,0610005C13Rik,0610007P14Rik,0610009B22Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,0610010K14Rik,0610011F06Rik,0610012G03Rik,...,Zwilch,Zwint,Zxda,Zxdb,Zxdc,Zyg11b,Zyx,Zzef1,Zzz3,label
0,0.0,188.0,30.0,55.0,1221.0,23.0,0.0,1593.0,0.0,992.0,...,115.0,334.0,176.0,540.0,229.0,274.0,1446.0,183.0,292.0,0
1,1.0,250.0,137.0,13.0,761.0,41.0,0.0,485.0,0.0,333.0,...,368.0,2482.0,147.0,235.0,334.0,462.0,305.0,215.0,588.0,1
2,9.0,418.0,149.0,22.0,1298.0,101.0,0.0,769.0,0.0,472.0,...,604.0,3528.0,234.0,387.0,461.0,638.0,509.0,353.0,1066.0,1
3,1.0,602.0,478.0,24.0,882.0,165.0,1215.0,797.0,0.0,851.0,...,365.0,4956.0,118.0,87.0,265.0,597.0,511.0,96.0,486.0,1
4,26.0,2704.0,211.0,32.0,2090.0,184.0,0.0,2188.0,0.0,1596.0,...,1207.0,9029.0,349.0,1170.0,719.0,1412.0,4542.0,906.0,1397.0,2
5,18.0,1877.0,206.0,44.0,1895.0,118.0,0.0,1942.0,0.0,1326.0,...,1041.0,7509.0,296.0,941.0,577.0,1476.0,2994.0,748.0,947.0,2
6,26.0,1178.0,243.0,41.0,3447.0,280.0,0.0,2204.0,0.0,1495.0,...,2125.0,4322.0,401.0,1453.0,1215.0,1347.0,6908.0,567.0,1680.0,3
7,21.0,1145.0,355.0,33.0,2603.0,204.0,0.0,1899.0,0.0,990.0,...,2022.0,4321.0,437.0,828.0,742.0,1203.0,3478.0,288.0,1386.0,3
8,0.0,650.0,1102.0,113.0,955.0,0.0,1132.0,1118.0,0.0,4204.0,...,0.0,4173.0,23.0,809.0,13.0,32.0,1358.0,0.0,0.0,3
9,11.0,377.0,797.0,317.0,1571.0,36.0,302.0,693.0,0.0,2133.0,...,125.0,2530.0,111.0,39.0,299.0,284.0,1479.0,102.0,185.0,3


In [23]:
labels = df.label.unique()
factor = 5

print labels
for cell_type in labels:
    print 'replicate cell type: ' + str(cell_type)
    df_cell = df_sampled.loc[df_sampled.label == cell_type]
    df_replicate = pd.DataFrame(columns=df_cell.columns)
    for column in df_cell:
        mean = df_cell[column].mean()
        std = df_cell[column].std()
        if std==0.0:
            print 'replace std'
            std=1.0
        gene_col = np.random.normal(mean,std,(factor, 1))
#         print gene_col
#     TODO: concat axis 1 all gene_col then append to df_cell
        df_replicate[column] =gene_col
    print 'complete replicating'
    display(df_replicate)
    print'//'

[0 1 2 3]
replicate cell type: 0
replace std


ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
# for each label, for each gene, normalize
def multiply(X,y, iteration):
#     print 'mean & std'
#     print X.mean(),X.std()
#     new_X = X
#     new_y = y
#     for i in range(iteration):
#         X_noise = np.random.normal(X.mean(),X.std(),X.shape)
#         temp = X+X_noise
#         new_X = np.vstack((new_X, temp ))
#         new_y = np.vstack((new_y, y ))
#     return new_X, new_y

    

In [18]:
# test = np.random.normal(111,1,(100,123))
# print test
# print test.shape

[[ 111.47584977  112.86284405  112.00006336 ...,  109.97862799  110.2973922
   110.70367461]
 [ 111.16191826  111.48415472  110.38121595 ...,  111.13168858
   111.93882246  109.44709763]
 [ 112.05222915  111.928818    110.54014563 ...,  111.9941992   111.88182201
   109.89999274]
 ..., 
 [ 112.11355915  111.30692191  110.45128332 ...,  111.21837626
   111.38526393  111.50992339]
 [ 111.70566133  109.78863225  109.32833288 ...,  111.59148864
   110.42246852  110.61413547]
 [ 109.61264484  110.08329967  111.69544217 ...,  110.55138381
   110.49730606  110.01057567]]
(100, 123)


In [None]:
# y_resampled = y_resampled.reshape((16,1))

# reshape_dim = y_resampled.shape[0]*factor
X_multiplied,y_multiplied = multiply(X_resampled, y_resampled.reshape((16,1)), factor)
print X_multiplied.shape, y_multiplied.shape
print X_resampled.shape, y_resampled.shape

print(sorted(Counter(y_multiplied.reshape(y_multiplied.shape[0],)).items()))  #multipled class distribution

In [None]:
# Make an identity sampler
class FakeSampler(SamplerMixin):

    def fit(self, X, y):
        self.ratio_ = 1
        self.X_hash_ = hash_X_y(X, y)
        return self

    def sample(self, X, y):
        return X,

    def _sample(self, X, y):
        pass

    def fit_sample(self, X, y):
        return X, y
    
fig, ((ax1, ax2)) = plt.subplots(1, 2, figsize=(15, 15))
sampler = FakeSampler()
plot_resampling(X_resampled, y_resampled, sampler, ax1)
ax1.set_title('Original data - y={}'.format(Counter(y)))

plot_resampling(X_multiplied,y_multiplied, sampler, ax2)
ax2.set_title('Original data - y={}'.format(Counter(y)))

In [None]:
plt.show()

In [None]:
print X_multiplied.shape
print y_multiplied.shape

In [None]:
# TODO: plot t-sne
np.save('data/X_multiplied',X_multiplied)
np.save('data/y_multiplied',y_multiplied)