In [1]:
# --- added to file ----
# Takes in a String, "bucket_name", a string, "remote_folder",
# and a list of strings or a single string, "keywords". Gets all
# s3 keys for bucket_name/remote_folder. Uses a list convention
# to go through keywords (i.e): ['a', 'b', 'c OR d OR e'] will 
# find all files containing 'a' and 'b' and either 'c', 'd', or 'e'.
# Using '' will return every file key in folder.
def get_s3_keys(bucket_name, remote_folder, keywords=''):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    obj_list = []
    keywords = [i.split('OR') for i in list(keywords)]
    keywords = [list(map(lambda x:x.strip(), i)) for i in keywords]
    for object in bucket.objects.all():
        filename = object.key.split("/")[-1]
        kwds_in = all(any(k in filename for k in ([keyword]*isinstance(keyword, str) or keyword)) for keyword in keywords)
        if remote_folder in object.key and kwds_in:
            obj_list.append(s3.Object(object.bucket_name, object.key))
    return obj_list

In [2]:
import pandas as pd
from os import listdir, getcwd, chdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import csv

In [3]:
!pwd

/c/Users/david/Documents/nancework/source/diff_predictor/notebooks


In [4]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import csv

# Takes in a path and list of keywords. Returns a list of filenames
# that are within the path that contain one of the keyword in the list.
# Set keyword to "" to get all files in the path.
def get_files(path, keywords = ["features_ OR msd_"]):
    """
    Takes in a path and list of keywords. Returns a list of filenames
    that are within the path that contain one of the keyword in the list.
    Set keyword to "" to get all files in the path.
    """
    keywords = [i.split('OR') for i in list(keywords)]
    keywords = [list(map(lambda x:x.strip(), i)) for i in keywords]
    files = [f for f in listdir(path) if isfile(join(path, f))]
    file_list = []
    for filename in files:
        kwds_in = all(any(k in filename for k in ([keyword]*isinstance(keyword, str) or keyword)) for keyword in keywords)
        if (kwds_in):
            file_list.append(filename)
    return file_list

# Pre: Both files must exhist; Feature must be in the feature file
# Throws a FileNotFoundError exception if preconditions not met
#
# Adds a feature from produced features file to the track file.
def combine_track(trackFile, feature=None, featureDF=None):
    '''
    Adds a feature or set of feature to the corresponding track file
    Preconditions: Both files must exhist; Feature(s) must be in the 
    feature file. 
    
    Input:
    ------
    trackFile : string :
        The file location of the dataframe 
    feature : list : string : tuple :
        feature or set of features to attach to track dataframe
    Output:
    -------
    trackDF : pd.DataFrame :
        DataFrame of the combined tracks
    '''
    if isinstance(trackFile, str):
        try:
            trackDF = pd.read_csv(trackFile)
        except FileNotFoundError:
            raise("DataFrame cannot be located")
    else:
        trackDF = trackFile
    if featureDF is None:
        featureDF = find_pair(trackFile)
    if feature is None:
        feature = np.setdiff1d(featureDF.columns.values, trackDF.columns.values)
    elif isinstance(feature, str):
        feature = [feature]
    elif isinstance(feature, tuple):
        feature = list(feature)
    trackDF = trackDF.reindex(columns=[*trackDF.columns.tolist()] + [*feature], fill_value=np.nan)
    maxFrames = int(trackDF["Frame"].max())
    maxTracks = int(trackDF["Track_ID"].max())
    for i in range(int(maxTracks)+1):
        for feat in feature:
            trackFeature = featureDF.loc[i, feat]
            trackDF.loc[(maxFrames)*(i+1) + i, feat] = trackFeature
    return trackDF

# Trys to find the feature file pair for either the msd_ or Traj_
# Returns the pd.DataFrame of that pair if found.
def find_pair(filename):
    """
    Trys to find the feature file pair for either the msd_ or traj_ df,
    or the Traj_ or msd_ file for input feauture_ file.
    Returns the pd.DataFrame of that pair if found.
    """
    if "msd_" in filename:
        try:
            filename = filename.replace("msd_", "").replace("Traj_", "")
            filename = filename.split("/")
            filename[-1] = "features_" + filename[-1]
            featureFile = "/".join(filename)
            return pd.read_csv(featureFile)
        except FileNotFoundError:
            print("File pair could not be found")  
    elif "features_" in filename:
        try:
            filename = filename.replace("features_", "")
            filename = filename.split("/")
            filename[-1] = "msd_" + filename[-1]
            featureFile = "/".join(filename)
            return pd.read_csv(featureFile)
        except:
            try:
                filename = filename.replace("features_", "")
                filename = filename.split("/")
                filename[-1] = "Traj_" + filename[-1]
                featureFile = "/".join(filename)
                return pd.read_csv(featureFile)
            except FileNotFoundError:
                print("File pair could not be found")

In [5]:
if not 'workbookDir' in globals():
    workbookDir = getcwd()
print('Current Notebook Dir: ' + workbookDir)
chdir(workbookDir) # Go to current workbook Dir
chdir('..')        # Go up one
workbookDir = getcwd()
print(f'Using current directory for loading data: {getcwd()}')

Current Notebook Dir: C:\Users\david\Documents\nancework\source\diff_predictor\notebooks
Using current directory for loading data: C:\Users\david\Documents\nancework\source\diff_predictor


In [6]:
dataset_path = './raw_data_region_cortex_striatum'
track_file_list = get_files(dataset_path, keywords=['msd_'])
feature_file_list = get_files(dataset_path, ['features_'])

In [7]:
!pwd

/c/Users/david/Documents/nancework/source/diff_predictor


In [8]:
feature_file_list

['features_NT_slice_1_cortex_vid_10.csv',
 'features_NT_slice_1_cortex_vid_6.csv',
 'features_NT_slice_1_cortex_vid_7.csv',
 'features_NT_slice_1_cortex_vid_8.csv',
 'features_NT_slice_1_cortex_vid_9.csv',
 'features_NT_slice_1_striatum_vid_1.csv',
 'features_NT_slice_1_striatum_vid_2.csv',
 'features_NT_slice_1_striatum_vid_3.csv',
 'features_NT_slice_1_striatum_vid_4.csv',
 'features_NT_slice_1_striatum_vid_5.csv',
 'features_NT_slice_2_cortex_vid_1.csv',
 'features_NT_slice_2_cortex_vid_2.csv',
 'features_NT_slice_2_cortex_vid_3.csv',
 'features_NT_slice_2_cortex_vid_4.csv',
 'features_NT_slice_2_cortex_vid_5.csv',
 'features_NT_slice_2_striatum_vid_1.csv',
 'features_NT_slice_2_striatum_vid_2.csv',
 'features_NT_slice_2_striatum_vid_3.csv',
 'features_NT_slice_2_striatum_vid_4.csv',
 'features_NT_slice_2_striatum_vid_5.csv']

In [8]:
fstats_tot = None
video_num = 0
for filename in feature_file_list:
    try:
        fstats = pd.read_csv(dataset_path + '/' + filename, encoding = "ISO-8859-1", index_col='Unnamed: 0')
        tstats = find_pair(dataset_path + '/' + filename)
        print('{} size: {}'.format(filename, fstats.shape))
        if 'cortex' in filename:
            fstats['region'] = pd.Series(fstats.shape[0]*[0], index=fstats.index)
        elif 'striatum' in filename: 
            fstats['region'] = pd.Series(fstats.shape[0]*[1], index=fstats.index)
        else:
            print('Error, no target')
        fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
        fstats = combine_track(tstats, feature=np.append(feat, ['region']), featureDF=fstats)
        if fstats_tot is None:
            fstats_tot = fstats
        else:
            fstats_tot = fstats_tot.append(fstats, ignore_index=True)
        video_num += 1
    except Exception:
        print('Skipped!: {}'.format(filename))

features_NT_slice_1_cortex_vid_10.csv size: (4832, 23)
features_NT_slice_1_cortex_vid_6.csv size: (7990, 23)
features_NT_slice_1_cortex_vid_7.csv size: (4159, 23)
features_NT_slice_1_cortex_vid_8.csv size: (1984, 23)
features_NT_slice_1_cortex_vid_9.csv size: (6506, 23)
features_NT_slice_1_striatum_vid_1.csv size: (2431, 23)
features_NT_slice_1_striatum_vid_2.csv size: (2240, 23)
features_NT_slice_1_striatum_vid_3.csv size: (1536, 23)
features_NT_slice_1_striatum_vid_4.csv size: (2177, 23)
features_NT_slice_1_striatum_vid_5.csv size: (2169, 23)
features_NT_slice_2_cortex_vid_1.csv size: (1388, 23)
features_NT_slice_2_cortex_vid_2.csv size: (1784, 23)
features_NT_slice_2_cortex_vid_3.csv size: (3520, 23)
features_NT_slice_2_cortex_vid_4.csv size: (1429, 23)
features_NT_slice_2_cortex_vid_5.csv size: (2210, 23)
features_NT_slice_2_striatum_vid_1.csv size: (8314, 23)
features_NT_slice_2_striatum_vid_2.csv size: (10500, 23)
features_NT_slice_2_striatum_vid_3.csv size: (11355, 23)
features_

In [44]:
fstats_tot

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Frame,Gauss,MSDs,Mean_Intensity,Quality,SN_Ratio,Track_ID,X,...,asymmetry3,boundedness,efficiency,elongation,fractal_dim,frames,kurtosis,straightness,trappedness,region
0,0,0.0,0.0,0.000000,0.000000,288.015504,5.162109,0.701482,0.0,74.264553,...,,,,,,,,,,
1,1,1.0,1.0,0.753676,0.371693,288.348837,5.122284,0.643396,0.0,74.564264,...,,,,,,,,,,
2,2,2.0,2.0,0.797005,0.385012,290.767442,5.700928,0.706083,0.0,74.879671,...,,,,,,,,,,
3,3,3.0,3.0,0.863716,0.401320,289.759690,5.593384,0.756702,0.0,73.889012,...,,,,,,,,,,
4,4,4.0,4.0,0.806647,0.423011,287.852713,5.412109,0.607062,0.0,74.546994,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65555044,373669,,646.0,,,491.825028,7.181488,1.080613,13937.0,1648.350759,...,,,,,,,,,,
65555045,373670,,647.0,,,492.201550,7.321686,1.082030,13937.0,1648.158269,...,,,,,,,,,,
65555046,373671,,648.0,,,494.302326,8.170197,1.103524,13937.0,1649.082272,...,,,,,,,,,,
65555047,373672,,649.0,,,495.217054,8.179382,1.150940,13937.0,1647.846105,...,,,,,,,,,,


In [42]:
filename = 'features_NT_slice_2_striatum_vid_5.csv'
fstats = pd.read_csv(dataset_path + '/' + filename, encoding = "ISO-8859-1", index_col='Unnamed: 0')
tstats = find_pair(dataset_path + '/' + filename)
print('{} size: {}'.format(filename, fstats.shape))
if 'cortex' in filename:
    fstats['region'] = pd.Series(fstats.shape[0]*[0], index=fstats.index)
elif 'striatum' in filename: 
    fstats['region'] = pd.Series(fstats.shape[0]*[1], index=fstats.index)
else:
    print('Error, no target')
fstats = combine_track(tstats, feature=np.append(feat, ['region']), featureDF=fstats)
if fstats_tot is None:
    fstats_tot = fstats
else:
    fstats_tot = fstats_tot.append(fstats, ignore_index=True)

features_NT_slice_2_striatum_vid_5.csv size: (13938, 23)


In [45]:
# fstats_tot.to_csv('cortex_striatum_featuresandtracks.csv')

In [9]:
fstats_tot = pd.read_csv('saved_datasets/cortex_striatum_featuresandtracks.csv')

In [9]:
feat = np.array(['AR', 'D_fit', 'Deff1', 'Deff2', 'MSD_ratio', 'alpha',
       'asymmetry1', 'asymmetry2', 'asymmetry3', 'boundedness',
       'efficiency', 'elongation', 'fractal_dim', 'frames', 'kurtosis',
       'straightness', 'trappedness'])

In [12]:
def zero_df(df, col, res=(0, 651)):
    '''
    Zeros a single dataframe column so that the first value will be
    located at the start of the track.
    '''
    try:
        shift_val = df.iloc[res[0]:res[1]][col].reset_index().dropna().index[0]
    except:
        shift_val = res[0]-res[1]-1
    return df.iloc[res[0]:res[1]][col].reset_index().shift(-shift_val, fill_value=np.nan)[col]

def get_zeroed_tracks(df, col, res=650):
    '''
    Creates an array of all the tracks for a single column in a file
    in which the value is zeroed to frame = 0
    '''
    lower = 0
    upper = res+1
    value = []
    while (upper <= len(df)):
        value.append(list(zero_df(df, col=col, res=[lower, upper])))
        lower = upper
        upper = lower + res + 1
    return value

In [13]:
import numpy as np
import pandas as pd

# Creates x and y datasets for LSTM based off of input
# track_df data
def get_xy_data(df, target, feat=None, use_feat=False, res=650):
    n_tracks = int((len(df))/(res+1))
    frame = get_zeroed_tracks(df, 'Frame', res=res)
    X = get_zeroed_tracks(df, 'X', res=res)
    Y = get_zeroed_tracks(df, 'Y', res=res)
    MSDs = get_zeroed_tracks(df, 'MSDs', res=res)
    trgt = df[target]
    datax = []
    datay = []
    datafeat = []
    print(n_tracks)
    for j in range(n_tracks):
        trackx = []
        tracky = []
        trackfeat = []
        for i in range(res+1):
            trackx.append([int(frame[j][i]), X[j][i], Y[j][i], MSDs[j][i]])
        datax.append(trackx)
        del(trackx)
        tracky.append(trgt[(res+1)*(j+1)-1])
        datay.append(tracky)
        del(tracky)
        if use_feat is True:
            trackfeat.append(list(df.loc[(res+1)*(j+1)-1, feat]))
        datafeat.append(trackfeat)
        del(trackfeat)
    del(df, frame, X, Y, MSDs, trgt)
    datax = np.array(datax)
    datax = datax.reshape(n_tracks, res+1, 4)
    datay = np.array(datay)
    datay = datay.reshape(n_tracks, 1)
    datafeat = np.array(datafeat)
    datafeat = datafeat.reshape(n_tracks, len(feat))
    result = [datax, datay]
    if use_feat is True:
        result += [datafeat]
    return tuple(result)

In [60]:
feat

array(['AR', 'D_fit', 'Deff1', 'Deff2', 'MSD_ratio', 'alpha',
       'asymmetry1', 'asymmetry2', 'asymmetry3', 'boundedness',
       'efficiency', 'elongation', 'fractal_dim', 'frames', 'kurtosis',
       'straightness', 'trappedness'], dtype='<U12')

In [14]:
(datax, datay, datafeat) = get_xy_data(fstats_tot, 'region', feat, True)

86761


In [16]:
def get_track(df, track, res):
    return df.loc[(res+1)*(track):(res+1)*(track+1)-1]

def get_feat(df, track, res, feat):
    return df.loc[(res+1)*(track+1)-1, feat]

In [15]:
np.save('./saved_datasets/RNN_region_datax', datax)
np.save('./saved_datasets/RNN_region_datay', datay)
np.save('./saved_datasets/RNN_region_datafeat', datafeat)
# datax = np.load('./saved_datasets/RNN_region_datax.npy')
# datay = np.load('./saved_datasets/RNN_region_datay.npy')
# datafeat = np.load('./saved_datasets/RNN_region_datafeat.npy')

In [23]:
split = 0.8
train_index = np.random.choice(np.arange(0, len(datax)), int(len(datax)*0.7), replace=False)
test_index = np.setdiff1d(np.arange(0, len(datax)), train_index)
datax = np.nan_to_num(datax, copy=True, nan=-1.0, posinf=-1.0, neginf=-1.0)
datay = np.nan_to_num(datay, copy=True, nan=-1.0, posinf=-1.0, neginf=-1.0)
X_train = datax[train_index]
y_train = datay[train_index]
feat_train = datafeat[train_index]
X_test = datax[test_index]
y_test = datay[test_index]
feat_test = datafeat[test_index]

In [24]:
def numpy_one_hot_encode(mat, encoder=None):
    if encoder is None:
        encoder = np.unique(mat)
    mat = np.array(encoder == mat).astype(int)
    return mat, encoder
y_train, encoder = numpy_one_hot_encode(y_train)
y_test, encoder = numpy_one_hot_encode(y_test, encoder)

In [19]:
def numpy_decode(mat, encoder):
    return np.array([i[i!=0] for i in mat * encoder])
y_train = numpy_decode(y_train, encoder)
y_test = numpy_decode(y_test, encoder)

In [25]:
n_timesteps, n_features, n_outputs = X_train.shape[1], X_train.shape[2], y_train.shape[1]
(n_timesteps, n_features, n_outputs)

(651, 4, 2)

In [26]:
n_samples, n_feat_size = feat_train.shape
(n_samples, n_feat_size)

(60732, 17)

In [27]:
#Kera libraries
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout, Concatenate, Flatten, TimeDistributed
from tensorflow.keras.preprocessing import sequence

In [28]:
# LSTM without dropout for sequence classification in the IMDB dataset
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

def rnn_clsfy(X_train, y_train, n_timesteps, n_features, n_outputs, epochs=15, batch_size=64, verbose=0, **kwargs):
    if 'dropout' not in kwargs:
        dropout = 0.5
    else:
        dropout = kwargs['dropout']
    if 'seed' not in kwargs:
        seed = 123
    else:
        seed = kwargs['seed']
    if 'metrics' not in kwargs:
        metrics = ['accuracy']
    else:
        metrics = kwargs['metrics']
    if 'n_rnnnodes' not in kwargs:
        n_rnnnodes = 100
    else:
        n_rnnnodes = kwargs['n_rnnnodes']
    # create the model
    model = Sequential()
    model.add(LSTM(n_rnnnodes, input_shape=(n_timesteps, n_features), return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=metrics)
    print(model.summary())
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    # Final evaluation:
    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=verbose)
    print(f'Accuracy: {score[1]}')
    return model

model = rnn_clsfy(X_train, y_train, n_timesteps, n_features, n_outputs, epochs=50, batch_size=100, verbose=0, dropout=0.4, seed=10, metrics = ['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               42000     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 52,302
Trainable params: 52,302
Non-trainable params: 0
_________________________________________________________________
None
Train on 60732 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epo

In [None]:
model.save('.\saved_models\LSTM_RNN_MODEL_50_50_SPLIT_Striatum_Cortex_TARGET_JUL3020_DATE_50_EPOCHS_40_DROPOUT_SHACK')

In [None]:
model = load_model('.\saved_models\LSTM_RNN_MODEL_70_20_SPLIT_Striatum_Cortex_TARGET_JUL3020_DATE_SHACK')

In [None]:
from utils.constants import MAX_NB_VARIABLES, MAX_TIMESTEPS_LIST
from utils.generic_utils import load_dataset_at, calculate_dataset_metrics, cutoff_choice, \
    cutoff_sequence
from keras import backend as K
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Permute
from keras.models import Model
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter('ignore', category=DeprecationWarning)


def multi_label_log_loss(y_pred, y_true):
    return K.sum(K.binary_crossentropy(y_pred, y_true), axis=-1)


def _average_gradient_norm(model, X_train, y_train, batch_size):
    # just checking if the model was already compiled
    if not hasattr(model, "train_function"):
        raise RuntimeError("You must compile your model before using it.")

    weights = model.trainable_weights  # weight tensors

    get_gradients = model.optimizer.get_gradients(
        model.total_loss, weights)  # gradient tensors

    input_tensors = [
        # input data
        model.inputs[0],
        # how much to weight each sample by
        model.sample_weights[0],
        # labels
        model.targets[0],
        # train or test mode
        K.learning_phase()
    ]

    grad_fct = K.function(inputs=input_tensors, outputs=get_gradients)

    steps = 0
    total_norm = 0
    s_w = None

    nb_steps = X_train.shape[0] // batch_size

    if X_train.shape[0] % batch_size == 0:
        pad_last = False
    else:
        pad_last = True

    def generator(X_train, y_train, pad_last):
        for i in range(nb_steps):
            X = X_train[i * batch_size: (i + 1) * batch_size, ...]
            y = y_train[i * batch_size: (i + 1) * batch_size, ...]

            yield (X, y)

        if pad_last:
            X = X_train[nb_steps * batch_size:, ...]
            y = y_train[nb_steps * batch_size:, ...]

            yield (X, y)

    datagen = generator(X_train, y_train, pad_last)

    while steps < nb_steps:
        X, y = next(datagen)
        # set sample weights to one
        # for every input
        if s_w is None:
            s_w = np.ones(X.shape[0])

        gradients = grad_fct([X, s_w, y, 0])
        total_norm += np.sqrt(np.sum([np.sum(np.square(g))
                                      for g in gradients]))
        steps += 1

    if pad_last:
        X, y = next(datagen)
        # set sample weights to one
        # for every input
        if s_w is None:
            s_w = np.ones(X.shape[0])

        gradients = grad_fct([X, s_w, y, 0])
        total_norm += np.sqrt(np.sum([np.sum(np.square(g))
                                      for g in gradients]))
        steps += 1

    return total_norm / float(steps)


def rnn_train_model(model: Model, 
                    train_dataset, 
                    eval_dataset,
                    folds=5, 
                    epochs=50, 
                    batch_size=128, 
                    val_subset=None,
                    cutoff=None,  
                    learning_rate=1e-3, 
                    monitor='loss', 
                    optimization_mode='auto', 
                    compile_model=True):
    
    X_train, y_train, X_test, y_test, is_timeseries = load_dataset_at(dataset_id,
                                                                      fold_index=dataset_fold_id,
                                                                      normalize_timeseries=normalize_timeseries)
    max_timesteps, max_nb_variables = calculate_dataset_metrics(X_train)

    if max_nb_variables != MAX_NB_VARIABLES[dataset_id]:
        if cutoff is None:
            choice = cutoff_choice(dataset_id, max_nb_variables)
        else:
            assert cutoff in [
                'pre', 'post'], 'Cutoff parameter value must be either "pre" or "post"'
            choice = cutoff

        if choice not in ['pre', 'post']:
            return
        else:
            X_train, X_test = cutoff_sequence(
                X_train, X_test, choice, dataset_id, max_nb_variables)
            
    classes = np.unique(y_train)
    le = LabelEncoder()
    y_ind = le.fit_transform(y_train.ravel())
    recip_freq = len(y_train) / (len(le.classes_) *
                                 np.bincount(y_ind).astype(np.float64))
    class_weight = recip_freq[le.transform(classes)]

    print("Class weights : ", class_weight)

    y_train = to_categorical(y_train, len(np.unique(y_train)))
    y_test = to_categorical(y_test, len(np.unique(y_test)))

    if is_timeseries:
        factor = 1./np.cbrt(2)
    else:
        factor = 1./np.sqrt(2)

    if dataset_fold_id is None:
        weight_fn = "./weights/%s_weights.h5" % dataset_prefix
    else:
        weight_fn = "./weights/%s_fold_%d_weights.h5" % (
            dataset_prefix, dataset_fold_id)

    model_checkpoint = ModelCheckpoint(weight_fn, verbose=1, mode=optimization_mode,
                                       monitor=monitor, save_best_only=True, save_weights_only=True)
    reduce_lr = ReduceLROnPlateau(monitor=monitor, patience=100, mode=optimization_mode,
                                  factor=factor, cooldown=0, min_lr=1e-4, verbose=2)
    callback_list = [model_checkpoint, reduce_lr]

    optm = Adam(lr=learning_rate)

    if compile_model:
        model.compile(optimizer=optm,
                      loss='categorical_crossentropy', metrics=['accuracy'])

    if val_subset is not None:
        X_test = X_test[:val_subset]
        y_test = y_test[:val_subset]

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callback_list,
              class_weight=class_weight, verbose=2, validation_data=(X_test, y_test))


def evaluate_model(model: Model, dataset_id, dataset_prefix, dataset_fold_id=None, batch_size=128, test_data_subset=None,
                   cutoff=None, normalize_timeseries=False):
    _, _, X_test, y_test, is_timeseries = load_dataset_at(dataset_id,
                                                          fold_index=dataset_fold_id,
                                                          normalize_timeseries=normalize_timeseries)
    max_timesteps, max_nb_variables = calculate_dataset_metrics(X_test)

    if max_nb_variables != MAX_NB_VARIABLES[dataset_id]:
        if cutoff is None:
            choice = cutoff_choice(dataset_id, max_nb_variables)
        else:
            assert cutoff in [
                'pre', 'post'], 'Cutoff parameter value must be either "pre" or "post"'
            choice = cutoff

        if choice not in ['pre', 'post']:
            return
        else:
            _, X_test = cutoff_sequence(
                None, X_test, choice, dataset_id, max_nb_variables)

    if not is_timeseries:
        X_test = pad_sequences(
            X_test, maxlen=MAX_NB_VARIABLES[dataset_id], padding='post', truncating='post')
    y_test = to_categorical(y_test, len(np.unique(y_test)))

    optm = Adam(lr=1e-3)
    model.compile(optimizer=optm, loss='categorical_crossentropy',
                  metrics=['accuracy'])

    if dataset_fold_id is None:
        weight_fn = "./weights/%s_weights.h5" % dataset_prefix
    else:
        weight_fn = "./weights/%s_fold_%d_weights.h5" % (
            dataset_prefix, dataset_fold_id)
    model.load_weights(weight_fn)

    if test_data_subset is not None:
        X_test = X_test[:test_data_subset]
        y_test = y_test[:test_data_subset]

    print("\nEvaluating : ")
    loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
    print()
    print("Final Accuracy : ", accuracy)

    return accuracy, loss


def set_trainable(layer, value):
    layer.trainable = value

    # case: container
    if hasattr(layer, 'layers'):
        for l in layer.layers:
            set_trainable(l, value)

    # case: wrapper (which is a case not covered by the PR)
    if hasattr(layer, 'layer'):
        set_trainable(layer.layer, value)

In [1]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

Using TensorFlow backend.


In [2]:
# load the dataset but only keep the top n words, zero the rest
top_words = 5000

In [3]:
import numpy as np
# save np.load
#np_load_old = np.load

# modify the default parameters of np.load
#np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

# call load_data with allow_pickle implicitly set to true
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# restore np.load for future normal usage
#np.load = np_load_old

In [4]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [21]:
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# create the model
embedding_vecor_length = 32
model = tf.keras.Sequential()
# model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x16ebc9168c8>

In [22]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.43%


In [34]:
# LSTM with Dropout for sequence classification in the IMDB dataset
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = tf.keras.Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 87.10%


In [36]:
# LSTM without dropout for sequence classification in the IMDB dataset
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = tf.keras.Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 85.66%


In [5]:
# LSTM and CNN for sequence classification in the IMDB dataset
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers.convolutional import Conv1D
from tensorflow.keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

ModuleNotFoundError: No module named 'tensorflow.keras.layers.convolutional'

In [None]:
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [17]:
X_test

array([[   0,    0,    0, ...,   14,    6,  717],
       [   0,    0,    0, ...,  125,    4, 3077],
       [  33,    6,   58, ...,    9,   57,  975],
       ...,
       [   0,    0,    0, ...,   21,  846,    2],
       [   0,    0,    0, ..., 2302,    7,  470],
       [   0,    0,    0, ...,   34, 2005, 2643]], dtype=int32)

In [7]:
# LSTM with Dropout for sequence classification in msd dataset
import numpy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset b

(X_train, y_train)
(X_test, y_test)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

NameError: name 'X_train' is not defined

---

---

In [12]:
import collections
from tensorflow.python.ops.math_ops import tanh


class RNNCell(object):
    def __call__(self, inputs, state, scope=None):
        raise NotImplementedError("Abstract method")
    

class LSTMCell(RNNCell):
    """Basic LSTM recurrent network cell.
    The implementation is based on: http://arxiv.org/abs/1409.2329.
    We add forget_bias (default: 1) to the biases of the forget gate in order to
    reduce the scale of forgetting in the beginning of the training.
    It does not allow cell clipping, a projection layer, and does not
    use peep-hole connections: it is the basic baseline.
    For advanced models, please use the full LSTMCell that follows.
    """  
    def __init__(self, n_units, n_proj=None, forget_bias=0.0, input_size=None, activation=tanh):
        self._n_units  = n_units
        self._n_proj = n_proj
        self._forget_bias = forget_bias
        self._input_size = input_size
        self._activation = activation

        (self._state_size, 
         self._output_size) = ((LSTMStateTuple(n_units, n_proj) , n_units + n_proj)
                            if n_proj else (LSTMStateTuple(n_units, n_units), 2*n_units))

        
    @property
    def state_size(self):
        return self._state_size
    
    
    @property
    def output_size(self):
        return self. _output_size
    
    
    def __call__(self, inputs, state, scope=None):
        
        pass

# class LSTM(LSTM):
    
    
#     def __init__(self, ):
#         pass
    
_LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h"))

class LSTMStateTuple(_LSTMStateTuple):
  
    """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state.
    Stores two elements: `(c, h)`, in that order.
    Only used when `state_is_tuple=True`.
    """
    __slots__ = ()

    @property
    def dtype(self):
        (c, h) = self
        if not c.dtype == h.dtype:
            raise TypeError("Inconsistent internal state: %s vs %s" %
                            (str(c.dtype), str(h.dtype)))
    return c.dtype


In [18]:
x = LSTMCell(50, 20, 1.0, 128)


In [19]:
x.state_size

LSTMStateTuple(c=50, h=20)

In [20]:
x = LSTMCell()

TypeError: __init__() missing 1 required positional argument: 'n_units'