Commit 9216522 (1 parent: 4e9fe8b)
Showing 2 changed files with 284 additions and 16 deletions.
@@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import numpy as np\n",
"import tensorflow_probability as tfp\n",
"from tensorflow.keras import layers, models, callbacks\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"from tensorflow.keras import backend as K\n",
"import pickle\n",
"import os\n",
"\n",
"def _pairwise_distances(feature_A, feature_B=None, squared=False):\n", | ||
" \"\"\"\n", | ||
" Directly from https://www.tensorflow.org/api_docs/python/tf/contrib/losses/metric_learning/triplet_semihard_loss\n", | ||
" Computes the pairwise distance matrix with numerical stability.\n", | ||
" output[i, j] = || feature[i, :] - feature[j, :] ||_2\n", | ||
" Args:\n", | ||
" feature_A: 2-D Tensor of size [number of data A, feature dimension].\n", | ||
" feature_B: 2-D Tensor of size [number of data B, feature dimension].\n", | ||
" squared: Boolean, whether or not to square the pairwise distances.\n", | ||
" Returns:\n", | ||
" pairwise_distances: 2-D Tensor of size [number of data A, number of data B].\n", | ||
" \"\"\"\n", | ||
" if feature_B is None:\n", | ||
" feature_B = feature_A\n", | ||
"\n", | ||
" pairwise_distances_squared = tf.add(\n", | ||
" tf.reduce_sum(tf.square(feature_A), axis=[1], keepdims=True),\n", | ||
" tf.reduce_sum(tf.square(tf.transpose(feature_B)), axis=[0], keepdims=True),\n", | ||
" ) - 2.0 * tf.linalg.matmul(feature_A, tf.transpose(feature_B))\n", | ||
"\n", | ||
" # Deal with numerical inaccuracies. Set small negatives to zero.\n", | ||
" pairwise_distances_squared = tf.maximum(pairwise_distances_squared, 0.0)\n", | ||
" # Get the mask where the zero distances are at.\n", | ||
" error_mask = tf.less_equal(pairwise_distances_squared, 0.0)\n", | ||
"\n", | ||
" # Optionally take the sqrt.\n", | ||
" if squared:\n", | ||
" pairwise_distances = pairwise_distances_squared\n", | ||
" else:\n", | ||
" pairwise_distances = tf.sqrt(\n", | ||
" pairwise_distances_squared + tf.cast(error_mask, tf.float32) * 1e-16\n", | ||
" )\n", | ||
"\n", | ||
" # Undo conditionally adding 1e-16.\n", | ||
" pairwise_distances = tf.multiply(\n", | ||
" pairwise_distances, tf.cast(tf.logical_not(error_mask), tf.float32)\n", | ||
" )\n", | ||
"\n", | ||
" if feature_B is None:\n", | ||
" num_data = tf.shape(feature_A)[0]\n", | ||
" # Explicitly set diagonals to zero.\n", | ||
" mask_offdiagonals = tf.ones_like(pairwise_distances) - tf.linalg.diag(\n", | ||
" tf.ones([num_data])\n", | ||
" )\n", | ||
" pairwise_distances = tf.multiply(pairwise_distances, mask_offdiagonals)\n", | ||
"\n", | ||
" return pairwise_distances\n", | ||
"def get_consistency_distinction_loss(labels,embeddings):\n", | ||
" epsilon = 1e-7\n", | ||
" lshape = tf.shape(labels)\n", | ||
" labels = tf.reshape(labels, [lshape[0], 1])\n", | ||
" clusters_labels, _, num_embeddings_per_cluster = tf.unique_with_counts(\n", | ||
" tf.reshape(labels, [lshape[0]])\n", | ||
" ) \n", | ||
" num_clusters = tf.size(clusters_labels)\n", | ||
" adjacency = tf.equal(\n", | ||
" labels, tf.transpose(clusters_labels)\n", | ||
" ) \n", | ||
" centroids = tf.linalg.matmul(\n", | ||
" tf.cast(adjacency, dtype=tf.float32), embeddings, transpose_a=True\n", | ||
" )\n", | ||
" centroids = tf.divide(\n", | ||
" centroids,\n", | ||
" tf.expand_dims(tf.cast(num_embeddings_per_cluster, dtype=tf.float32), axis=1),\n", | ||
" )\n", | ||
" pairwise_distances_distinction_first = _pairwise_distances(\n", | ||
" feature_A=embeddings, feature_B=centroids, squared=True\n", | ||
" )\n", | ||
" pairwise_distances_distinction_first = pairwise_distances_distinction_first/tf.reshape(tf.reduce_max(pairwise_distances_distinction_first,axis=1),[lshape[0],1])\n", | ||
" adjacency_not = tf.logical_not(adjacency)\n", | ||
" pairwise_distances_distinction = tf.where(tf.cast(adjacency,tf.float32)==1.0,tf.reduce_max(pairwise_distances_distinction_first),pairwise_distances_distinction_first)\n", | ||
" minimum_distance_to_other_cluster = tf.reduce_min(pairwise_distances_distinction,axis=1)\n", | ||
" distinction_loss = tf.reduce_mean(minimum_distance_to_other_cluster)\n", | ||
" mean_intra_class_distance = tf.reduce_mean(tf.boolean_mask(pairwise_distances_distinction_first,adjacency))\n", | ||
" mean_inter_class_distance = tf.reduce_mean(tf.boolean_mask(pairwise_distances_distinction_first,tf.logical_not(adjacency)))\n", | ||
" alpha = mean_intra_class_distance/ (mean_inter_class_distance+epsilon)\n", | ||
" mask_for_equal = tf.math.equal(labels,tf.transpose(labels))\n", | ||
" pairwise_distances = _pairwise_distances(embeddings,squared=True)\n", | ||
" pairwise_distances = pairwise_distances/tf.reshape(tf.reduce_max(pairwise_distances,axis=1),[lshape[0],1]) \n", | ||
" pairwise_distance_for_consistency = tf.multiply(pairwise_distances, tf.cast(mask_for_equal,tf.float32))\n", | ||
" counts_same_class = tf.reduce_sum(tf.cast(mask_for_equal,tf.float32),axis=1)\n", | ||
" total_distance_same_class = tf.reduce_sum(pairwise_distance_for_consistency,axis=1)\n", | ||
" mean_distance_same_class = total_distance_same_class/(counts_same_class-1+epsilon)\n", | ||
" loss = tf.constant(0,dtype=tf.float32)\n", | ||
" for label in clusters_labels:\n", | ||
" percentile_95 = tfp.stats.percentile(tf.where(labels==label,mean_distance_same_class,tf.reduce_max(mean_distance_same_class)),95)\n", | ||
" loss+=percentile_95\n", | ||
" consistency_loss = loss/tf.cast(tf.size(clusters_labels),tf.float32)\n", | ||
" # consistency_loss = tf.reduce_mean(mean_distance_same_class)\n", | ||
" return (1+alpha)*consistency_loss - distinction_loss\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"def get_trained_model(X_train,y_train,n_timesteps,n_channels,window_size,filepath):\n", | ||
" n_classes = len(np.unique(y_train))\n", | ||
" model = get_model(input_shape=(n_timesteps,n_channels),n_classes=n_classes)\n", | ||
" checkpoint = callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min',save_weights_only=False)\n", | ||
" es = callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0,patience=40)\n", | ||
" callbacks_list = [es,checkpoint]\n", | ||
" train_x,val_x,train_y,val_y = train_test_split(X_train,y_train,test_size=.2,stratify=y_train)\n", | ||
" history = model.fit(train_x,[train_y,train_y],validation_data=(val_x,[val_y,val_y]), epochs=200, batch_size=500,verbose=1,callbacks=callbacks_list,shuffle=True)\n", | ||
" model.load_weights(filepath)\n", | ||
" val_y_pred = model.predict(val_x)\n", | ||
" if len(val_y_pred)<val_x.shape[0]:\n", | ||
" val_y_pred = val_y_pred[0]\n", | ||
" print('validation accuracy',accuracy_score(val_y,val_y_pred.argmax(axis=1)),end=',')\n", | ||
" return model\n", | ||
"import tensorflow_addons as tfa\n", | ||
"def get_model(input_shape=(400,3),n_classes=1):\n", | ||
" input_ = layers.Input(shape=input_shape)\n", | ||
" x = layers.Conv1D(128,2,activation='relu',kernel_initializer='normal',padding='same')(input_)\n", | ||
" x = layers.MaxPooling1D(2)(x)\n", | ||
" x = layers.BatchNormalization()(x)\n", | ||
" x = layers.Conv1D(128,2,activation='relu',kernel_initializer='normal',padding='same')(x)\n", | ||
" x = layers.MaxPooling1D(2)(x)\n", | ||
" x = layers.BatchNormalization()(x)\n", | ||
" # x = layers.Activation('tanh')(x)\n", | ||
" x = layers.Dropout(.4)(x)\n", | ||
" x = layers.GRU(128,return_sequences=False,activation='tanh')(x)\n", | ||
" x = layers.Flatten()(x)\n", | ||
" x = layers.Dense(350,activation='relu')(x)\n", | ||
" x = layers.Dense(n_classes,activation='relu')(x)\n", | ||
" y2 = layers.Lambda(lambda a:K.l2_normalize(a,axis=1),name='feature')(x)\n", | ||
" y1 = layers.Dense(n_classes,activation='softmax',name='final')(y2)\n", | ||
" model = models.Model(input_,[y1,y2])\n", | ||
" model.compile(loss={'final':tf.keras.losses.SparseCategoricalCrossentropy(),\n", | ||
" 'feature':get_consistency_distinction_loss},\n", | ||
" loss_weights = {'final':5,'feature':1},\n", | ||
" optimizer='adam',\n", | ||
" metrics={'final':['acc']})\n", | ||
" return model\n", | ||
"def get_X_y_dict(training_data,user_dict = None):\n", | ||
" if user_dict is None:\n", | ||
" user_dict = {a:i for i,a in enumerate(training_data['user'].unique())}\n", | ||
" training_data['label'] = training_data['user'].apply(lambda a:user_dict[a])\n", | ||
" X = np.concatenate(list(training_data['final_data']))\n", | ||
" y = np.array(training_data['label'].values)\n", | ||
" return X,y,user_dict\n", | ||
"activity_label = 'Walking'\n", | ||
"window_size = 20\n", | ||
"base_directory = './data/mORAL_dataset_for_python_upload_09072020/'\n", | ||
"training_data = pickle.load(open(os.path.join(base_directory,'processed_data',activity_label,'train.p'),'rb')).sort_values('timestamp').reset_index(drop=True)\n", | ||
"testing_data = pickle.load(open(os.path.join(base_directory,'processed_data',activity_label,'test.p'),'rb')).sort_values('timestamp').reset_index(drop=True)\n", | ||
"if not os.path.isdir(os.path.join(base_directory,'results',activity_label)):\n", | ||
" os.makedirs(os.path.join(base_directory,'results',activity_label))\n", | ||
"result_directory = os.path.join(base_directory,'results',activity_label)\n", | ||
"model_directory = os.path.join(result_directory,'activity_{}_window_size_{}.h5'.format(activity_label,window_size))\n", | ||
"X_train,y_train,user_dict = get_X_y_dict(training_data)\n", | ||
"X_test,y_test,user_dict = get_X_y_dict(testing_data,user_dict)" | ||
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trained_model = get_trained_model(X_train, y_train, n_timesteps=X_train.shape[1],\n",
"                                  n_channels=X_train.shape[-1], window_size=window_size,\n",
"                                  filepath=model_filepath)"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"y_pred_test = trained_model.predict(X_test)\n",
"# predict() returns [softmax, embedding]; store the 'feature' head as the\n",
"# embedding and the argmax of the 'final' head as the predicted user.\n",
"testing_data['embedding'] = list(y_pred_test[1])\n",
"testing_data['prediction'] = list(y_pred_test[0].argmax(axis=1))"
]
},
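{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick summary metric (a minimal sketch using only variables defined above):\n",
"# window-level identification accuracy on the test windows. accuracy_score\n",
"# was imported in the first cell.\n",
"print('test accuracy', accuracy_score(y_test, testing_data['prediction'].values))"
]
},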
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
"with open(os.path.join(result_directory, 'activity_{}_window_size_{}.p'.format(activity_label, window_size)), 'wb') as f:\n",
"    pickle.dump(testing_data, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "test1",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "51b244ab9aca612e739a0539ae1af887c58db9e180d786deb0ab1761def69c1f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
} |