In [8]:
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import tensorflow as tf
import tempfile
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from scipy import stats

In [2]:
schema_flow = (StructType().add('Index', IntegerType(), True)
        .add('Arrival_Time', LongType(), True)
        .add('Creation_Time', LongType(), True)
        .add('x', DoubleType(), True)
        .add('y', DoubleType(), True)
        .add('z', DoubleType(), True)
        .add('User', StringType(), True)
        .add('Model', StringType(), True)
        .add('Device', StringType(), True)
        .add('gt', StringType(), True))

In [3]:
sc = spark.sparkContext
sql = SQLContext(sc)

In [4]:
df_raw = (sql.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .schema(schema_flow)
    .load("Phones_accelerometer.csv")
    .rdd)

AttributeError: 'RDD' object has no attribute 'show'

In [5]:
header = df_raw.first()
test = df_raw.filter(lambda line: line != header).toDF()


In [6]:
my_udf = udf(lambda x: x / 1000000000)
new_df = test.withColumn('Creation_Time_sec', my_udf('Creation_Time'))

In [7]:
pandas_df = pd.DataFrame(new_df
    .drop('Index', 'Creation_Time', 'User', 'Model', 'Device', 'Arrival_Time', 'Creation_Time_sec')
    .collect(), columns=['x', 'y', 'z', 'gt'])

In [9]:
print pandas_df.shape

(13062474, 4)


In [10]:
pandas_df.head(5)

Unnamed: 0,x,y,z,gt
0,-5.95224,0.670212,8.136536,stand
1,-5.995087,0.653549,8.204376,stand
2,-5.942718,0.676163,8.128204,stand
3,-5.991516,0.641647,8.135345,stand
4,-5.965332,0.629745,8.128204,stand


In [11]:
N_TIME_STEPS = 200
N_FEATURES = 3
step = 20
segments = []
labels = []

for i in range(0, len(pandas_df) - N_TIME_STEPS, step):
    xs = pandas_df['x'].values[i: i + N_TIME_STEPS]
    ys = pandas_df['y'].values[i: i + N_TIME_STEPS]
    zs = pandas_df['z'].values[i: i + N_TIME_STEPS]
    label = stats.mode(pandas_df['gt'][i: i + N_TIME_STEPS])[0][0]
    segments.append([xs, ys, zs])
    labels.append(label)



In [13]:
reshaped_segments = np.asarray(segments, dtype= np.float32).reshape(-1, N_TIME_STEPS, N_FEATURES)
labels_np = np.asarray(labels)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(reshaped_segments, labels_np, test_size=0.2)

In [15]:
print X_train.shape
print X_test.shape

(522491, 200, 3)
(130623, 200, 3)


In [16]:
x_train_reshaped = np.transpose(X_train, (1,0,2)).reshape(-1,3)
x_test_reshaped = np.transpose(X_test, (1,0,2)).reshape(-1,3)

In [17]:
x_train_reshaped.shape
x_test_reshaped.shape

(26124600, 3)

In [19]:
print y_train.shape
print y_test.shape

(522491,)
(130623,)


In [20]:
np.savetxt("x_train.csv", x_train_reshaped, delimiter=",")
np.savetxt("x_test.csv", x_test_reshaped, delimiter=",")

In [21]:
np.savetxt("y_train.csv", y_train, delimiter=",", fmt="%s")
np.savetxt("y_test.csv", y_test, delimiter=",", fmt="%s")