In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark.sql.functions as func
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
# Reuse the already-running SparkContext if there is one, otherwise start one.
sc = SparkContext.getOrCreate()
# Same for the SparkSession, which the later cells use to read the CSV and
# to build DataFrames.
ss = SparkSession.builder.getOrCreate()

In [2]:
def toDoubleSafe(value):
    """Convert ``value`` to a float, returning ``None`` when it cannot be converted.

    Args:
        value: Anything ``float()`` might accept (typically a CSV field string).

    Returns:
        The parsed float, or ``None`` if ``value`` is not convertible. This
        covers both unparsable strings (``ValueError``) and values of an
        unsupported type such as ``None`` or a list (``TypeError``).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # TypeError is raised for None / non-numeric objects; without catching
        # it a single missing field would abort the whole conversion.
        return None

In [3]:
def toIntSafe(value):
    """Convert ``value`` to an int, returning ``None`` when it cannot be converted.

    Args:
        value: Anything ``int()`` might accept (typically a CSV field string).

    Returns:
        The parsed int, or ``None`` if ``value`` is not convertible. This
        covers unparsable strings and NaN (``ValueError``), unsupported types
        such as ``None`` (``TypeError``) and infinite floats
        (``OverflowError``).
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # int(None) raises TypeError and int(float("inf")) raises
        # OverflowError; both mean "no usable integer" for the caller.
        return None

In [5]:
# Raw RDD view of the CSV: every text line is split on commas and turned into
# a 7-tuple (id, five numeric fields parsed with toDoubleSafe, date string).
# No header filtering happens here, so the header line is kept as a row whose
# numeric fields become None.
argo = (
    sc.textFile("argo_data.csv")
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[0],)
                        + tuple(toDoubleSafe(raw) for raw in fields[1:6])
                        + (fields[6],))
)

# Explicit schema for the DataFrame reader; every column is nullable.
argo_schema = (
    StructType()
    .add("profile_id", StringType(), True)
    .add("pres", DoubleType(), True)
    .add("temp", DoubleType(), True)
    .add("lat", DoubleType(), True)
    .add("lon", DoubleType(), True)
    .add("psal", DoubleType(), True)
    .add("date", TimestampType(), True)
)

# Typed DataFrame over the same file; the first line is consumed as a header.
argo_df = ss.read.csv("argo_data.csv", schema=argo_schema, header=True)

In [6]:
# Sanity check: print the first three rows of the loaded DataFrame.
argo_df.show(3)

+----------+----+-----+------+------+------+-------------------+
|profile_id|pres| temp|   lat|   lon|  psal|               date|
+----------+----+-----+------+------+------+-------------------+
|5905377_14|2.91|1.048|-56.73|148.24|33.739|2018-06-17 16:59:45|
|5905377_14|4.21|1.049|-56.73|148.24|33.738|2018-06-17 16:59:45|
|5905377_14|6.21|1.049|-56.73|148.24|33.738|2018-06-17 16:59:45|
+----------+----+-----+------+------+------+-------------------+
only showing top 3 rows



In [7]:
# NOTE(review): wildcard import retained only because other cells may rely on
# the names it injects into the notebook namespace. pyspark.sql.functions
# defines names such as `sum`, `min` and `max`, which shadow the Python
# builtins after this line. The code in this cell goes through the explicit
# `func` alias instead.
from pyspark.sql.functions import *


def _temps_ordered_by_pressure(pairs):
    """Return the temperatures of ``[temp, pres]`` pairs ordered by pressure.

    Args:
        pairs: A list of two-element ``[temp, pres]`` sequences, or ``None``.

    Returns:
        A list with the first element of every pair, ordered by ascending
        second element. Pairs whose pressure is ``None`` are placed last (in
        their original relative order) rather than raising ``TypeError``, which
        is what comparing ``None`` with a float does under Python 3.
        ``None`` input yields ``None``; an empty list yields an empty list.
    """
    if pairs is None:
        return None
    ordered = sorted(
        pairs,
        # (is-missing, value) keeps real pressures in ascending order and
        # pushes missing ones to the end; 0.0 is only a placeholder that is
        # never compared against a real pressure.
        key=lambda pair: (pair[1] is None, 0.0 if pair[1] is None else pair[1]),
    )
    return [pair[0] for pair in ordered]


# UDF wrapper: array<array<double>> of [temp, pres] -> array<double> of temps.
insane_sort = func.udf(_temps_ordered_by_pressure, ArrayType(DoubleType()))

# One row per profile: pair temp with pres, gather the pairs of each profile,
# then reduce them to the pressure-ordered list of temperatures.
argo_df_listed = (
    argo_df
    .select('profile_id',
            func.array(argo_df['temp'], argo_df['pres']).alias('temp_pres'))
    .groupBy('profile_id')
    .agg(func.collect_list('temp_pres').alias('temp_pres_list'))
    .select('profile_id',
            insane_sort(func.col('temp_pres_list')).alias('temp_pres_list'))
)
argo_df_listed.show()

+-----------+--------------------+
| profile_id|      temp_pres_list|
+-----------+--------------------+
| 5905293_29|[28.567, 28.566, ...|
|4901659_142|[24.296, 24.296, ...|
|5904161_236|[-1.671, -1.657, ...|
|5902385_124|[8.42, 8.42, 8.42...|
| 2902676_53|[30.067, 29.848, ...|
|5903601_240|[14.992, 12.59, 1...|
|3901205_123|[24.722, 24.729, ...|
| 5902476_72|[19.721, 19.72, 1...|
|5903808_221|[29.881, 29.9, 29...|
|5904597_169|[-0.506, -0.49, -...|
|2902656_254|[29.885, 29.894, ...|
| 2902717_10|[29.531, 29.54, 2...|
|4901409_231|[20.468, 20.353, ...|
| 5904767_79|[-0.222, -0.22, -...|
| 5905353_25|[10.136, 10.136, ...|
|4901565_183|[8.667, 8.667, 8....|
|5901691_348|[9.369, 9.371, 9....|
|5903840_227|[29.45, 29.448, 2...|
|5904715_109|[28.984, 28.998, ...|
| 6903232_13|[5.107, 5.108, 5....|
+-----------+--------------------+
only showing top 20 rows



In [25]:
# Only the first row is needed, so fetch just that one with take(1) instead of
# collect()-ing every profile to the driver and discarding all but one.
# Indexing [0] still raises IndexError if the DataFrame is empty.
row1 = argo_df_listed.select('temp_pres_list').take(1)[0]
row1

Row(temp_pres_list=[28.567, 28.566, 28.566, 28.567, 28.568, 28.566, 28.565, 28.565, 28.566, 28.567, 28.561, 28.543, 28.496, 28.431, 28.417, 28.413, 28.403, 28.396, 28.396, 28.395, 28.382, 28.355, 28.349, 28.344, 28.331, 28.316, 28.31, 28.288, 28.24, 28.189, 28.154, 28.152, 28.145, 28.118, 28.06, 28.038, 28.038, 28.032, 27.991, 27.943, 27.874, 27.667, 27.097, 26.964, 26.933, 26.914, 26.902, 26.895, 26.886, 26.874, 26.847, 26.817, 26.729, 26.565, 26.333, 25.999, 25.456, 25.029, 24.666, 24.514, 24.27, 23.542, 22.41, 22.128, 21.968, 21.907, 21.557, 21.049, 20.637, 20.002, 19.551, 19.007, 18.695, 17.543, 16.814, 16.152, 15.345, 14.576, 14.383, 14.331, 14.295, 14.237, 13.927, 13.877, 13.85, 13.539, 13.255, 13.189, 13.099, 12.889, 12.753, 12.787, 12.762, 12.729, 12.586, 12.356, 12.26, 12.2, 11.959, 11.894, 11.861, 11.851, 11.84, 11.785, 11.762, 11.757, 11.724, 11.623, 11.547, 11.488, 11.472, 11.463, 11.443, 11.405, 11.339, 11.294, 11.266, 11.246, 11.237, 11.243, 11.242, 11.225, 11.209, 11.203

In [27]:
# Wrap the single collected Row in a list and distribute it as a one-element RDD.
contents = sc.parallelize([row1])
# Bare expression so the notebook displays the RDD's repr.
contents

ParallelCollectionRDD[104] at parallelize at PythonRDD.scala:195

In [47]:
# Identity flatMap: each element of `contents` is iterated and its items become
# the elements of the new RDD (for a Row, that means its field values).
contents_rdd = contents.flatMap(lambda x: x)
# Bare expression so the notebook displays the RDD's repr.
contents_rdd

PythonRDD[163] at RDD at PythonRDD.scala:53

In [31]:
# Bring at most five elements of the RDD back to the driver for inspection.
contents_rdd.take(5)

[[28.567,
  28.566,
  28.566,
  28.567,
  28.568,
  28.566,
  28.565,
  28.565,
  28.566,
  28.567,
  28.561,
  28.543,
  28.496,
  28.431,
  28.417,
  28.413,
  28.403,
  28.396,
  28.396,
  28.395,
  28.382,
  28.355,
  28.349,
  28.344,
  28.331,
  28.316,
  28.31,
  28.288,
  28.24,
  28.189,
  28.154,
  28.152,
  28.145,
  28.118,
  28.06,
  28.038,
  28.038,
  28.032,
  27.991,
  27.943,
  27.874,
  27.667,
  27.097,
  26.964,
  26.933,
  26.914,
  26.902,
  26.895,
  26.886,
  26.874,
  26.847,
  26.817,
  26.729,
  26.565,
  26.333,
  25.999,
  25.456,
  25.029,
  24.666,
  24.514,
  24.27,
  23.542,
  22.41,
  22.128,
  21.968,
  21.907,
  21.557,
  21.049,
  20.637,
  20.002,
  19.551,
  19.007,
  18.695,
  17.543,
  16.814,
  16.152,
  15.345,
  14.576,
  14.383,
  14.331,
  14.295,
  14.237,
  13.927,
  13.877,
  13.85,
  13.539,
  13.255,
  13.189,
  13.099,
  12.889,
  12.753,
  12.787,
  12.762,
  12.729,
  12.586,
  12.356,
  12.26,
  12.2,
  11.959,
  11.894,
  11.861,

In [57]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Window

# NOTE(review): `schema` is not referenced anywhere else in this cell; it is
# kept only in case another cell uses it.
schema = StructType([
    StructField("num",ArrayType(DoubleType()),True)])

# Schema is inferred from the RDD elements; the assembler below takes every
# resulting column as an input.
contents_df = ss.createDataFrame(contents_rdd)
assemb = VectorAssembler(outputCol="features", inputCols=contents_df.columns)

# Pack all input columns into a single vector column named "features".
vec = assemb.transform(contents_df)

# rank() is reached through the `func` alias from the notebook's import cell,
# so this cell does not depend on a wildcard import executed in another cell.
# partitionBy() with no columns places all rows in a single window.
vec_indexed = vec.select(
    'features',
    func.rank().over(Window.partitionBy().orderBy('features')).alias('index'),
)
vec_indexed.show()


+--------------------+-----+
|            features|index|
+--------------------+-----+
|[28.567,28.566,28...|    1|
+--------------------+-----+



In [49]:
# Print the leading rows of contents_df (show() prints at most 20 by default).
contents_df.show()

DataFrame[_1: double, _2: double, _3: double, _4: double, _5: double, _6: double, _7: double, _8: double, _9: double, _10: double, _11: double, _12: double, _13: double, _14: double, _15: double, _16: double, _17: double, _18: double, _19: double, _20: double, _21: double, _22: double, _23: double, _24: double, _25: double, _26: double, _27: double, _28: double, _29: double, _30: double, _31: double, _32: double, _33: double, _34: double, _35: double, _36: double, _37: double, _38: double, _39: double, _40: double, _41: double, _42: double, _43: double, _44: double, _45: double, _46: double, _47: double, _48: double, _49: double, _50: double, _51: double, _52: double, _53: double, _54: double, _55: double, _56: double, _57: double, _58: double, _59: double, _60: double, _61: double, _62: double, _63: double, _64: double, _65: double, _66: double, _67: double, _68: double, _69: double, _70: double, _71: double, _72: double, _73: double, _74: double, _75: double, _76: double, _77: double