In [6]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark.sql.functions as func
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
# Low-level RDD entry point: reuse the active SparkContext or create one.
sc = SparkContext.getOrCreate()
# DataFrame/SQL entry point: reuse the active SparkSession or build one.
ss = SparkSession.builder.getOrCreate()

In [2]:
def toDoubleSafe(value):
    """Convert ``value`` to ``float``, returning ``None`` when it cannot be converted.

    Parameters
    ----------
    value : any
        Typically a raw CSV field (string); may also be ``None`` or a number.

    Returns
    -------
    float or None
        ``float(value)`` on success, ``None`` for unparseable text (including
        the empty string), for ``None``/non-numeric objects, and for integers
        too large to be represented as a float.
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: text that is not a number (e.g. a header cell or "").
        # TypeError: None or another non-numeric object.
        # OverflowError: an int too large for a double.
        return None

In [3]:
def toIntSafe(value):
    """Convert ``value`` to ``int``, returning ``None`` when it cannot be converted.

    Parameters
    ----------
    value : any
        Typically a raw CSV field (string); may also be ``None`` or a number.

    Returns
    -------
    int or None
        ``int(value)`` on success, ``None`` for text that is not an integer
        literal (e.g. ``"3.7"`` or ``""``), for ``None``/non-numeric objects,
        and for infinite floats.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-integer text, or a NaN float.
        # TypeError: None or another non-numeric object.
        # OverflowError: an infinite float.
        return None

In [7]:
# RDD view of the raw CSV: each line is split on commas, fields 1-5 are
# coerced with toDoubleSafe (None when a field does not parse) and fields
# 0 and 6 are kept as the original strings. No header filtering is applied here.
argo = (
    sc.textFile("argo_data.csv")
    .map(lambda line: line.split(","))
    .map(
        lambda fields: (
            fields[0],
            toDoubleSafe(fields[1]),
            toDoubleSafe(fields[2]),
            toDoubleSafe(fields[3]),
            toDoubleSafe(fields[4]),
            toDoubleSafe(fields[5]),
            fields[6],
        )
    )
)

# Explicit schema for the DataFrame reader; every column is declared nullable.
argo_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("profile_id", StringType()),
        ("pres", DoubleType()),
        ("temp", DoubleType()),
        ("lat", DoubleType()),
        ("lon", DoubleType()),
        ("psal", DoubleType()),
        ("date", TimestampType()),
    )
])

# DataFrame view of the same file, read with the header row and the schema above.
argo_df = ss.read.csv("argo_data.csv", schema=argo_schema, header=True)

In [11]:
# Preview the first three rows of the loaded DataFrame.
argo_df.show(n=3)

+----------+----+-----+------+------+------+-------------------+
|profile_id|pres| temp|   lat|   lon|  psal|               date|
+----------+----+-----+------+------+------+-------------------+
|5905377_14|2.91|1.048|-56.73|148.24|33.739|2018-06-17 16:59:45|
|5905377_14|4.21|1.049|-56.73|148.24|33.738|2018-06-17 16:59:45|
|5905377_14|6.21|1.049|-56.73|148.24|33.738|2018-06-17 16:59:45|
+----------+----+-----+------+------+------+-------------------+
only showing top 3 rows



In [46]:
from pyspark.sql.functions import *

def _temps_sorted_by_pres(temp_pres_pairs):
    """Return the temperatures of one profile ordered by ascending pressure.

    Parameters
    ----------
    temp_pres_pairs : list of [temp, pres] or None
        One two-element entry per measurement, as produced by
        ``array(temp, pres)`` followed by ``collect_list``.

    Returns
    -------
    list of float or None
        The ``temp`` values ordered by their ``pres``. Entries whose ``pres``
        is null are placed first instead of raising ``TypeError`` when
        compared with numbers. ``None`` input yields ``None``.
    """
    if temp_pres_pairs is None:
        return None
    ordered_pairs = sorted(
        temp_pres_pairs,
        # (False, None) sorts before (True, <number>), so null pressures never
        # get compared against floats; ties keep their incoming order.
        key=lambda pair: (pair[1] is not None, pair[1]),
    )
    return [pair[0] for pair in ordered_pairs]


# Spark UDF wrapper; kept under its original name for any later cells that use it.
insane_sort = func.udf(_temps_sorted_by_pres, ArrayType(DoubleType()))

# One row per profile: gather (temp, pres) pairs, then order the temperatures by
# pressure. The explicit sort is what fixes the order of the collected list.
argo_df_listed = (
    argo_df
    .select(
        "profile_id",
        func.array(argo_df["temp"], argo_df["pres"]).alias("temp_pres"),
    )
    .groupBy("profile_id")
    .agg(func.collect_list("temp_pres").alias("temp_pres_list"))
    .select(
        "profile_id",
        insane_sort(func.col("temp_pres_list")).alias("temp_pres_list"),
    )
)
argo_df_listed.show()

+-----------+--------------------+
| profile_id|      temp_pres_list|
+-----------+--------------------+
| 5905293_29|[28.567, 28.566, ...|
|4901659_142|[24.296, 24.296, ...|
|5904161_236|[-1.671, -1.657, ...|
|5902385_124|[8.42, 8.42, 8.42...|
| 2902676_53|[30.067, 29.848, ...|
|5903601_240|[14.992, 12.59, 1...|
|3901205_123|[24.722, 24.729, ...|
| 5902476_72|[19.721, 19.72, 1...|
|5903808_221|[29.881, 29.9, 29...|
|5904597_169|[-0.506, -0.49, -...|
|2902656_254|[29.885, 29.894, ...|
| 2902717_10|[29.531, 29.54, 2...|
|4901409_231|[20.468, 20.353, ...|
| 5904767_79|[-0.222, -0.22, -...|
| 5905353_25|[10.136, 10.136, ...|
|4901565_183|[8.667, 8.667, 8....|
|5901691_348|[9.369, 9.371, 9....|
|5903840_227|[29.45, 29.448, 2...|
|5904715_109|[28.984, 28.998, ...|
| 6903232_13|[5.107, 5.108, 5....|
+-----------+--------------------+
only showing top 20 rows

