In [6]:
from typing import List
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import types as T
from pyspark.sql import functions as F
from collections import Counter
from pyspark import SparkContext, RDD
from csv import reader
import itertools
import rdd_util

In [4]:
from pyspark import SparkContext
spark = SparkSession \
    .builder \
    .appName("project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [5]:
path = '/user/hm74/NYCOpenData/2232-dj5q.tsv.gz'

In [30]:
rdd = sc.textFile(path,1).mapPartitions(lambda x: reader(x, delimiter='\t')).zipWithIndex()

In [87]:
header = rdd.filter(lambda x: x[1] == 0) \
    .map(lambda x: (x[0])).collect()[0]  # extract the first part, ignore idx
rows = rdd.filter(lambda x: x[1] != 0).map(lambda x: x[0])

In [154]:
# Transform to [(col_idx, value),(col_idx, value)...]
def d(x,header=header):
    return [(header[i], x[i]) for i in range(len(x))]

items = rows.flatMap(d).cache()

In [308]:
items.take(5)

[('category',
  'number of individuals who are on wait-list - DYCD-administered transitional independent living facilities'),
 ('single men', ''),
 ('single women', ''),
 ('total single adults', ''),
 ('families with children', '')]

In [155]:
def is_int(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

In [156]:
def is_float(value):
    if('.' not in value):
        return False
    try:
        float(value)
        return True
    except ValueError:
        return False

In [157]:
a= [None]*6

## Transform to [(col_idx,[value,type]),...]

In [158]:
def mapd(x):
    # [col_idx, (value, type)]
    res = (x[0],[x[1],None])
    if(x[1]==''):
        res[1][1] = 'empty'
    elif(is_int(x[1])):
        res[1][1]= 'int'
    elif(is_float(x[1])):
        res[1][1] = 'real'
    else:
        res[1][1] = 'text'
    return res
maped_items = items.map(mapd)

In [201]:
maped_items.take(12)

[('category',
  ['number of individuals who are on wait-list - DYCD-administered transitional independent living facilities',
   'text']),
 ('single men', ['', 'empty']),
 ('single women', ['', 'empty']),
 ('total single adults', ['', 'empty']),
 ('families with children', ['', 'empty']),
 ('total families', ['', 'empty']),
 ('total adults in families', ['', 'empty']),
 ('total children', ['', 'empty']),
 ('data period', ['201810', 'int']),
 ('category',
  ['Average Length of Stay: DHS -administered facility (by type, excluding drop-in and faith-based)',
   'text']),
 ('single men', ['', 'empty']),
 ('single women', ['', 'empty'])]

## Test aggregate

In [163]:
def seqOp(local_rest, row: List):
    if (row[1]):
        local_rest[row[0]] += 1
    return local_rest

def combOp(local_1, local_2):
    return [local_1[0]+local_2[0],local_1[1]+local_2[1]]

listRDD = sc.parallelize([(0,'a'),(1,'b'),(0,'a'),(1,'b'),(1,'c')], 2)
listRDD.aggregate([0,0],seqOp, combOp)

[2, 3]

## Count the null-empty, empty, total 

In [213]:
def seqFunc(local, x):
    res= [i for i in local];
    if(x[1]!='empty'):
        res[0] = local[0]+1
    else:
        res[1] = local[1]+1
    res[2] = local[2]+1
    return res
    
combFunc = (lambda x, y: (x[0]+y[0], x[1] + y[1],x[2]+y[2]))

count = maped_items.aggregateByKey((0,0,0),seqFunc,combFunc)

In [214]:
count.collect()

[('category', [176, 9, 185]),
 ('single men', [89, 96, 185]),
 ('single women', [44, 141, 185]),
 ('total single adults', [44, 141, 185]),
 ('families with children', [33, 152, 185]),
 ('total families', [33, 152, 185]),
 ('total adults in families', [34, 151, 185]),
 ('total children', [54, 131, 185]),
 ('data period', [185, 0, 185])]

## Count frequency and distinct num

In [267]:
freq_items = items.map(lambda x: ((x[0],x[1]),1)).aggregateByKey((0,0),(lambda x, y: (0, x[1] + 1)),(lambda x, y: (x[1] + y[1])))

In [268]:
freq_items.take(5)

[(('category',
   'number of individuals who are on wait-list - DYCD-administered transitional independent living facilities'),
  (0, 11)),
 (('single men', ''), (0, 96)),
 (('single women', ''), (0, 141)),
 (('total single adults', ''), (0, 141)),
 (('families with children', ''), (0, 152))]

In [269]:
freq_items.map(lambda x: (x[0][0],x[0][1],x[1][1])).take(5)

[('category',
  'number of individuals who are on wait-list - DYCD-administered transitional independent living facilities',
  11),
 ('single men', '', 96),
 ('single women', '', 141),
 ('total single adults', '', 141),
 ('families with children', '', 152)]

In [270]:
freq_items = freq_items.map(lambda x: ((x[0][0]),(x[0][1],x[1][1])))

In [310]:
freq_items.take(10)

[('category',
  ('number of individuals who are on wait-list - DYCD-administered transitional independent living facilities',
   11)),
 ('single men', ('', 96)),
 ('single women', ('', 141)),
 ('total single adults', ('', 141)),
 ('families with children', ('', 152)),
 ('total families', ('', 152)),
 ('total adults in families', ('', 151)),
 ('total children', ('', 131)),
 ('data period', ('201810', 16)),
 ('category',
  ('Average Length of Stay: DHS -administered facility (by type, excluding drop-in and faith-based)',
   11))]

In [311]:
test = freq_items.sortBy(lambda x: x[1][1],ascending=False).groupByKey()

In [303]:
import itertools

In [305]:
list(itertools.islice(test.take(1)[0][1], 5))

[('', 152), ('7', 5), ('5', 4), ('8', 3), ('13', 3)]

In [322]:
def top5(x):
    res = [len(x)]
    return [len(x),*list(itertools.islice(x,5))]        

In [323]:
test.mapValues(top5).take(10)

[('families with children',
  [18, ('', 152), ('7', 5), ('5', 4), ('8', 3), ('13', 3)]),
 ('total families', [18, ('', 152), ('7', 5), ('5', 4), ('8', 3), ('13', 3)]),
 ('total adults in families',
  [20, ('', 151), ('7', 5), ('8', 3), ('4', 3), ('3', 3)]),
 ('single women', [41, ('', 141), ('60', 2), ('27', 2), ('38', 2), ('61', 2)]),
 ('total single adults',
  [39, ('', 141), ('117', 3), ('77', 2), ('42', 2), ('113', 2)]),
 ('total children',
  [38, ('', 131), ('3', 3), ('0.95', 3), ('10', 3), ('5', 3)]),
 ('single men',
  [71, ('', 96), ('0.92', 7), ('0.91', 4), ('0.93', 3), ('0.89', 3)]),
 ('data period',
  [11,
   ('201908', 19),
   ('201905', 18),
   ('201906', 18),
   ('201907', 18),
   ('201810', 16)]),
 ('category',
  [18,
   ('number of individuals who are on wait-list - DYCD-administered transitional independent living facilities',
    11),
   ('Average Length of Stay: DHS -administered facility (by type, excluding drop-in and faith-based)',
    11),
   ('Average Length of S

In [358]:
def generate_distinct_top5(items: RDD) -> List:
    freq_items = items.map(lambda x: ((x[0], x[1]), 1)) \
        .aggregateByKey((0, 0),
                        (lambda x, y: (0, x[1] + 1)),
                        (lambda x, y: (x[1] + y[
                            1]))) \
        .map(lambda x: ((x[0][0]), (x[0][1], x[1][1])))
    sorted_grouped_freq_items = freq_items.sortBy(lambda x: x[1][1], ascending=False).groupByKey()
    res = sorted_grouped_freq_items.mapValues(lambda x: (len(x),list(itertools.islice(x,5))))
    return res

def generate_null_empty(maped_items: RDD) -> List:
    def seqFunc(local, x):
        res = [i for i in local];
        if (x[1] != 'empty'):
            res[0] = local[0] + 1
        else:
            res[1] = local[1] + 1
        res[2] = local[2] + 1
        return res

    combFunc = (lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))
    count = maped_items.aggregateByKey((0, 0, 0), seqFunc, combFunc)
    # null-empty, empty, total
    # [('category', [176, 9, 185]),...]
    return count

In [359]:
res1= generate_null_empty(maped_items)
res2 = generate_distinct_top5(items)

In [361]:
res2.take(1)

[('families with children',
  (18, [('', 152), ('7', 5), ('5', 4), ('8', 3), ('13', 3)]))]

In [362]:
res1.take(1)

[('category', [176, 9, 185])]

In [386]:
flat_res = res1.join(res2).map(lambda x: (x[0],(*x[1][0],*x[1][1]))).collect()

In [387]:
flat_res

[('category',
  (176,
   9,
   185,
   18,
   [('number of individuals who are on wait-list - DYCD-administered transitional independent living facilities',
     11),
    ('Average Length of Stay: DHS -administered facility (by type, excluding drop-in and faith-based)',
     11),
    ('Average Length of Stay: DYCD -administered crisis facility', 11),
    ('number of unduplicated persons - DYCD-administered facilities', 11),
    ('number of unduplicated persons - DYCD-administered crisis shelters',
     11)])),
 ('single men',
  (89,
   96,
   185,
   71,
   [('', 96), ('0.92', 7), ('0.91', 4), ('0.93', 3), ('0.89', 3)])),
 ('single women',
  (44, 141, 185, 41, [('', 141), ('60', 2), ('27', 2), ('38', 2), ('61', 2)])),
 ('total single adults',
  (44,
   141,
   185,
   39,
   [('', 141), ('117', 3), ('77', 2), ('42', 2), ('113', 2)])),
 ('families with children',
  (33, 152, 185, 18, [('', 152), ('7', 5), ('5', 4), ('8', 3), ('13', 3)])),
 ('total families',
  (33, 152, 185, 18, [('', 1

In [375]:
len(header)

9

In [379]:
columns=[]
for res in flat_res:
    column_data = {
        'column_name': res[0],
        'number_non_empty_cells': res[1][0],
        'number_empty_cells': res[1][1],
        'number_distinct_values': res[1][3],
        'frequent_values': res[1][4]
    }
    columns.append(column_data)