# Encode columns in csv file

Рассмотрим разные способы кодирования данных

Пример:
 - в кодировке __A__ мы хотим видеть `a` с позицией `1`, а `b` с `2`, `c` с `3`
 - в кодировке __B__ мы хотим видеть `a` с позицией `2`, а `b` с `3`, `c` с `1`
 
 Кодировкой `A` трансформируем все колонки:
 
| c1| c2 |
|-----|-----|
| a | a|
| b | b|
| c | b|

Результат

| c1_enc| c2_enc |
|-----|-----|
| 1 | 1|
| 2 | 2|
| 3 | 2|

Если `c1` кодируется кодировкой `A`, а `c2` — кодировкой `B`, то результат:

| c1_enc| c2_enc |
|-----|-----|
| 1 | 2|
| 2 | 3|
| 3 | 3|

### Initialize Spark session / load data


In [111]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the
# name is kept because later cells reference it.
sc = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

# SQLContext expects a SparkContext as its first argument, so pass the
# session's context and hand over the session itself explicitly.
sqlContext = SQLContext(sc.sparkContext, sparkSession=sc)
# header=True: the first line of the CSV file supplies the column names.
df = sqlContext.read.csv("data-1600cols.csv", header=True)

In [112]:
# Display the Spark configuration of the running context as (key, value) pairs.
sc.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('hive.metastore.warehouse.dir', 'file:/home/jovyan/work/spark-warehouse/'),
 ('spark.app.name', 'Encode multiple columns'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1566927012283'),
 ('spark.driver.port', '43689'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '172.17.0.7')]

In [113]:
# Report the shape of the loaded frame: row count, then column count.
row_count = df.count()
column_count = len(df.columns)
print('Number of rows: {}\nNumber of columns: {}'.format(row_count, column_count))

Number of rows: 1000
Number of columns: 1600


In [114]:
# Any nulls? Collect the rows where column V2 is null (an empty list means none).
df.where(df.V2.isNull()).collect()

[]

In [115]:
# Preview columns V1-V3; show() without arguments prints only the top 20 rows.
df.select('V1','V2','V3').show()

+---+---+---+
| V1| V2| V3|
+---+---+---+
|  j|  n|  d|
|  d|  n|  w|
|  p|  h|  a|
|  b|  h|  e|
|  z|  x|  u|
|  b|  e|  v|
|  y|  t|  x|
|  i|  r|  e|
|  x|  e|  g|
|  l|  j|  z|
|  l|  v|  l|
|  z|  n|  h|
|  s|  m|  c|
|  g|  m|  f|
|  i|  p|  n|
|  i|  f|  b|
|  u|  n|  j|
|  s|  o|  e|
|  k|  y|  c|
|  h|  b|  i|
+---+---+---+
only showing top 20 rows



## Первый способ

Используем функцию `translate` из `pyspark.sql.functions` и применим её при создании новой колонки через `withColumn`.

In [116]:
import pyspark.sql.functions as f

# Toy frame matching the example from the introduction: two columns, three rows.
test_df = sqlContext.createDataFrame([('a', 'a'), ('b', 'b'), ('c', 'b')], ['c1', 'c2'])
test_df.show()

chars = "abc"  # source alphabet
A = "123"  # encoding A: a -> 1, b -> 2, c -> 3
B = "231"  # encoding B: a -> 2, b -> 3, c -> 1 (not applied in this cell)

# translate() replaces each character found in the source alphabet with the
# character at the same position in the encoding string.  The alphabet must be
# `chars` itself: a longer literal such as "abcd" has no counterpart for its
# extra characters, and translate() then deletes them from the value.
for col_name in ["c1", "c2"]:
    test_df = test_df.withColumn(col_name + '_enc', f.translate(f.col(col_name), chars, A))

test_df.show()

+---+---+
| c1| c2|
+---+---+
|  a|  a|
|  b|  b|
|  c|  b|
+---+---+

+---+---+------+------+
| c1| c2|c1_enc|c2_enc|
+---+---+------+------+
|  a|  a|     1|     1|
|  b|  b|     2|     2|
|  c|  b|     3|     2|
+---+---+------+------+



Увеличим размерности и число вариантов

In [117]:
import string
import random

# Fix the seed so the randomly drawn encodings are reproducible.
random.seed(30)

chars = string.ascii_lowercase
# One random digit per source letter; encodingA is drawn first, then encodingB.
encodingA = ''.join(random.choice(string.digits) for _ in chars)
encodingB = ''.join(random.choice(string.digits) for _ in chars)

print("Encodings:")
for line in (chars, encodingA, encodingB):
    print(line)
print("-"*26)

# (columns, encoding) pairs, processed in order: A for V1/V3, then B for V2/V4.
encoding_plan = (
    (["V1", "V3"], encodingA),
    (["V2", "V4"], encodingB),
)

new_df = df
for columns, encoding in encoding_plan:
    for col_name in columns:
        # Each encoded copy is stored next to the source column as <name>_enc.
        new_df = new_df.withColumn(col_name + '_enc', f.translate(f.col(col_name), chars, encoding))

new_df.select("V1", "V2", "V3", "V4", "V1_enc", "V2_enc", "V3_enc", "V4_enc").show()

Encodings:
abcdefghijklmnopqrstuvwxyz
84909340662170830129865816
03946914819742444812351068
--------------------------
+---+---+---+---+------+------+------+------+
| V1| V2| V3| V4|V1_enc|V2_enc|V3_enc|V4_enc|
+---+---+---+---+------+------+------+------+
|  j|  n|  d|  m|     6|     2|     0|     4|
|  d|  n|  w|  y|     0|     2|     5|     6|
|  p|  h|  a|  h|     3|     4|     8|     4|
|  b|  h|  e|  t|     4|     4|     9|     2|
|  z|  x|  u|  d|     6|     0|     8|     4|
|  b|  e|  v|  j|     4|     6|     6|     1|
|  y|  t|  x|  w|     1|     2|     8|     1|
|  i|  r|  e|  q|     6|     8|     9|     4|
|  x|  e|  g|  s|     8|     6|     4|     1|
|  l|  j|  z|  h|     1|     1|     6|     4|
|  l|  v|  l|  w|     1|     5|     1|     1|
|  z|  n|  h|  z|     6|     2|     0|     8|
|  s|  m|  c|  z|     2|     4|     9|     8|
|  g|  m|  f|  j|     4|     4|     3|     1|
|  i|  p|  n|  h|     6|     4|     0|     4|
|  i|  f|  b|  r|     6|     9|     4|     8|
|  u|  

Именование колонок

In [118]:
# Same encodings as before, but the result is written back under the source
# column's own name instead of a separate *_enc column.
new_df = df
for columns, encoding in ((["V1", "V3"], encodingA), (["V2", "V4"], encodingB)):
    # encodingA goes to V1 and V3, encodingB to V2 and V4
    for col_name in columns:
        new_df = new_df.withColumn(col_name, f.translate(f.col(col_name), chars, encoding))

new_df.select("V1", "V2", "V3", "V4").show(3)

+---+---+---+---+
| V1| V2| V3| V4|
+---+---+---+---+
|  6|  2|  0|  4|
|  0|  2|  5|  6|
|  3|  4|  8|  4|
+---+---+---+---+
only showing top 3 rows



In [119]:
# Column names with even numbers (V2, V4) and with odd numbers (V1, V3).
cols_e = ["V{}".format(n) for n in range(2, 5, 2)]
cols_o = ["V{}".format(n) for n in range(1, 4, 2)]

print(cols_e)
print(cols_o)

new_df = df
# encodingA is applied to the odd-numbered columns first, then encodingB to
# the even-numbered ones; each column is overwritten under its own name.
for columns, encoding in ((cols_o, encodingA), (cols_e, encodingB)):
    for col_name in columns:
        new_df = new_df.withColumn(col_name, f.translate(f.col(col_name), chars, encoding))

new_df.select(["V{}".format(n) for n in range(1, 5)]).show(3)

['V2', 'V4']
['V1', 'V3']
+---+---+---+---+
| V1| V2| V3| V4|
+---+---+---+---+
|  6|  2|  0|  4|
|  0|  2|  5|  6|
|  3|  4|  8|  4|
+---+---+---+---+
only showing top 3 rows



## Второй способ
Напишем `udf` (user-defined functions)

In [120]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType

# Define the encoding table: o[0] is the source alphabet, o[1] holds the
# replacement characters (encodingA), matched to the alphabet by position.
o = ["abcdefghijklmnopqrstuvwxyz", encodingA]

def enc(*a):
    """Encode a single character using the module-level table ``o``.

    Only the first positional argument is used.  It is compared with each
    character of ``o[0]``; on the first match the character at the same
    position in ``o[1]`` is returned.  A value that matches nothing
    (including ``None``) is returned unchanged.
    """
    value = a[0]
    # enumerate() yields position and character together, so the source
    # alphabet is never indexed by hand.
    for position, source_char in enumerate(o[0]):
        if value == source_char:
            return o[1][position]
    return value

# Wrap the Python function as a Spark UDF that returns strings.
encode_udf = udf(enc, StringType())

# Odd-numbered columns V1, V3, V5.
cols_o = ["V"+str(i) for i in range(1, 7, 2)]
print(cols_o)

# Show the source columns followed by their encoded counterparts.  The select
# list is derived from cols_o so the printed names and the shown columns
# cannot drift apart.
encoded_cols = [encode_udf(name).alias(name + "_enc") for name in cols_o]
(
df.select(*cols_o, *encoded_cols)
    .show(10)
)


['V1', 'V3', 'V5']
+---+---+---+------+------+------+
| V1| V3| V5|V1_enc|V3_enc|V5_enc|
+---+---+---+------+------+------+
|  j|  d|  s|     6|     0|     2|
|  d|  w|  l|     0|     5|     1|
|  p|  a|  w|     3|     8|     5|
|  b|  e|  x|     4|     9|     8|
|  z|  u|  b|     6|     8|     4|
|  b|  v|  u|     4|     6|     8|
|  y|  x|  z|     1|     8|     6|
|  i|  e|  k|     6|     9|     2|
|  x|  g|  s|     8|     4|     2|
|  l|  z|  l|     1|     6|     1|
+---+---+---+------+------+------+
only showing top 10 rows



In [121]:
# Build the select lists with comprehensions instead of spelling out columns:
# encode every odd-numbered column V1..V99, then preview the first ten of them.
encoded_columns = [
    encode_udf("V" + str(i)).alias("V" + str(i) + "_enc")
    for i in range(1, 100, 2)
]
new_df = df.select(encoded_columns)

preview_names = ["V" + str(i) + "_enc" for i in range(1, 21, 2)]
new_df.select(preview_names).show(10)

+------+------+------+------+------+-------+-------+-------+-------+-------+
|V1_enc|V3_enc|V5_enc|V7_enc|V9_enc|V11_enc|V13_enc|V15_enc|V17_enc|V19_enc|
+------+------+------+------+------+-------+-------+-------+-------+-------+
|     6|     0|     2|     6|     9|      8|      2|      2|      3|      6|
|     0|     5|     1|     8|     0|      2|      9|      6|      8|      2|
|     3|     8|     5|     4|     8|      3|      9|      0|      2|      9|
|     4|     9|     8|     0|     9|      0|      9|      2|      8|      0|
|     6|     8|     4|     0|     9|      2|      8|      6|      6|      6|
|     4|     6|     8|     5|     8|      6|      5|      6|      6|      4|
|     1|     8|     6|     0|     4|      8|      4|      5|      5|      1|
|     6|     9|     2|     5|     8|      8|      5|      4|      0|      1|
|     8|     4|     2|     2|     2|      2|      9|      7|      8|      0|
|     1|     6|     1|     9|     6|      2|      6|      1|      1|      2|