In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
sc = SparkContext() # dòng này phải nằm trước dòng dưới
spark = SparkSession.builder.config("spark.driver.memory", "10g").getOrCreate()

In [None]:
spark.stop()

# ETL for one day

In [3]:
rdd = sc.textFile("../FPT_Test/DataSampleTest/logt21.txt") \
    .map(lambda line: eval(line)) \
    .collect()
    
df_input = spark.createDataFrame(rdd)
df_input.show()

+-----+--------+--------------------+---------+---------+----------+-------------+-----------+--------+------------------+------------+--------+------+------------+---------+--------------------+------------+---------+-----+------------+-----------+--------------+---------------+------------------+------------+--------------------+--------------------+--------------------+---------+-------------+--------------------+-------------+-------------------+-----+-------------+--------------------+---------------+----------+---------+-------+--------+----+----+
|AppId| AppName|             BoxTime|ChapterID| Contract|CustomerID|DefaultGetway|  Directors|Duration|ElapsedTimePlaying|       Event|Firmware|Folder|          Ip|   ItemId|            ItemName|ListOnFolder|LocalType|LogId|         Mac| PrimaryDNS|PublishCountry|RealTimePlaying|            Screen|SecondaryDNS|             Session|     SessionMainMenu|      SessionSubMenu|SubMenuId|   SubnetMask|                 Url|       ip_wan|      

In [4]:
df_input.count()

97248

In [37]:
df_category = df_input.select("Contract", "AppId", "AppName", "Duration") \
    .where("Contract is not null and Contract != ''") \
    .withColumn("AppNameCategory", 
        when(col("AppId") == "CHANNEL", "TVDuration")
        .when(col("AppId") == "VOD", "MovieDuration")
        .when(col("AppId") == "RELAX", "RelaxDuration")
        .when(col("AppId") == "CHILD", "ChildDuration")
        .when(col('AppName') == 'KPLUS', "SportDuration")
        .otherwise("Unknown")
    ) \
    .orderBy(col("Contract"))
    
df_category.show()

+---------+-----+-------+--------+---------------+
| Contract|AppId|AppName|Duration|AppNameCategory|
+---------+-----+-------+--------+---------------+
|AGD021590|  VOD|    VOD|    NULL|  MovieDuration|
|AGD021590| IPTV|   IPTV|      21|        Unknown|
|AGD021590| HOME|   IPTV|    NULL|        Unknown|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590|RELAX|   IPTV|    NULL|  RelaxDuration|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590|  VOD|    VOD|    3964|  MovieDuration|
|AGD021590| IPTV|   IPTV|      14|        Unknown|
|AGD021590| IPTV|   IPTV|       2|        Unknown|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590| IPTV|   IPTV|      16|        Unknown|
|AGD021590|RELAX|   IPTV|    NULL|  RelaxDuration|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590| IPTV|   IPTV|    NULL|        Unknown|
|AGD021590| IPTV|   IPTV|    NU

In [38]:
df_category2 = df_category.withColumn("Duration", when(col("Duration").isNull(), 0).otherwise(col("Duration"))) \
    .groupBy("Contract", "AppNameCategory").agg(sum("Duration").alias("Duration")) \
    .orderBy(col("Contract"))
    
df_category2.show()

+---------+---------------+--------+
| Contract|AppNameCategory|Duration|
+---------+---------------+--------+
|AGD021590|  RelaxDuration|       0|
|AGD021590|  MovieDuration|    3964|
|AGD021590|        Unknown|   35722|
|AGFDN0001|        Unknown|       0|
|BDD029197|  RelaxDuration|       0|
|BDD029197|        Unknown|   62973|
|BDD029197|  ChildDuration|       0|
|BDD029197|  MovieDuration|   67247|
|BDD056879|  RelaxDuration|       0|
|BDD056879|        Unknown|  192090|
|BDD056879|  MovieDuration|  151866|
|BDD060056|        Unknown|  256458|
|BDD060056|  MovieDuration|       0|
|BDD060056|  ChildDuration|       0|
|BDD061131|        Unknown|       0|
|BDD065085|  RelaxDuration|       0|
|BDD065085|  MovieDuration|  157049|
|BDD065085|        Unknown|  123860|
|BDD065085|  ChildDuration|       0|
|BDD065591|        Unknown|       0|
+---------+---------------+--------+
only showing top 20 rows



In [40]:
df_duration_by_contract = df_category2.groupBy("Contract").pivot("AppNameCategory").agg(sum("Duration").alias("Duration"))
df_duration_by_contract.show()

+---------+-------------+-------------+-------------+-------+
| Contract|ChildDuration|MovieDuration|RelaxDuration|Unknown|
+---------+-------------+-------------+-------------+-------+
|HPFD05148|         NULL|         NULL|            0| 141340|
|DNFD05695|         NULL|         7366|            0|   9657|
|BDD061131|         NULL|         NULL|         NULL|      0|
|SGH100802|         NULL|            0|         NULL|  57839|
|SGDN00211|         NULL|        42084|            0| 612609|
|BGD017419|            0|            0|         NULL| 405334|
|LSFD03035|            0|         NULL|         NULL| 100959|
|HND443796|         NULL|         NULL|         NULL| 190077|
|NAD051106|         NULL|            0|         NULL|      0|
|HND373545|         NULL|         NULL|         NULL|      0|
|CMD013943|            0|       422576|         NULL|  26495|
|BTD028492|         NULL|       553245|            0|  56886|
|BDD065591|         NULL|         NULL|         NULL|      0|
|HNDN000

In [3]:
def etl_one_day(file_path) -> DataFrame:
    # prep input
    rdd = sc.textFile(file_path) \
        .map(lambda line: eval(line)) \
        .collect()
    
    df_input = spark.createDataFrame(rdd)
    
    # categorize app and based on each category, calc sum(duration) 
    df_category = df_input.select("Contract", "AppId", "AppName", "Duration") \
    .where("Contract is not null and Contract != ''") \
    .withColumn("AppNameCategory", 
        when(col("AppId") == "CHANNEL", "TVDuration")
        .when(col("AppId") == "VOD", "MovieDuration")
        .when(col("AppId") == "RELAX", "RelaxDuration")
        .when(col("AppId") == "CHILD", "ChildDuration")
        .when(col('AppName') == 'KPLUS', "SportDuration")
        .otherwise("Unknown")
    ) \
    .withColumn("Duration", when(col("Duration").isNull(), 0).otherwise(col("Duration"))) \
    .groupBy("Contract", "AppNameCategory")\
    .agg(sum("Duration").alias("Duration")) \
    
    # pivot table based on labels in `AppNameCategory`
    df_duration_by_contract = df_category.groupBy("Contract").pivot("AppNameCategory").agg(sum("Duration").alias("Duration"))
    
    return df_duration_by_contract

In [4]:
# Test
df = etl_one_day("../FPT_Test/DataSampleTest/logt21.txt")
df.show()

+---------+-------------+-------------+-------------+-------+
| Contract|ChildDuration|MovieDuration|RelaxDuration|Unknown|
+---------+-------------+-------------+-------------+-------+
|HPFD05148|         NULL|         NULL|            0| 141340|
|DNFD05695|         NULL|         7366|            0|   9657|
|BDD061131|         NULL|         NULL|         NULL|      0|
|SGH100802|         NULL|            0|         NULL|  57839|
|SGDN00211|         NULL|        42084|            0| 612609|
|BGD017419|            0|            0|         NULL| 405334|
|LSFD03035|            0|         NULL|         NULL| 100959|
|HND443796|         NULL|         NULL|         NULL| 190077|
|NAD051106|         NULL|            0|         NULL|      0|
|HND373545|         NULL|         NULL|         NULL|      0|
|CMD013943|            0|       422576|         NULL|  26495|
|BTD028492|         NULL|       553245|            0|  56886|
|BDD065591|         NULL|         NULL|         NULL|      0|
|HNDN000

# ETL for multiple days

Hướng 2 : Đọc và xử lý từng file , sau đó gộp tất cả kết quả lại và group by sum 


In [6]:
import os

folder_path = "../FPT_Test/DataSampleTest/"
log_files = [f for f in os.listdir(folder_path) if f.startswith("log")]
print(log_files)

['logt21.txt', 'logt22.txt', 'logt23.txt', 'logt24.txt', 'logt25.txt', 'logt31.txt', 'logt32.txt']


In [None]:

# map each element IN PARALLEL
list_log_df = list(map(etl_one_day, [folder_path + log_file for log_file in log_files])) 

In [7]:
###
list_log_df = list(map(etl_one_day, [folder_path + log_file for log_file in log_files])) 

In [8]:
len(list_log_df)

7

In [None]:
def mergeDfsWithDifferentColumns(df1, df2):
    df_merged = df1.unionByName(df2, allowMissingColumns=True)
    return df_merged

In [10]:
def process_data_2_days(df1, df2):
    print("---------------Merge and process df of 2 days---------------")
    
    df_merge = df1.unionByName(df2, allowMissingColumns=True)
    
    list_field_names = df_merge.schema.fieldNames() # contain Category column
    list_field_names.remove("Contract")
    
    sum_category_duration = list(map(
        lambda field_name: sum(field_name).alias(field_name),
        list_field_names
    ))
    
    df_processed = df_merge.groupBy("Contract") \
        .agg(*sum_category_duration)
        
    return df_processed

In [11]:
from functools import reduce

df_final = reduce(process_data_2_days, list_log_df[:2])

---------------Merge and process df of 2 days---------------


In [12]:
df_final.count()

202

In [23]:
list_field_names = df_final.schema.fieldNames()
list_field_names.remove("Contract")

list_sum_duration = list(map(
    lambda field_name: sum(field_name).alias(field_name), 
    list_field_names
))

df_duration_by_contract_final = df_final.groupBy("Contract").agg(*list_sum_duration)

print(df_duration_by_contract_final.count())
df_duration_by_contract_final.show()

1825
+---------+-------------+-------------+-------------+-------+
| Contract|ChildDuration|MovieDuration|RelaxDuration|Unknown|
+---------+-------------+-------------+-------------+-------+
|HPFD05148|         NULL|         NULL|            0| 141340|
|DNFD05695|         NULL|         7366|            0|   9657|
|BDD061131|         NULL|         NULL|         NULL|      0|
|SGH100802|         NULL|            0|         NULL|  57839|
|SGDN00211|            0|       106545|            0|1146261|
|BGD017419|            0|            0|         NULL| 405334|
|LSFD03035|            0|         NULL|         NULL| 100959|
|HND443796|         NULL|         NULL|         NULL| 190077|
|NAD051106|         NULL|            0|         NULL|      0|
|HND373545|         NULL|         NULL|         NULL|      0|
|CMD013943|            0|       422576|         NULL|  26495|
|BTD028492|         NULL|       553245|            0|  56886|
|BDD065591|         NULL|         NULL|         NULL|      0|
|HN