# 개요
* 참여중인 데이터 엔지니어링 스터디에서 배우는 내용 정리
  * 데이터 수집, 정제 : pyspark, airflow
  * 저장 : elasticsearch
  * 시각화 : kibana

* 2주차 과제1 : Dataframe/RDD API 실습 (다른 글)
* 2주차 과제2 : 데이터셋 확인하고 스키마 고민하기 (현재 글)
  * printSchema()로 인식된 스키마 확인
  * 날짜데이터지만 string인 필드를 timestamp로 변경하고 json으로 스키마 저장
  * 저장해둔 json스키마로 데이터 읽은 후 데이터 확인
* 2주차 과제3 : 데이터모델 결정하기 (현재 글)
  * 수업에서 제시한 기본 3가지 모델 사용 : github의 Top10 repo/User, Daily statistics
  * 간단히 만들어 볼 모델 1가지 : pytorch repo에서 가장 issue를 활발히 진행한 Top10 User
    * 위 모델에 대한 예상 스키마 작성해보기

# 과제 - 2주차

## 데이터셋 확인 및 스키마 고민하기

* 아래의 순서로 진행
  * spark가 추론한 스키마를 json(dict)으로 변환
  * 저장된 스키마 확인 후, 수정 필요한 부분만 수정
  * 해당 json으로 `spark.read.schema(schema_to_read).json("../data/*.json.gz")`해서 파일 읽고 확인하기
  * 수정한 스키마를 json파일로 저장하기

### 스키마 확인, json변환

In [None]:
# Initial load: read every gzipped JSON file under ../data and print the schema Spark inferred
# NOTE(review): `spark` is not created in any visible cell; presumably provided by the kernel — confirm
github = spark.read.json("../data/*.json.gz")
github.printSchema()

                                                                                

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nul

24/08/25 17:24:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [None]:
# Turn the schema Spark inferred into a plain Python object (nested dicts/lists)
import json

# schema.json() yields a JSON string; json.loads parses it so it can be edited in place
github_schema = json.loads(github.schema.json())

### 컬럼명 기준 타입이 이상한 것 확인하고 타입 수정하기

* 컬럼명을 입력하면 루트부터의 구조('payload.forkee.pushed_at' 등)를 알려주는 함수 작성
* select문으로 데이터 형태 확인 후 필요한 경우 데이터 타입 수정
  * 예를 들어, `created_at`은 string으로 되어있어 timestamp로 타입을 변환함 

In [None]:
# Helper for a suspicious column spotted in the printSchema output:
# reports where that column sits, starting from the schema root (written with GPT)
def find_parents(data, target_name, current_path=None):
    """Find the ancestor chain of every entry whose ``name`` is ``target_name``.

    Args:
        data: Nested dicts/lists, e.g. the result of
            ``json.loads(df.schema.json())``.
        target_name: Value of the ``'name'`` key to look for.
        current_path: Ancestors collected so far; used by the recursion.
            Callers normally omit it.

    Returns:
        A list with one entry per match. Each entry is a new list holding,
        for every enclosing dict, its ``'name'`` value or ``None`` when the
        dict has no ``'name'`` key.
    """
    # A fresh list per call: a shared mutable default could be handed back to
    # the caller (root-level match) and then be corrupted by later mutation.
    if current_path is None:
        current_path = []

    paths = []
    if isinstance(data, dict):
        if 'name' in data and data['name'] == target_name:
            # Store a copy so results never alias the recursion's own list
            paths.append(list(current_path))
        # Every child of this dict shares the same ancestor chain
        child_path = current_path + [data.get('name', None)]
        for value in data.values():
            if isinstance(value, (dict, list)):
                paths.extend(find_parents(value, target_name, child_path))
    elif isinstance(data, list):
        # Lists add no level of their own; pass the path through unchanged
        for item in data:
            paths.extend(find_parents(item, target_name, current_path))
    return paths

def format_paths(paths):
    """Render each ancestor list as a dotted string.

    ``None`` placeholders (and any other falsy entries) are skipped, so a
    list containing only placeholders becomes an empty string.
    """
    return ['.'.join(part for part in ancestors if part) for ancestors in paths]

# Print the full dotted path of every column that has the given name
name_of_column = 'pushed_at'
for path in format_paths(find_parents(github_schema, name_of_column)):
    # A top-level column has an empty parent path; join only the non-empty
    # parts so it prints as "created_at" instead of ".created_at".
    print('.'.join(filter(None, [path, name_of_column])))

payload.forkee.pushed_at
payload.pull_request.base.repo.pushed_at
payload.pull_request.head.repo.pushed_at


In [None]:
# Inspect the raw values of the suspicious column with a select
# `col` is not imported in any visible cell, so import the functions module
# here and use the same `F` alias as the rest of the notebook.
import pyspark.sql.functions as F

github.select(
    F.col('payload.forkee.pushed_at')
).distinct().show()


[Stage 86:>                                                         (0 + 1) / 1]

+--------------------+
|           pushed_at|
+--------------------+
|2024-02-20T15:01:04Z|
|2024-05-19T04:16:24Z|
|2024-05-19T03:32:00Z|
|2024-05-10T16:43:22Z|
|2024-05-08T13:46:33Z|
|2021-11-14T06:52:01Z|
|2024-05-14T14:26:38Z|
|2024-05-09T13:46:19Z|
|2024-05-01T22:05:15Z|
|2024-05-18T16:06:07Z|
|2024-02-07T03:31:38Z|
|2024-05-11T19:54:42Z|
|2024-05-15T12:37:12Z|
|2024-05-19T14:51:30Z|
|2024-05-18T18:56:16Z|
|2024-05-17T13:05:15Z|
|2024-05-19T10:01:48Z|
|2024-05-18T04:16:44Z|
|2024-02-01T07:50:08Z|
|2024-05-19T13:56:05Z|
+--------------------+
only showing top 20 rows




                                                                                

In [None]:
# Schema fix-up table: target type name -> names of the columns that should get that type
dict_typetable = {'timestamp':['created_at', 'updated_at','closed_at','merged_at','pushed_at']}

def update_type(data, dict_typetable):
    """Rewrite, in place, the type of every listed column in a schema JSON.

    Args:
        data: Schema in the nested dict/list form produced by
            ``json.loads(df.schema.json())``. It is modified in place.
        dict_typetable: Maps a type name (e.g. ``'timestamp'``) to the list
            of column names that should be given that type.

    Returns:
        None. The change is visible through ``data``.

    Notes:
        Only fields whose current type is a plain type name are retyped.
        A field whose type is itself a nested description (struct, or a
        wrapper such as an array with an ``elementType``) is descended into
        instead, so matching columns inside it are updated and the container
        is never replaced by a scalar type.
    """
    if isinstance(data, dict):
        if 'fields' in data:
            for item in data['fields']:
                field_type = item['type']
                if isinstance(field_type, dict):
                    # Nested description: recurse rather than overwrite it
                    update_type(field_type, dict_typetable)
                else:
                    for key, list_column in dict_typetable.items():
                        if item.get('name') in list_column:
                            item['type'] = key  # key = type name ('timestamp', ...)
        else:
            # Wrapper without 'fields': look for structs in its values.
            # The table must be passed along; omitting it raised TypeError.
            for value in data.values():
                update_type(value, dict_typetable)
    elif isinstance(data, list):
        for item in data:
            update_type(item, dict_typetable)

# Apply the type table to the schema dict; update_type edits github_schema in place
update_type(github_schema, dict_typetable)

### 수정한 스키마로 파일 읽어서 확인하기

* `spark.read.schema(schema_to_read).json('file.json')`으로 읽기
* 변경된 데이터 확인하기 : 값의 표시 형식이 `2024-02-20T15:01:04Z`(string) 꼴에서 `2024-05-19 12:00:33`(timestamp) 꼴로 바뀜 (두 값은 서로 다른 행의 예시)
* printSchema로 변경된 타입 확인하기

In [None]:
# Read with the edited schema (only one GH Archive file is used, for a quick check)
from pyspark.sql.types import StructType

# Rebuild a StructType from the edited schema dict and pass it to the reader
schema_to_read = StructType.fromJson(github_schema)
df = spark.read.schema(schema_to_read).json("../data/gh_archive/2024-07-01-14.json.gz")

In [None]:
# Check the values after the schema change
import pyspark.sql.functions as F
# Same column that was inspected before the change; show its distinct values
df.select(
    F.col("payload.forkee.pushed_at")
).distinct().show()


[Stage 1:>                                                          (0 + 1) / 1]

+-------------------+
|          pushed_at|
+-------------------+
|2024-05-19 12:00:33|
|2024-05-13 22:33:41|
|2021-11-26 09:57:24|
|2024-05-03 16:34:07|
|2024-04-30 03:16:52|
|2024-03-17 20:21:22|
|2024-05-18 13:28:47|
|2024-05-19 14:25:45|
|2024-04-11 23:36:42|
|2021-01-06 17:18:27|
|2023-11-11 18:39:26|
|2024-05-19 05:03:02|
|2021-11-13 05:20:38|
|2024-05-13 23:53:03|
|2024-05-16 01:11:03|
|2024-05-17 21:07:06|
|2021-10-29 01:29:12|
|2024-05-08 12:18:27|
|2024-03-18 05:24:37|
|2024-02-28 19:55:17|
+-------------------+
only showing top 20 rows




                                                                                

In [None]:
# Check the schema of the DataFrame that was read with the edited schema
df.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (

### 확인완료 후 재사용을 위해 스키마를 json파일로 저장

In [None]:
# Persist the schema dict as a JSON file
output_file_path = 'github_schema.json'

# Serialize first (`indent` gives pretty-printed output), then write the text out
schema_text = json.dumps(github_schema, indent=4)
with open(output_file_path, 'w') as json_file:
    json_file.write(schema_text)

print(f"Data has been saved to {output_file_path}")

## 데이터셋 저장 스키마 결정 & 공유

In [None]:
# Load the schema that was saved as a JSON file

import json
from pyspark.sql.types import StructType

# Location of the schema file
input_file_path = 'github_schema.json'

# Read the whole file and parse it back into nested dicts/lists
with open(input_file_path, 'r') as json_file:
    github_schema = json.loads(json_file.read())

In [None]:
# (Option 1) Read with the saved schema (only part of the GH Archive data, for a quick check)
from pyspark.sql.types import StructType

# Rebuild a StructType from the loaded schema dict and pass it to the reader
schema_to_read = StructType.fromJson(github_schema)
df = spark.read.schema(schema_to_read).json("../data/gh_archive/2024-08-24-*.json.gz")

24/09/01 14:30:13 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [None]:
# (Option 2) Read with the saved schema, matching every .json.gz file in the directory
from pyspark.sql.types import StructType

# Rebuild a StructType from the loaded schema dict and pass it to the reader
schema_to_read = StructType.fromJson(github_schema)
df = spark.read.schema(schema_to_read).json("../data/gh_archive/*.json.gz")

## 데이터 정제방법 고민하기

### 데이터 확인하기

* 전체 컬럼명을 보여주는 함수 작성 및 확인
* select문으로 해당 컬럼의 데이터 확인하기
* select문으로 해당 컬럼의 데이터 중 filter해서 확인하기

In [None]:
def find_all_names(data, current_path=None):
    """List a dotted path for every dict in ``data`` that has a ``'name'`` key.

    Args:
        data: Nested dicts/lists, e.g. the result of
            ``json.loads(df.schema.json())``.
        current_path: Path segments collected so far; used by the recursion.
            Callers normally omit it.

    Returns:
        A list of strings in traversal order. Each level contributes the
        enclosing dict's ``'name'`` or, when it has none, the key under which
        the child was found. Structural keys such as ``'fields'`` therefore
        appear in the result (e.g. ``'fields.actor.fields.id'``).
    """
    # Avoid a shared mutable default argument
    if current_path is None:
        current_path = []

    paths = []
    if isinstance(data, dict):
        if 'name' in data:
            # Record the path that leads to this named entry
            paths.append('.'.join(current_path + [data['name']]))
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                paths.extend(find_all_names(value, current_path + [data.get('name', key)]))
    elif isinstance(data, list):
        # Lists add no segment of their own
        for item in data:
            paths.extend(find_all_names(item, current_path))
    return paths

# Collect every column path, with the structural 'fields.' segments removed.
# The collection is no longer cut off by the preview limit below, so the list
# really holds all columns.
list_all_column = [
    each_line.replace('fields.', '')
    for each_line in find_all_names(github_schema)
]

# Preview only the first 12 entries to keep the cell output short
for each_line in list_all_column[:12]:
    print(each_line)

actor
actor.avatar_url
actor.display_login
actor.gravatar_id
actor.id
actor.login
actor.url
created_at
id
org
org.avatar_url
org.gravatar_id


In [None]:
# Inspect the data: distinct values over the selected column(s)
import pyspark.sql.functions as F

columns = ['type']
# Alias each expression with its full path so the output header shows it
select_exprs = [F.col(path).alias(path) for path in columns]

df.select(*select_exprs).distinct().show(n=10, truncate=False)

                                                                                

+-----------------------------+
|type                         |
+-----------------------------+
|PullRequestReviewEvent       |
|PushEvent                    |
|GollumEvent                  |
|ReleaseEvent                 |
|CommitCommentEvent           |
|CreateEvent                  |
|PullRequestReviewCommentEvent|
|IssueCommentEvent            |
|DeleteEvent                  |
|IssuesEvent                  |
+-----------------------------+
only showing top 10 rows



In [None]:
# Inspect the data: distinct values over the selected column(s)
import pyspark.sql.functions as F

columns = ['payload.issue.assignee.login']
# Alias each expression with its full path so the output header shows it
select_exprs = [F.col(path).alias(path) for path in columns]

df.select(*select_exprs).distinct().show(n=10, truncate=False)



+----------------------------+
|payload.issue.assignee.login|
+----------------------------+
|Vladimir563                 |
|Z3rio                       |
|albar965                    |
|mrbubbles-src               |
|0xSaksham                   |
|deepak1556                  |
|Artem-Ter                   |
|jonatfoodgroup              |
|GbCyber                     |
|IsaiahHarvi                 |
+----------------------------+
only showing top 10 rows




                                                                                

In [None]:
# Inspect the data: distinct values over the selected column(s)
import pyspark.sql.functions as F

columns = ['actor.login']
# Alias each expression with its full path so the output header shows it
select_exprs = [F.col(path).alias(path) for path in columns]

df.select(*select_exprs).distinct().show(n=10, truncate=False)



+-----------------+
|actor.login      |
+-----------------+
|ItalloK          |
|youmomlmao       |
|InayatUllahKhan10|
|Zireael07        |
|smsrkursat       |
|BiancaDavey      |
|rr-weiyi-yu      |
|burstknight      |
|Shabirahmad1676  |
|Ritiky23         |
+-----------------+
only showing top 10 rows




                                                                                

In [None]:
# Inspect the data: distinct values of the column, restricted by a filter
import pyspark.sql.functions as F

columns = ['payload.pull_request.base.repo.language']
select_exprs = [F.col(col_path).alias(col_path) for col_path in columns]

# Filter on the source struct path before projecting. After the select the
# column is a flat one whose name contains dots, so the same F.col(...) path
# no longer addresses it directly. The resulting rows are the same because
# the filtered column is one of the distinct columns.
(
    df.filter(F.col('payload.pull_request.base.repo.language') == 'Python')
    .select(*select_exprs)
    .distinct()
    .show(10)
)



+---------------------------------------+
|payload.pull_request.base.repo.language|
+---------------------------------------+
|                                 Python|
+---------------------------------------+





                                                                                

In [None]:
# Inspect the data: distinct (repo.name, type) pairs for repos matching 'pytorch/pytorch'
import pyspark.sql.functions as F

columns = ['repo.name','type']
select_exprs = [F.col(col_path).alias(col_path) for col_path in columns]

# Filter on the source struct path before projecting. After the select,
# 'repo.name' is a flat column name containing a dot, so F.col('repo.name')
# no longer addresses it directly. The resulting rows are the same because
# the filtered column is one of the distinct columns.
(
    df.filter(F.col('repo.name').contains('pytorch/pytorch'))
    .select(*select_exprs)
    .distinct()
    .show(10, False)
)

                                                                                

+-------------------------+-----------------------------+
|repo.name                |type                         |
+-------------------------+-----------------------------+
|pytorch/pytorch          |DeleteEvent                  |
|pytorch/pytorch          |PullRequestEvent             |
|pytorch/pytorch          |PullRequestReviewCommentEvent|
|pytorch/pytorch          |IssuesEvent                  |
|pytorch/pytorch          |ForkEvent                    |
|pytorch/pytorch          |WatchEvent                   |
|pytorch/pytorch.github.io|PullRequestEvent             |
|pytorch/pytorch          |CreateEvent                  |
|pytorch/pytorch          |IssueCommentEvent            |
|pytorch/pytorch          |PullRequestReviewEvent       |
+-------------------------+-----------------------------+
only showing top 10 rows





                                                                                

In [None]:
# Inspect the data: distinct (repo.name, type) pairs for matching repos, then only IssuesEvent
import pyspark.sql.functions as F

columns = ['repo.name','type']
select_exprs = [F.col(col_path).alias(col_path) for col_path in columns]

# Filter on the source struct path before projecting. After the select,
# 'repo.name' is a flat column name containing a dot, so F.col('repo.name')
# no longer addresses it directly. df1 holds the same rows as before.
df1 = (
    df.filter(F.col('repo.name').contains('pytorch/pytorch'))
    .select(*select_exprs)
    .distinct()
)
# 'type' is a plain top-level column, so it can be filtered on df1 directly
df1.filter(F.col('type').contains('IssuesEvent')).show(10, False)

                                                                                

+---------------+-----------+
|repo.name      |type       |
+---------------+-----------+
|pytorch/pytorch|IssuesEvent|
+---------------+-----------+



### 산출할 데이터 모델결정 및 정제방법 고민

#### 기본과제 데이터 모델
1. Top 10 Repo
- id
- @timestamp
- repo_url
- repo_name
- push_count
- commit_count
- pr_count
- fork_count
- issue_count
- watch_count

2. Top 10 User
- id
- @timestamp
- user_name
- push_count
- commit_count
- pr_count
- issue_count
- issue_comment_count

3. Daily Stats
- id
- @timestamp
- distinct_user_cnt
- distinct_repo_cnt
- push_count
- commit_count
- pr_count
- issue_count
- issue_comment_count
- release_count

#### 추가 데이터 모델 (가능하면 진행..)
4. pytorch 레포지토리에서 가장 Issue를 활발히 진행한 Top 10 User
- repo.name
  - repo_name : repo.name에서 가져온 `ID/레포이름` 형식 데이터가 pytorch/pytorch와 일치하는 행만 filter
- actor.login
  - is_bot : 값이 github-actions[bot]인 경우 1 (bot 제거용)
  - user_made_issue : 유저이름
- type
  - issue_count : 횟수 집계 (type=IssuesEvent)

* 추가 데이터 모델에 대한 스키마 작성해보기

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

# repo.name
repo_schema = StructType([
    StructField('name', StringType(), True),
])

# actor.login
actor_schema = StructType([
    StructField('login', StringType(), True),
])

# type (a single top-level field, hence a StructField rather than a StructType)
type_schema = StructField('type', StringType(), True)

# payload.issue.assignee.login
# (no longer needed after reviewing the logic; kept as a sample of nested StructFields)
payload_schema = StructType([
    StructField('issue', StructType([
        StructField('assignee', StructType([
            StructField('login', StringType(), True),
        ])),
    ])),
])