In [1]:
import json
import yaml
import re
import os

from glob import glob

In [2]:
import pandas as pd

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

spark = SparkSession.builder \
    .master("local[*]") \
    .appName('dtc-capstone') \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/26 21:54:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
import os
from google.cloud import storage
from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator

def upload_file_to_gcs(bucket, object_name, local_file):
    """
    Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
    :param bucket: GCS bucket name
    :param object_name: target path & file-name
    :param local_file: source path & file-name
    :return:
    """
    # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed.
    # (Ref: https://github.com/googleapis/python-storage/issues/74)
    storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024  # 5 MB
    storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024  # 5 MB
    # End of Workaround

    client = storage.Client()
    bucket = client.bucket(bucket)

    blob = bucket.blob(object_name)
    blob.upload_from_filename(local_file)

def upload_directory_to_gcs(bucket_name, gcs_path, local_path):
    """
    Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
    :param bucket_name: GCS bucket name
    :param gcs_path: target path & file-name
    :param local_file: source path & directory-name
    :return:
    """
    # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed.
    # (Ref: https://github.com/googleapis/python-storage/issues/74)
    storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024  # 5 MB
    storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024  # 5 MB
    # End of Workaround
    
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    assert os.path.isdir(local_path)  # Works only with local_paths that are directory
    for local_file in glob(local_path + "/**", recursive=True):
        if os.path.isfile(local_file):
            remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :])
            blob = bucket.blob(remote_path)
            blob.upload_from_filename(local_file)


  if StrictVersion(sqlite3.sqlite_version) < StrictVersion(min_sqlite_version):


In [5]:
## Source Constants
DATA_SOURCE_ROOT = "../../dtc-capstone-data/slack-data" 
COURSE_CHANNEL = "course-data-engineering"
# TODO: maybe more channel like course-channel, ml-zoomcamp, etc.
WELCOME_CHANNEL = "welcome"
USERS_DATA = "users.json"
# book-of-the-week
# announcements-course-data-engineering
# shameless-content

In [6]:
## Target Constants
PROJECT_ID = os.environ.get("GCP_PROJECT_ID", 'dtc-capstone-344019')
BUCKET = os.environ.get("GCP_GCS_BUCKET", 'dtc_capstone_data-lake')

## Users

#### Read And Uplaod Raw Data

In [10]:
users = spark.read.json(f'{DATA_SOURCE_ROOT}/{USERS_DATA}') 

22/03/22 17:27:33 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


#### Cleanup Unnecessary Columns

In [11]:
columns_to_drop = ['profile', 'color', 'who_can_share_contact_card', 'team_id', '_corrupt_record']
users = users.drop(*columns_to_drop)

In [12]:
schema = types.StructType([
    types.StructField("deleted",types.BooleanType(),True),
    types.StructField("id",types.StringType(),True),
    types.StructField("is_admin",types.BooleanType(),True),
    types.StructField("is_app_user",types.BooleanType(),True),
    types.StructField("is_bot",types.BooleanType(),True),
    types.StructField("is_email_confirmed",types.BooleanType(),True),
    types.StructField("is_invited_user",types.BooleanType(),True),
    types.StructField("is_owner",types.BooleanType(),True),
    types.StructField("is_primary_owner",types.BooleanType(),True),
    types.StructField("is_restricted",types.BooleanType(),True),
    types.StructField("is_ultra_restricted",types.BooleanType(),True),
    types.StructField("name",types.StringType(),True),
    types.StructField("real_name",types.StringType(),True),
    types.StructField("tz",types.StringType(),True),
    types.StructField("tz_label",types.StringType(),True),
    types.StructField("tz_offset",types.IntegerType(),True),
    types.StructField("updated",types.LongType(),True)
    ]
)

#### Create Sub-Categories for User

In [14]:
def format_user_data(d):
    identity = ["name", "real_name"]
    location = ["tz","tz_label", "tz_offset"]
    status = ["deleted", "is_admin", "is_owner", "is_primary_owner",
              "is_restricted","is_ultra_restricted","is_bot", 
              "is_email_confirmed"]
    
    user = {}
    for key, value in {"identifiers": identity, "location":location, "status":status}.items():
        user[key]={}
        user[key]["id"] = d["id"]
        for v in value:
            if v in d:
                user[key][v]=d[v]
            else:
                user[key][v]="None"   
                
    return user

In [None]:
3. Upload Clean Data ..in 4 different versions.

In [None]:
4.Create 4 External Table from the Buckets and changed files

In [None]:
5.Partition Data and create new tables

In [None]:
# Partition Data
df.repartition(24) \
    .write.parquet('data/pq/fhvhv/', mode="overwrite")

## Course Data Engineering

In [7]:
# A0. Find all related files 
files = sorted(glob(f'{DATA_SOURCE_ROOT}/{COURSE_CHANNEL}/*.json'))

# A0. Analyse start and end of the data set
start, end = files[0].split("/")[-1].split(".")[0], files[-1].split("/")[-1].split(".")[0]
(start,end)

('2020-11-22', '2022-03-14')

In [7]:
# 1. Load the data into pyspark dataframe
messages_cde = spark.read.json(f'{DATA_SOURCE_ROOT}/{COURSE_CHANNEL}/*.json', multiLine=True) 
# messages_cde.printSchema()
print((messages_cde.count(), len(messages_cde.columns)))

22/03/26 21:55:16 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


(9136, 32)


In [8]:
# Write on Directory Structure
# messages_cde.write.option("header", "true").parquet(f"messages_raw.parquet") 

22/03/26 21:55:26 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
22/03/26 21:55:26 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
22/03/26 21:55:26 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
22/03/26 21:55:26 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
22/03/26 21:55:26 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
22/03/26 21:55:27 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
22/03/26 21:55:27 WARN MemoryManager: Total allocation exceeds 95,

In [17]:
interest_columns = [
    "client_msg_id",
    "parent_user_id",
    "reply_count",
    "subtype",
    "text",
    "thread_ts",
    "ts",
    "user",
]
messages_cde = messages_cde.select(interest_columns)

In [18]:
x=messages_cde.toPandas()#.to_csv(header=True, index=False)

Unnamed: 0,client_msg_id,parent_user_id,reply_count,subtype,text,thread_ts,ts,user
0,9e77c6e5-e288-4060-a6e6-3ec9185345c2,U02TMP4GJEM,,,"Hi <@U01AXE0P5M3>, I had a lot of problems whe...",1642995076.257900,1643012748.273300,U02TMP4GJEM
1,3bda75c8-bdf7-4794-aacd-530d430a75d8,U02B8U0QZEK,,,i dont even know where the 1:latest come from,1642984070.239900,1643012833.273500,U02B8U0QZEK
2,e4912f0a-1d64-4699-915b-8ef2cc9413ce,,,thread_broadcast,okay so terraform is still disturbing me ...wo...,1642422345.167200,1643012974.274000,U02RTJPV6TZ
3,e9dad671-17e3-4c8b-acf3-380573fab37a,,,thread_broadcast,google_bigquery_dataset.dataset: Creating...\n...,1642422345.167200,1643013000.274300,U02RTJPV6TZ
4,acb4f60a-26e0-4900-9c06-3a77df329487,U02T9550LTU,,,Unique submissions? (i.e. different emails),1642999704.262500,1643013190.274800,U02U809EAE7
...,...,...,...,...,...,...,...,...
9131,,,,channel_join,<@U01ENFKMTU5> has joined the channel,,1607784533.011300,U01ENFKMTU5
9132,,,,channel_join,<@U01GL9D4H7Z> has joined the channel,,1608114154.006500,U01GL9D4H7Z
9133,,,,channel_join,<@U01H4CB36HK> has joined the channel,,1608591242.019100,U01H4CB36HK
9134,,,,channel_join,<@U01HXRYSD09> has joined the channel,,1608776149.019700,U01HXRYSD09




In [57]:
# A1. Check all columns
sorted(all_messages.columns)

In [116]:
messages_cde.select('subtype').distinct().collect()

[Row(subtype='tombstone'),
 Row(subtype=None),
 Row(subtype='thread_broadcast'),
 Row(subtype='channel_join'),
 Row(subtype='channel_topic')]

In [118]:
messages_cde.select('type').distinct().collect()

[Row(type='message')]

In [117]:
messages_cde.select('topic').distinct().collect()

[Row(topic=None),
 Row(topic='<https://docs.google.com/document/d/1TI1co3aRcYYBB9keAMPgrnQd84juN-i8bZiiXW_1oUo/edit?usp=sharing>'),
 Row(topic='<https://github.com/DataTalksClub/data-engineering-zoomcamp>')]

In [469]:
from pyspark.sql.functions import col

# 3. Clean Unwanted subtypes
messages_cde = messages_cde.where((col("subtype").isNull()) | ((col("subtype") != "thread_broadcast") & (col("subtype") != "channel_join")))

In [125]:
# 4. Extract reactions into new dataframe

from pyspark.sql.functions import explode

reactions_df = messages_cde.where((col("reactions").isNotNull())) 
reactions_df = reactions_df.select(['client_msg_id','reactions'])
# reactions_df.select(['client_msg_id','reactions']).distinct().collect()
reactions_df = reactions_df.withColumn('reaction', explode('reactions')).drop("reactions")
# reactions_df.select(['client_msg_id','reaction']).head()
reactions_df = reactions_df.select(['client_msg_id',"reaction.name", "reaction.count", "reaction.users"])
reactions_df.select(['client_msg_id',"name", "count", "users"]).show()

+--------------------+---------------+-----+--------------------+
|       client_msg_id|           name|count|               users|
+--------------------+---------------+-----+--------------------+
|d751f16a-ca2a-403...|         scream|    2|[U01AXE0P5M3, U02...|
|780394d7-0647-499...|   raised_hands|    1|       [U02V24WAZRN]|
|425a4e1f-fd43-496...|   raised_hands|    1|       [U02U8FXSG1Y]|
|aa607d5d-55fe-45f...|             +1|    1|       [U02U34YJ8C8]|
|a72b3aa0-db7b-488...|             +1|    3|[U02U34YJ8C8, U02...|
|1ade0883-ba49-4d3...|   raised_hands|    1|       [U0308865C0H]|
|1ade0883-ba49-4d3...|             +1|    1|       [U0308865C0H]|
|29f12de9-4834-405...|             +1|    1|       [U02UY1QTGHW]|
|53f8696b-acd2-476...|             +1|    1|       [U02HFP7UTFB]|
|3a7e75e9-1603-4b5...|   raised_hands|    1|       [U02QBJYQFK9]|
|071697F4-5AA5-417...|             +1|    1|       [U02RHT0M3M5]|
|0F73517E-6509-47F...|             +1|    1|       [U0308MF3KUH]|
|760094a6-

In [126]:
# A2. Unique Reactions..
unique_reactions = reactions_df.select('name').distinct().rdd.flatMap(lambda x: x).collect()

In [483]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType

# 5 Clean Message from some unwanted characters
def clean_message_text(text):             
    user_pattern = re.compile(r'<@(.+?)>')
    link_pattern_text = re.compile(r'<(http.+?)\|(.+?)>')
    link_pattern = re.compile(r'<(http.+?)>')
    
    text = text.replace('\xa0', ' ').replace('•', '-').replace('\n\n', '\n').replace("'", '').replace("`", '')
    text = re.sub('\n', ' ', text) 
    text = user_pattern.sub("", text)
    text = link_pattern_text.sub("", text)
    text = link_pattern.sub("", text)
    return text

udf_clean_message_text = udf(lambda x:clean_message_text(x),StringType())

In [484]:
messages_cde = messages_cde.withColumn("text",udf_clean_message_text(col("text")))

In [486]:
messages_cde.select("text").head()

Row(text='hi , i had a lot of problems when trying to convert the .ipynb file to a py script. i copied exactly the code in the video and i kept getting the error no template sub-directory with name script can be found. after a bit of googling, i found that if i uninstalled nbconvert module and then reinstalled it with a lower version, the command would work. i was close to throwing my laptop out of the window by that point! obviously we all have different computers, os and python versions, so mentioning the module version would be very helpful!')

In [499]:
# 6 Convert epoch to human readable datetime
from pyspark.sql.functions import *



In [527]:
from pyspark.sql.types import IntegerType, DateType
df = messages_cde.withColumn("ts", messages_cde["ts"].cast(IntegerType()))
df = df.withColumn("ts",udf_epoch_2_datetime(col("ts")))
df=df.withColumn("ts",to_timestamp(col("ts")))

df.select(["ts"]).head(10)

                                                                                

[Row(ts=datetime.datetime(2022, 1, 24, 9, 25, 48)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 27, 13)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 29, 34)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 30)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 33, 10)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 34, 32)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 38, 52)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 42, 27)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 45, 13)),
 Row(ts=datetime.datetime(2022, 1, 24, 9, 47, 25))]

In [491]:
def epoch_2_datetime(epoch):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

udf_epoch_2_datetime = udf(lambda x: epoch_2_datetime(x),StringType())

In [498]:
time.strftime('%Y-%m-%d %H:%M:%S', time.localtime("1643012748.273300"))

TypeError: an integer is required (got type str)

In [439]:
epoch_2_datetime(None)

'2022-03-22 23:00:06'

In [489]:
# x=messages_cde.withColumn("ts_timestamp", (F.col("ts")/1000).cast("timestamp"))
# x.select(["ts", "ts_timestamp"]).show()
from pyspark.sql.functions import *

messages_cde = messages_cde.withColumn("x",udf_epoch_2_datetime(col("thread_ts")))
# messages_cde = messages_cde.withColumn("ts_",udf_epoch_2_datetime(col("ts")))

In [497]:
messages_cde.select(['ts']).printSchema()

root
 |-- ts: string (nullable = true)



In [None]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
messages_cde = messages_cde.withColumn('ts', f.date_format(df.epoch.cast(dataType=t.TimestampType()), "yyyy-MM-dd"))
messages_cde = messages_cde.withColumn('thread_ts', f.date_format(df.epoch.cast(dataType=t.TimestampType()), "yyyy-MM-dd"))

In [391]:
# 6. Split Root messages and Threaded messages
root_messages = messages_cde.where((col("parent_user_id").isNull()))
thread_replies = messages_cde.where((col("parent_user_id").isNotNull()))

print(f"All_messages  :{(messages_cde.count(), len(messages_cde.columns))}")
print(f"Root_messages :{(root_messages.count(), len(root_messages.columns))}")
print(f"Thread Replies:{(thread_replies.count(), len(thread_replies.columns))}")

All_messages  :(9087, 10)
Root_messages :(1862, 10)
Thread Replies:(7225, 10)


In [128]:
# A3.1 see which columns are all empty
from pyspark.sql import functions as F

def columns_not_in_use(df):
    nonNull_cols = [c for c in df.columns if df.filter(F.col(c).isNotNull()).count() > 0]
    null_cols = [c for c in df.columns if df.filter(F.col(c).isNotNull()).count() == 0]
    return (nonNull_cols, null_cols)

In [130]:
drop_columns_root_messages = columns_not_in_use(root_messages)[1]
drop_columns_thread_replies = columns_not_in_use(thread_replies)[1]
print(drop_columns_root_messages)
print("-----")
print(drop_columns_thread_replies)

['inviter', 'parent_user_id', 'root', 'x_files']
-----
['inviter', 'root', 'topic']


In [131]:
# A3.2 Define interested columns based on A1.analysis

interest_columns= ["client_msg_id", "parent_user_id", "reply_count", "subtype", "text",
                   "thread_ts","ts", "user"]

drop_columns = ["attachments","blocks", "display_as_bot", "edited", "files", "hidden", "inviter", "is_locked", 
                "last_read","latest_reply", "reactions", "replies", "reply_count", "reply_users",
                "reply_users_count", "root" "source_team", "subscribed", "team", "topic" , "type",
                "upload", "user_profile", "user_team", "x_files"]

len(interest_columns + drop_columns) # 30 
len(messages_cde.columns) #32

32

In [132]:
# 6. Drop unused columns  
messages_cde=messages_cde.drop(*drop_columns)
root_messages=root_messages.drop(*drop_columns)
thread_replies=thread_replies.drop(*drop_columns)

In [None]:
# Some text cleanup practice...

In [None]:
## Approach 1 Alexey ...
# things to clean up, code-blocks
# punctuations
# \n 

In [385]:
def clean_message_text(text): 
    # clean message text from usernames, links, etc
            
    user_pattern = re.compile(r'<@(.+?)>')
    link_pattern_text = re.compile(r'<(http.+?)\|(.+?)>')
    link_pattern = re.compile(r'<(http.+?)>')
    
    text = text.replace('\xa0', ' ').replace('•', '-').replace('\n\n', '\n').replace("'", '').replace("`", '')
    text = re.sub('\n', ' ', text) 
    text = user_pattern.sub("", text)
    text = link_pattern_text.sub("", text)
    text = link_pattern.sub("", text)    
    return text.strip()

In [386]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType
udf_clean_message_text = udf(lambda x:clean_message_text(x),StringType())

In [387]:
root_messages_cleaned1 = root_messages.withColumn("text",udf_clean_message_text(col("text")))
thread_replies_cleaned1 = thread_replies.withColumn("text",udf_clean_message_text(col("text")))

In [388]:
root_messages_cleaned1.select("text").head(10)

Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


[Row(text='Also , do you have to submit homework for all the 6 weeks to be eligible for the project stage ?'),
 Row(text='i am getting some error in SQL query tool, i am tring to find a specific location in the "zones" data set but i keep getting the ERROR:  column "borough" does not exist LINE 7: Borough=Queens heres an example that i tried SELECT  * FROM zones WHERE Borough=Queens'),
 Row(text='Hey all, I want to ask something about home work. So, besides yellow_taxi_data we need to import dataset about zones right. Do I need to create python script to ingest this dataset to the database or can I just import it using sql script or using import feature in pgadmin? Thanks:raised_hands:'),
 Row(text='Week 1: seems to me that the README.md is a little confusing, if i were to follow the steps documented there, as some steps are duplicated? understand there are multiple ways to achieve the same task, but perhaps they can be marked as alternatives?'),
 Row(text='Im having this error while t

In [None]:
## Approach 2 nltk

In [377]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()


def clean_message_text2(sentence):        
    sentence=str(sentence)
    sentence = sentence.lower()

    sentence = sentence.replace('\xa0', ' ').replace('•', '-').replace('\n\n', '\n')
    sentence = sentence.replace("'", ' ').replace("`", ' ')
    sentence = sentence.replace('{html}',"")
    
    sentence = re.sub('\n', '', sentence) 
    sentence = re.sub(r'http\S+', '',sentence) # remove url
    sentence = re.sub('[0-9]+', '', sentence) # 
    
    return sentence.strip()

def tokenize_sentence(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)  
    # Remove filterwords and remove the worlds that lenght less than 2
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    
    stemmer = PorterStemmer() 
    stem_words=[stemmer.stem(w) for w in filtered_words] # set root form of the word
    lemmatizer = WordNetLemmatizer()
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iremertuerk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/iremertuerk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/iremertuerk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [378]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType

udf_clean_message_text2 = udf(lambda x:clean_message_text2(x),StringType())

In [379]:
root_messages_cleaned2 = root_messages.withColumn("text",udf_clean_message_text2(col("text")))
thread_replies_cleaned2 = thread_replies.withColumn("text",udf_clean_message_text2(col("text")))

In [380]:
root_messages_cleaned2.select("text").head(10)

Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


[Row(text='also <@uaxepm>, do you have to submit homework for all the  weeks to be eligible for the project stage ?'),
 Row(text='i am getting some error in sql query tool, i am tring to find a specific location in the "zones" data set but i keep getting the error:  column "borough" does not exist  line : borough= queens  here s an example that i tried   select *fromzoneswhereborough= queens'),
 Row(text='hey all, i want to ask something about home work. so, besides  yellow_taxi_data  we need to import dataset about zones right. do i need to create python script to ingest this dataset to the database or can i just import it using sql script or using import feature in pgadmin? thanks:raised_hands:'),
 Row(text='week : seems to me that the readme.md is a little confusing, if i were to follow the steps documented there, as some steps are duplicated?understand there are multiple ways to achieve the same task, but perhaps they can be marked as alternatives?'),
 Row(text='i m having this err

In [None]:
## Approach 3 

In [267]:
import html
from bs4 import BeautifulSoup
import unicodedata

import re, string # Regular Expressions, String
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
# soup = BeautifulSoup(html_doc, 'html.parser')
# set of stopwords to be removed from text
stop = set(stopwords.words('english'))

# update stopwords to have punctuation too
stop.update(list(string.punctuation))

def clean_message_text3(text_list):
    
    # Remove unwanted html characters
    re1 = re.compile(r'  +')
    x1 = text_list.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
    'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
    '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
    ' @-@ ', '-').replace('\\', ' \\ ')
    text = re1.sub(' ', html.unescape(x1))
    
    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # strip html
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    
    # remove between square brackets
    text = re.sub('\[[^]]*\]', '', text)
    
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # remove twitter tags
    text = text.replace("@", "")
    
    # remove hashtags
    text = text.replace("#", "")
    
    # remove all non-alphabetic characters
    text = re.sub(r'[^a-zA-Z ]', '', text)
    
    # remove stopwords from text
    final_text = []
    for word in text.split():
        if word.strip().lower() not in stop:
            final_text.append(word.strip().lower())
    
    text = " ".join(final_text)
    
    # lemmatize words
    lemmatizer = WordNetLemmatizer()    
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    text = " ".join([lemmatizer.lemmatize(word, pos = 'v') for word in text.split()])
    
    # replace all numbers with "num"
    text = re.sub("\d", "num", text)
    
    return text.lower()

In [268]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType
udf_clean_message_text3 = udf(lambda x:clean_message_text3(x),StringType())

In [269]:
root_messages_cleaned3 = root_messages.withColumn("text",udf_clean_message_text3(col("text")))
thread_replies_cleaned3 = thread_replies.withColumn("text",udf_clean_message_text3(col("text")))

In [195]:
root_messages_cleaned3.select("text").head(10)

                                                                                

[Row(text='also uaxepm submit homework week eligible project stage'),
 Row(text='get error sql query tool tring find specific location zone data set keep get theerror column borough existline boroughqueensheres example triedselect fromzoneswhereboroughqueens'),
 Row(text='hey want ask something home work besides yellowtaxidata need import dataset zone right need create python script ingest dataset database import use sql script use import feature pgadmin thanksraisedhands'),
 Row(text='week seem readmemd little confuse follow step document step duplicatedunderstand multiple way achieve task perhaps mark alternative'),
 Row(text='im error try run data ingestion scripttraceback recent call last file appingestdatapy line mainargs file appingestdatapy line main df nextdfiter file usrlocallibpythonsitepackagespandasioparsersreaderspy line next return selfgetchunk file usrlocallibpythonsitepackagespandasioparsersreaderspy line getchunk return selfreadnrowssize file usrlocallibpythonsitepacka

In [27]:
# Group 2:
def epoch_2_datetime(epoch):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch))


def clean_message_text(text):
    user_pattern = re.compile(r"<@(.+?)>")
    link_pattern_text = re.compile(r"<(http.+?)\|(.+?)>")
    link_pattern = re.compile(r"<(http.+?)>")

    text = (
        text.replace("\xa0", " ")
        .replace("•", "-")
        .replace("\n\n", "\n")
        .replace("'", "")
        .replace("`", "")
    )
    text = re.sub("\n", " ", text)
    text = user_pattern.sub("", text)
    text = link_pattern_text.sub("", text)
    text = link_pattern.sub("", text)
    return text.strip()


udf_epoch_2_datetime = udf(lambda x: epoch_2_datetime(x), types.StringType())
udf_clean_message_text = udf(lambda x: clean_message_text(x), types.StringType())


In [28]:
from pyspark.sql.functions import col, explode, to_timestamp, udf

def transform_message_data(spark_session: SparkSession, prefix):
    
    # transform 1: all_message_data for given year-month
    message_data = spark_session.read.json(f'{DATA_SOURCE_ROOT}/{COURSE_CHANNEL}/{prefix}-*.json', multiLine=True)
    
    # transform 2: clean unwanted subtype (thread_broadcast, channel_join)
    # this step cause, all values for 'inviter' and 'root' become null
    message_data = message_data.where((col("subtype").isNull()) | ((col("subtype") != "thread_broadcast") & (col("subtype") != "channel_join")))
    
    # transform 3: cleanup the text column in messages.
    # some logic should come here ..
    udf_clean_message_text = udf(lambda x:clean_message_text(x),types.StringType())
    message_data = message_data.withColumn("text",udf_clean_message_text(col("text")))
    
    message_data = message_data.select(*interest_columns)
    return message_data.toPandas()

#     # transform 4: split messages in : root_level and thread_level messages
#     root_level_messages = message_data.where((col("parent_user_id").isNull()))
#     thread_level_messages = message_data.where((col("parent_user_id").isNotNull()))

#     # transform 5:  drop unrelevant columns from root_level and thread_level messages
#     root_level_messages = root_level_messages.select(*interest_columns)
#     thread_level_messages = thread_level_messages.select(*interest_columns)
#     return root_level_messages,thread_level_messages 

In [29]:
x = transform_message_data(spark, "2020-11")

                                                                                

In [30]:
x

Unnamed: 0,client_msg_id,parent_user_id,reply_count,subtype,text,thread_ts,ts,user
0,c750d7d6-2e19-4604-ba5d-b9bcabaa1dfc,,,,Hello everyone! Lets talk about our idea to cr...,,1606078694.001300,U01AXE0P5M3
1,,,,channel_topic,set the channel topic:,,1606078707.001500,U01AXE0P5M3
2,9d6c86dc-3215-4a80-a03d-2a603ceddd7a,,,,I think we should figure out - who the course ...,,1606078852.004300,U01AXE0P5M3
3,457cc8d2-5ba1-4db3-b53f-484e89873f00,,,,Id say software engineers and data scientists/...,,1606078935.005900,U01AXE0P5M3
4,ecf66f14-456f-46fb-94be-5d8c7f1c284b,,1.0,,this is a good starting point for the list of ...,1606078995.006400,1606078995.006400,U01AXE0P5M3
...,...,...,...,...,...,...,...,...
71,37f10cab-6d45-4591-9a5e-5fd950cdfffe,U01AXE0P5M3,,,sounds reasonable!,1606238914.012900,1606292650.000100,U01F78474M9
72,a8caaafa-e9c2-45c9-974c-2e4b6290c5e6,,2.0,,what do you think guys about making a case stu...,1606292801.002700,1606292801.002700,U01F78474M9
73,630c454e-65e5-4873-9434-3913e0c34411,,,,Thats an awesome idea!,,1606295686.003300,U01AXE0P5M3
74,c89ede73-0b73-4a91-9c3c-02f2eedfa5d0,U01F78474M9,,,Very interesting! Like an RPG. Heres more to a...,1606292801.002700,1606304440.003700,U01DHB2HS3X


# dag logic

In [25]:
message_schema = types.StructType([
    types.StructField("client_msg_id",types.StringType(),False),
    types.StructField("parent_user_id",types.StringType(),True),
    types.StructField("text",types.StringType(),True), #-> 
    types.StructField("type",types.StringType(),True),
    types.StructField("subtype",types.StringType(),True),
    types.StructField("user",types.StringType(),True), #-> id and user fk.
    types.StructField("ts",types.StringType(),True), #-> epoch to human readable format
    types.StructField("thread_ts",types.StringType(),True),
    types.StructField("reply_count",types.IntegerType(),True),
    types.StructField("reactions",types.ArrayType(types.StructType([
        types.StructField("count",types.LongType(),True),
        types.StructField("name" ,types.StringType(),True),
        types.StructField("users" ,types.ArrayType(types.StringType(),True),True)
        ])
    ),True),
    ]
)

In [26]:
message_data = spark.read.schema(message_schema) \
        .json(f'{DATA_SOURCE_ROOT}/{COURSE_CHANNEL}/2020-11-*.json', multiLine=True)

In [96]:
# message_data = spark.read.json(f'{DATA_SOURCE_ROOT}/{COURSE_CHANNEL}/2020-11-*.json', multiLine=True)

In [27]:
message_data.printSchema()

root
 |-- client_msg_id: string (nullable = true)
 |-- parent_user_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- subtype: string (nullable = true)
 |-- user: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- thread_ts: string (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- reactions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- users: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [24]:
message_data.head(10)

[Row(client_msg_id=None, parent_user_id=None, text=None, type=None, subtype=None, user=None, ts=None, thread_ts=None, reply_count=None, reactions=None),
 Row(client_msg_id=None, parent_user_id=None, text=None, type=None, subtype=None, user=None, ts=None, thread_ts=None, reply_count=None, reactions=None),
 Row(client_msg_id=None, parent_user_id=None, text=None, type=None, subtype=None, user=None, ts=None, thread_ts=None, reply_count=None, reactions=None),
 Row(client_msg_id=None, parent_user_id=None, text=None, type=None, subtype=None, user=None, ts=None, thread_ts=None, reply_count=None, reactions=None)]

In [28]:
from pyspark.sql.functions import explode,col
message_data = message_data.where((col("subtype").isNull()) | ((col("subtype") != "thread_broadcast") & (col("subtype") != "channel_join")))

In [69]:
from pyspark.sql.functions import col, explode, lit
message_data = message_data.withColumn("channel_name", lit(COURSE_CHANNEL))

In [72]:
def extract_reactions_data(df):
    reactions_df = df.where((col("reactions").isNotNull()))
    reactions_df = reactions_df.select(["client_msg_id", "user", "channel_name", "reactions"])
    reactions_df = reactions_df.withColumnRenamed("user", "msg_owner")

    reactions_df = reactions_df.withColumn("reaction", explode("reactions")).drop(
        "reactions"
    )
    reactions_df = reactions_df.select(
        ["client_msg_id", "msg_owner", "reaction.name", "reaction.count", "reaction.users", "channel_name"]
    )
    reactions_df = reactions_df.withColumn("msg_reactor", explode("users"))
    drop_cols = ["users", "count"]
    reactions_df = reactions_df.drop(*drop_cols)

    return reactions_df

In [47]:
# message_data.select("reactions").head(11)

In [73]:
reactions_df = extract_reactions_data(message_data)

In [74]:
reactions_df.columns

['client_msg_id', 'msg_owner', 'name', 'channel_name', 'msg_reactor']

In [75]:
reactions_df.show()

+--------------------+-----------+----------------+--------------------+-----------+
|       client_msg_id|  msg_owner|            name|        channel_name|msg_reactor|
+--------------------+-----------+----------------+--------------------+-----------+
|c750d7d6-2e19-460...|U01AXE0P5M3|            wave|course-data-engin...|U01DHB2HS3X|
|9d6c86dc-3215-4a8...|U01AXE0P5M3|              +1|course-data-engin...|U01F78474M9|
|9d6c86dc-3215-4a8...|U01AXE0P5M3|              +1|course-data-engin...|U01DHB2HS3X|
|93eba8e0-4df8-410...|U01AXE0P5M3|              +1|course-data-engin...|U01DHB2HS3X|
|28d13ad5-44e7-4b2...|U01AXE0P5M3|    raised_hands|course-data-engin...|U01TKRLE998|
|fdbfaa9f-aad5-499...|U01F78474M9|            eyes|course-data-engin...|U02THT8U1GC|
|f52ad9b4-c05d-43e...|U01F78474M9|              +1|course-data-engin...|U01DFQ82AK1|
|f52ad9b4-c05d-43e...|U01F78474M9|              +1|course-data-engin...|U01AXE0P5M3|
|f52ad9b4-c05d-43e...|U01F78474M9|              +1|course-data-en

In [56]:
reactions_df = reactions_df.withColumn("reaction", explode("reactions")).drop("reactions")
reactions_df.show()

+--------------------+-----------+--------------------+
|       client_msg_id|  msg_owner|            reaction|
+--------------------+-----------+--------------------+
|c750d7d6-2e19-460...|U01AXE0P5M3|{1, wave, [U01DHB...|
|9d6c86dc-3215-4a8...|U01AXE0P5M3|{2, +1, [U01F7847...|
|93eba8e0-4df8-410...|U01AXE0P5M3|{1, +1, [U01DHB2H...|
|28d13ad5-44e7-4b2...|U01AXE0P5M3|{1, raised_hands,...|
|fdbfaa9f-aad5-499...|U01F78474M9|{1, eyes, [U02THT...|
|f52ad9b4-c05d-43e...|U01F78474M9|{3, +1, [U01DFQ82...|
|54c86ff1-c3c2-40a...|U01AXE0P5M3|{1, white_check_m...|
|f64812b2-4ae1-470...|U01AXE0P5M3|{1, +1, [U01DHB2H...|
|8babafe8-64ed-4a5...|U01DFQ82AK1|{2, +1, [U01DHB2H...|
|e52af1da-9435-4ae...|U01AXE0P5M3|{1, pray, [U01DFQ...|
|a8caaafa-e9c2-45c...|U01F78474M9|{3, +1, [U01DFQ82...|
|e6943aa3-ee92-404...|U01F78474M9|{1, squirrel, [U0...|
+--------------------+-----------+--------------------+



+--------------------+-----------+--------------------+
|       client_msg_id|  msg_owner|            reaction|
+--------------------+-----------+--------------------+
|c750d7d6-2e19-460...|U01AXE0P5M3|{1, wave, [U01DHB...|
|9d6c86dc-3215-4a8...|U01AXE0P5M3|{2, +1, [U01F7847...|
|93eba8e0-4df8-410...|U01AXE0P5M3|{1, +1, [U01DHB2H...|
|28d13ad5-44e7-4b2...|U01AXE0P5M3|{1, raised_hands,...|
|fdbfaa9f-aad5-499...|U01F78474M9|{1, eyes, [U02THT...|
|f52ad9b4-c05d-43e...|U01F78474M9|{3, +1, [U01DFQ82...|
|54c86ff1-c3c2-40a...|U01AXE0P5M3|{1, white_check_m...|
|f64812b2-4ae1-470...|U01AXE0P5M3|{1, +1, [U01DHB2H...|
|8babafe8-64ed-4a5...|U01DFQ82AK1|{2, +1, [U01DHB2H...|
|e52af1da-9435-4ae...|U01AXE0P5M3|{1, pray, [U01DFQ...|
|a8caaafa-e9c2-45c...|U01F78474M9|{3, +1, [U01DFQ82...|
|e6943aa3-ee92-404...|U01F78474M9|{1, squirrel, [U0...|
+--------------------+-----------+--------------------+



In [16]:
message_data.head(3)

[Row(client_msg_id='222e70f4-84dd-4d1a-960c-c54f49ed5624', parent_user_id='U01L9SP7UBT', text='At some point! Now I hope at least to finish this course =)', type='message', subtype=None, user='U01AXE0P5M3', ts='1626288400.022700', thread_ts='1625048497.019000', reply_count=None, reactions=None)]

In [17]:
message_data = message_data.toPandas() 

In [18]:
message_data.head(3)

Unnamed: 0,client_msg_id,parent_user_id,text,type,subtype,user,ts,thread_ts,reply_count,reactions
0,222e70f4-84dd-4d1a-960c-c54f49ed5624,U01L9SP7UBT,At some point! Now I hope at least to finish t...,message,,U01AXE0P5M3,1626288400.0227,1625048497.019,,


In [19]:
message_data.info
message_data.dtypes

client_msg_id      object
parent_user_id     object
text               object
type               object
subtype            object
user               object
ts                 object
thread_ts          object
reply_count       float64
reactions          object
dtype: object

In [20]:
def clean_message_text(text):
    user_pattern = re.compile(r"<@(.+?)>")
    link_pattern_text = re.compile(r"<(http.+?)\|(.+?)>")
    link_pattern = re.compile(r"<(http.+?)>")

    text = (
        text.replace("\xa0", " ")
        .replace("•", "-")
        .replace("\n\n", "\n")
        .replace("'", "")
        .replace("`", "")
    )
    text = re.sub("\n", " ", text)
    text = user_pattern.sub("", text)
    text = link_pattern_text.sub("", text)
    text = link_pattern.sub("", text)
    return text.strip()

In [21]:
message_data['text'] = message_data['text'].apply(lambda x: clean_message_text(x))

In [115]:
# drop_cols = ['attachments', 'blocks', 'edited', 'inviter','is_locked', 'last_read', 'latest_reply', 'reactions', 
#              'replies', 'reply_users' , 'reply_users_count', 'root','source_team', 'subscribed', 
#              'team','topic','type', 'user_profile',  'user_team'
#             ]


# interest_col = ['client_msg_id', 'parent_user_id', 'reply_count', "subtype", "text", "thread_ts", "ts", "user" ]

# len(message_data.columns) # 27

# # len(drop_cols + interest_col) #29

27

In [117]:
# message_data= message_data[interest_col]

In [22]:
len(message_data.columns)

10

In [26]:
thread_replies = message_data[message_data.parent_user_id.notnull()]
root_messages = message_data[message_data.parent_user_id.isnull()]

In [27]:
thread_replies.head(3)

Unnamed: 0,client_msg_id,parent_user_id,text,type,subtype,user,ts,thread_ts,reply_count,reactions
0,222e70f4-84dd-4d1a-960c-c54f49ed5624,U01L9SP7UBT,At some point! Now I hope at least to finish t...,message,,U01AXE0P5M3,1626288400.0227,1625048497.019,,


In [28]:
import time
def epoch_2_datetime(epoch):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch))

In [29]:
thread_replies = thread_replies.astype({"ts": float , "thread_ts": float})

thread_replies['ts'] = thread_replies['ts'].apply(lambda x: epoch_2_datetime(x))
thread_replies['thread_ts'] = thread_replies['thread_ts'].apply(lambda x: epoch_2_datetime(x))

In [30]:
root_messages = root_messages.astype({"ts": float , "thread_ts": float})

root_messages['ts'] = root_messages['ts'].apply(lambda x: epoch_2_datetime(x))
# root_messages['thread_ts'] = root_messages['thread_ts'].apply(lambda x: epoch_2_datetime(x)) ## hmm thread_ts causing issue

In [31]:
thread_replies.head(10)

Unnamed: 0,client_msg_id,parent_user_id,text,type,subtype,user,ts,thread_ts,reply_count,reactions
0,222e70f4-84dd-4d1a-960c-c54f49ed5624,U01L9SP7UBT,At some point! Now I hope at least to finish t...,message,,U01AXE0P5M3,2021-07-14 20:46:40,2021-06-30 12:21:37,,


In [32]:
root_messages.head(10)

Unnamed: 0,client_msg_id,parent_user_id,text,type,subtype,user,ts,thread_ts,reply_count,reactions
