In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
PYSPARK_PYTHON = "/opt/anaconda3/envs/rec_sys/bin/python"

# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from offline import SparkSessionBase
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
import gc

In [2]:
class OriginArticleData(SparkSessionBase):


    SPARK_APP_NAME = "mergeArticle"
    SPARK_URL = "yarn"
    
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
@udf("string" ,StringType)
def strip_n_udf(text):
    if text:
        processed_text = text.replace('\n', '').replace('\r', '')
    else:
        processed_text = "unknow"
    return processed_text

In [5]:
oa.spark.sql("use hbase_to_hive")

sql = """select m_id as article_id , context as content, platform as channel_name, title,our_tags as channel_tags ,descr as article_describe ,source, source_tags,time as article_time from news_info_update"""

basic_content = oa.spark.sql(sql)

In [6]:
basic_content.fillna("unknow")
basic_content = basic_content.withColumn('content', strip_n_udf(basic_content.content))
basic_content = basic_content.withColumn('article_describe', strip_n_udf(basic_content.article_describe))

In [7]:
sentence_df = basic_content.select("article_id","channel_name","article_describe", "channel_tags","title","content","source","source_tags", F.concat_ws(
                                             ",",
                                             basic_content.source_tags,
                                             basic_content.channel_name,
                                             basic_content.title,
                                             basic_content.content
                                           ).alias("sentence"), "article_time")

In [8]:
oa.spark.sql("use article")
sentence_df.write.insertInto("article_data")

In [9]:
oa.spark.sql("select article_id,channel_name,channel_tags,source_tags,article_time  from article_data limit 20").show()

+--------------------+------------+--------------------+-----------------------------+-------------------+
|          article_id|channel_name|        channel_tags|                  source_tags|       article_time|
+--------------------+------------+--------------------+-----------------------------+-------------------+
|00000982fda1948f4...|      投资界|[ { "id" : 7 , "n...|    [ "汽车" , "新能源" , ...|2018-04-23 18:53:00|
|000039bca765fca66...|      猎云网|[ { "id" : 22 , "...|  [ "罗永浩" , "锤子科技" ...|2019-04-28 08:46:03|
|0000b6f5182e097e8...|      砍柴网|[ { "id" : 7 , "n...|   [ "特斯拉" , "二手车业务"]|2018-11-20 15:18:00|
|0000bca75b0386b06...|      希鸥网|[ { "id" : 18 , "...|                         null|2018-04-08 14:58:00|
|0000f7b314fe7970d...|        36氪|[ { "id" : 19 , "...|                         null|2019-02-02 18:41:44|
|00019c1868a4ae670...|      投资界|[ { "id" : 14 , "...|  [ "雅居乐" , "消费升级" ...|2018-08-14 13:42:00|
|0001a643f8a623a0c...|      希鸥网|[ { "id" : 6 , "n...|                         nu

In [10]:
del sentence_df
del basic_content

In [11]:
gc.collect()

255

In [12]:
oa.spark.stop()

In [13]:
from datetime import datetime, timedelta

In [101]:
oa.spark.sql("use hbase_to_hive")
_yester = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
start = datetime.strftime(_yester+timedelta(days=-1, hours=0, minutes=0), "%Y-%m-%d %H:%M:%S")
end = datetime.strftime(_yester, "%Y-%m-%d %H:%M:%S")



sql = """select m_id as article_id , context as content, platform as channel_name, title,our_tags as channel_tags ,descr as article_describe ,source, source_tags 
         from news_info_update 
         where time >= '{}' and time < '{}'""".format(start, end)

In [104]:
oa.spark.sql(sql).show()

+--------------------+-------------------------------------+------------+-------------------------------------+--------------------+-------------------------------------+--------------------+----------------------------+
|          article_id|                              content|channel_name|                                title|        channel_tags|                     article_describe|              source|                 source_tags|
+--------------------+-------------------------------------+------------+-------------------------------------+--------------------+-------------------------------------+--------------------+----------------------------+
|010e71e3fad28cd23...|   12月5日，华为在武汉光谷国际网球...|      IT之家|      华为何刚nova6系列发布会公布2...|[ { "id" : 14 , "...|                                 null|                null|                        null|
|0121d6f0a61861fe0...|社交领域一直是互联网公司竞争的一个...|      IT之家|   拥有微信、QQ 两大平台，腾讯为何...|[ { "id" : 6 , "n...|                                 null|      