In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

from datetime import datetime

In [2]:
files = glob.glob('/raid/recsys2021_pre/*')

In [3]:
files

['/raid/recsys2021_pre/part-00118.parquet',
 '/raid/recsys2021_pre/part-00101.parquet',
 '/raid/recsys2021_pre/part-00249.parquet',
 '/raid/recsys2021_pre/part-00158.parquet',
 '/raid/recsys2021_pre/part-00037.parquet',
 '/raid/recsys2021_pre/part-00239.parquet',
 '/raid/recsys2021_pre/part-00115.parquet',
 '/raid/recsys2021_pre/part-00100.parquet',
 '/raid/recsys2021_pre/part-00014.parquet',
 '/raid/recsys2021_pre/part-00144.parquet',
 '/raid/recsys2021_pre/part-00206.parquet',
 '/raid/recsys2021_pre/part-00201.parquet',
 '/raid/recsys2021_pre/part-00041.parquet',
 '/raid/recsys2021_pre/part-00055.parquet',
 '/raid/recsys2021_pre/part-00017.parquet',
 '/raid/recsys2021_pre/part-00221.parquet',
 '/raid/recsys2021_pre/part-00236.parquet',
 '/raid/recsys2021_pre/part-00085.parquet',
 '/raid/recsys2021_pre/part-00046.parquet',
 '/raid/recsys2021_pre/part-00204.parquet',
 '/raid/recsys2021_pre/part-00065.parquet',
 '/raid/recsys2021_pre/part-00096.parquet',
 '/raid/recsys2021_pre/part-0000

In [4]:
import os, time
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"

In [5]:
import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import subprocess

In [6]:
cluster = LocalCUDACluster(local_directory='/raid/dask3/', device_memory_limit=0.5)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36389  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 429.50 GB


In [7]:
ddf = dask_cudf.read_parquet(files, columns=['tweet_id', 'timestamp'])

In [8]:
meta = ('time', 'datetime64[s]')

def convert_timestamp(x):
    x = cudf.to_datetime(x, unit='s')
    return(x)

ddf['date'] = ddf['timestamp'].map_partitions(convert_timestamp, meta=meta)

In [9]:
VALID_DOW = '2021-02-18'
train = ddf[ddf['date']>=cudf.to_datetime(VALID_DOW)].reset_index(drop=True)

In [10]:
tweet_id = train['tweet_id'].drop_duplicates()

In [11]:
tweet_id.sample(0.1).reset_index()[['tweet_id']].to_parquet('/raid/valid_tweets.parquet')



In [12]:
client.close()

<bound method Client.close of <Client: 'tcp://127.0.0.1:36389' processes=8 threads=8, memory=429.50 GB>>

In [2]:
import pandas as pd
import numpy as np

import glob
import gc

from joblib import Parallel, delayed

from tqdm.notebook import tqdm

In [3]:
max_len = 48
dftweets_ids = pd.read_parquet('/raid/valid_tweets.parquet')

In [4]:
!rm -r /raid/recsys2021_pre_1_TE
!rm -r /raid/recsys2021_pre_2_TE
!rm -r /raid/recsys2021_pre_3_TE
!rm -r /raid/recsys2021_pre_3_valid_TE

!mkdir /raid/recsys2021_pre_1_TE
!mkdir /raid/recsys2021_pre_2_TE
!mkdir /raid/recsys2021_pre_3_TE
!mkdir /raid/recsys2021_pre_3_valid_TE

In [5]:
files = sorted(glob.glob('/raid/recsys2021_pre/*'))[150:]

In [6]:
TE_files = sorted(glob.glob('/raid/TE/*.parquet'))
TE_files

['/raid/TE/a_user_id.parquet',
 '/raid/TE/b_is_verified_tweet_type.parquet',
 '/raid/TE/b_user_id.parquet',
 '/raid/TE/b_user_id_a_user_id.parquet',
 '/raid/TE/b_user_id_tweet_type_language.parquet',
 '/raid/TE/domains_language_b_follows_a_tweet_type_media_a_is_verified.parquet',
 '/raid/TE/media_tweet_type_language.parquet',
 '/raid/TE/media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a.parquet',
 '/raid/TE/tw_original_user0_tweet_type_language.parquet',
 '/raid/TE/tw_original_user1_tweet_type_language.parquet',
 '/raid/TE/tweet_type.parquet']

In [7]:
te_mapping =[]

In [8]:
TE_files = [
    '/raid/TE/a_user_id.parquet',
    '/raid/TE/b_is_verified_tweet_type.parquet',
    '/raid/TE/b_user_id.parquet',
    '/raid/TE/b_user_id_a_user_id.parquet',
    '/raid/TE/b_user_id_tweet_type_language.parquet',
    '/raid/TE/domains_language_b_follows_a_tweet_type_media_a_is_verified.parquet',
    '/raid/TE/media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a.parquet',
    '/raid/TE/tw_original_user0_tweet_type_language.parquet'
]

In [9]:
for file in TE_files:
    te_mapping.append(pd.read_parquet(file))

In [10]:
gc.collect()

60

In [11]:
psmooth=20
means = {}
means['reply'] = 0.02846728456689906
means['like'] = 0.3968895210408169
means['retweet'] = 0.08769760903336701
means['retweet_comment'] = 0.006918407917391091

In [12]:
def add_TE(df):
    for i, df_tmp in enumerate(te_mapping):
        col = [x for x in df_tmp.columns if not('reply' in x or ('retweet' in x and 'tw_len_retweet' not in x) or '_retweet_comment' in x or 'like' in x)]
        col_rest = [x for x in df_tmp.columns if x not in col]
        df = df.merge(df_tmp, on=col, how='left')
        for key in means.keys():
            if df_tmp.shape[0]>1000:
                df['TE_' + '_'.join(col) + '_' + key] = (((df[key + '_sum']-df[key]*df['is_train'])+means[key]*psmooth)/(df['reply_count']-df['is_train']+psmooth))
            else:
                df['TE_' + '_'.join(col) + '_' + key] = (((df[key + '_sum'])+means[key]*psmooth)/(df['reply_count']+psmooth))
            if col[0]=='a_user_id' and key=='like':
                df.loc[df['reply_count']<=1000, 'TE_' + '_'.join(col) + '_' + key] = None
            df['TE_' + '_'.join(col) + '_' + key] = df['TE_' + '_'.join(col) + '_' + key].fillna(means[key]).round(3)
        df.drop(col_rest, inplace=True, axis=1)
        gc.collect()
    return(df)

In [13]:
def split_week(fn):
    df = pd.read_parquet(fn)
    df2 = pd.read_parquet('/raid/recsys2021_token/' + fn.split('/')[-1])
    dftweets = df2['text_tokens'].str.split('\t').apply(lambda x: [int(y) for y in x[:max_len]] + [0]*max(0,(max_len-len(x))))
    df['text_tokens'] = dftweets.copy()
    #df = pd.concat([df, dftweets], axis=1)
    df['is_train'] = 1
    df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    df['dt_dow']  = df['date'].dt.weekday
    df['dt_hour'] = df['date'].dt.hour
    df['dt_minute'] = df['date'].dt.minute
    df['dt_second'] = df['date'].dt.second
    df = add_TE(df)
    dftmp = df[df['date']<pd.to_datetime('2021-02-11')]
    dftmp.drop('date', inplace=True, axis=1)
    dftmp.to_parquet('/raid/recsys2021_pre_1_TE/' + fn.split('/')[-1])
    dftmp = df[(df['date']>=pd.to_datetime('2021-02-11'))&(df['date']<pd.to_datetime('2021-02-18'))]
    dftmp.drop('date', inplace=True, axis=1)
    dftmp.to_parquet('/raid/recsys2021_pre_2_TE/' + fn.split('/')[-1])
    dftmp = df[df['date']>=pd.to_datetime('2021-02-18')]
    dftmp.drop('date', inplace=True, axis=1)
    dftmp[~(dftmp['tweet_id'].isin(dftweets_ids['tweet_id']))].to_parquet('/raid/recsys2021_pre_3_TE/' + fn.split('/')[-1])
    dftmp2 = dftmp[(dftmp['tweet_id'].isin(dftweets_ids['tweet_id']))].copy()
    dftmp2['is_train'] = 0
    dftmp2 = add_TE(dftmp2)
    dftmp2.to_parquet('/raid/recsys2021_pre_3_valid_TE/' + fn.split('/')[-1])
    del df, df2, dftmp2, dftweets, dftmp
    gc.collect()
    return

In [14]:
# %%time

# split_week(files[0])

In [15]:
%%time

res = Parallel(n_jobs=6,backend='multiprocessing')(delayed(split_week)(fn) for fn in tqdm(files))

  0%|          | 0/103 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


CPU times: user 561 ms, sys: 4.23 s, total: 4.79 s
Wall time: 2h 10min 4s
