In [None]:
from datetime import datetime, timedelta
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import hour, mean
import pyspark
# Get the active SparkContext, creating one if none exists yet.
# NOTE(review): `sc` is not referenced again in the cells shown here; later cells go through `spark`.
sc = SparkContext.getOrCreate()

In [98]:
# Build (or reuse) a Spark session and load the auctions CSV, taking column names from its header row.
spark = pyspark.sql.SparkSession.builder.appName("auctions").getOrCreate()
df_auctions = spark.read.csv('auctions.csv', header=True)
# Row-oriented RDD view of the same data; the following cells work on this RDD.
rdd_auctions = df_auctions.rdd
# Columns (as listed by the original author): AuctionTypeId, country, date, deviceId, platform, refType, sourceId
# NOTE(review): a later cell reads `x.device_id` and `x.date`, so the real header names may be snake_case — verify against the file.

In [99]:
# Load the events CSV, taking column names from its header row.
# NOTE(review): getOrCreate() presumably returns the session already built in an earlier cell,
# so the "events" app name may not take effect — confirm.
spark = pyspark.sql.SparkSession.builder.appName("events").getOrCreate()
df_events = spark.read.csv('events.csv', header=True)
# Row-oriented RDD view; not used again in the cells shown here.
rdd_events = df_events.rdd
# Columns (as listed by the original author; `deviceOs` appears twice in that list — verify against the file):
#date, eventId, refType, refHash, applicationId, attributed, deviceCountryCode, deviceOs, deviceBrand, deviceModel, deviceCity, sessionUserAgent, transId, userAgent, eventUuid, carrier, kind, deviceOs, wifi, connectionType, ipAddress, deviceLanguage

In [100]:
# Load the installs CSV, taking column names from its header row.
# NOTE(review): getOrCreate() presumably returns the session already built in an earlier cell,
# so the "installs" app name may not take effect — confirm.
spark = pyspark.sql.SparkSession.builder.appName("installs").getOrCreate()
df_installs = spark.read.csv('installs.csv', header=True)
# Row-oriented RDD view; not used again in the cells shown here.
rdd_installs = df_installs.rdd
# Columns (as listed by the original author — verify against the file header):
#created, applicationId, refType, refHash, clickHash, attributed, implicit, deviceCountryCode, deviceBrand, deviceModel, sessionUserAgent, userAgent, eventUuid, kind, wifi, transId, ipAddress, deviceLanguage

In [None]:
# Reduce every auction row to a (device_id, timestamp) pair.
# The source is df_auctions.rdd rather than rdd_auctions itself, so re-running this cell
# does not try to map already-converted tuples (which have no `device_id` attribute).
# `date` is parsed from text of the form "YYYY-MM-DD HH:MM:SS.ffffff".
rdd_auctions = df_auctions.rdd.map(
    lambda row: (row.device_id, datetime.strptime(row.date, "%Y-%m-%d %H:%M:%S.%f"))
)

In [102]:
# Training window: keep only the first 3 days so that the largest possible gap between
# two appearances of a device is 3 days. Per the original note the data starts at
# 2019-04-18 00:00:00, so the last training instant is 2019-04-20 23:59:59.999999.
# The final datetime() argument is MICROseconds: 999999 is the last instant of that second
# (999 would be only 0.000999 s into it and would drop almost a full second of records).
limit_date_train = datetime(2019, 4, 20, 23, 59, 59, 999999)
# Test window: the following 3 days.
limit_date_test_begin = datetime(2019, 4, 21, 0, 0, 0, 0)
limit_date_test_end = datetime(2019, 4, 23, 23, 59, 59, 999999)

In [103]:
# Split the (device_id, timestamp) pairs into the train window and the test window.
# Both bounds are inclusive so that a record falling exactly on a limit
# (e.g. midnight at the start of the test window) is not silently left out of both sets.
rdd_train = rdd_auctions.filter(lambda x: x[1] <= limit_date_train)
rdd_test = rdd_auctions.filter(lambda x: limit_date_test_begin <= x[1] <= limit_date_test_end)
# Peek at a few test records.
rdd_test.take(5)

[('2564673204772915246', datetime.datetime(2019, 4, 23, 18, 58, 0, 842116)),
 ('4441121667607578179', datetime.datetime(2019, 4, 23, 18, 58, 1, 530771)),
 ('7721769811471055264', datetime.datetime(2019, 4, 23, 18, 58, 1, 767562)),
 ('6416039086842158968', datetime.datetime(2019, 4, 23, 18, 58, 2, 363468)),
 ('1258642015983312729', datetime.datetime(2019, 4, 23, 18, 58, 2, 397559))]

In [104]:
# For every device id, gather all of its appearance timestamps into one
# chronologically sorted list.
rdd_train = rdd_train.groupByKey().mapValues(lambda dates: sorted(list(dates)))
rdd_test = rdd_test.groupByKey().mapValues(lambda dates: sorted(list(dates)))

In [105]:
def get_all_reappearances(dateList):
    """
    Describe every reappearance in a sorted list of appearance dates of one ID.

    Each appearance that is followed by another one yields a 3-element list:
    item 1 = date of the appearance
    item 2 = how many appearances came before it (its position in the list)
    item 3 = seconds elapsed until the next appearance

    The last appearance has no successor and therefore produces no entry, so
    an empty or single-element input returns an empty list.
    """
    consecutive_pairs = zip(dateList, dateList[1:])
    return [
        [current, seen_before, (following - current).total_seconds()]
        for seen_before, (current, following) in enumerate(consecutive_pairs)
    ]

In [106]:
# Expand each (id, sorted dates) pair into one record per reappearance:
# (id, [appearance date, number of earlier appearances, seconds until the next appearance])
rdd_train = rdd_train.flatMapValues(get_all_reappearances)
rdd_test = rdd_test.flatMapValues(get_all_reappearances)

In [107]:
def toCSVLine(data):
  """Render the items of ``data`` as a single comma-separated line.

  Every item is converted with ``str``; no quoting or escaping is applied.
  """
  return ','.join(map(str, data))

In [108]:
# Sanity check: hour of the appearance date in the first test record
# (record layout is (id, [date, earlier appearances, seconds until next appearance])).
rdd_test.take(1)[0][1][0].hour

20

In [112]:
# Turn every train record (id, [date, earlier appearances, seconds until next]) into a CSV row.
lines_train = rdd_train.map(
    lambda record: (
        record[0],                # id
        record[1][0].hour > 5,    # True for hours 6-23
        record[1][0].hour > 22,   # True only for hour 23
        record[1][0].hour > 16,   # True for hours 17-23
        record[1][0].hour,        # hour of the appearance
        record[1][1],             # number of earlier appearances
        record[1][2],             # seconds until the next appearance
    )
).map(toCSVLine)
# Write everything through a single partition.
# Note: Spark's saveAsTextFile creates an output directory with this name (holding part files).
lines_train.repartition(1).saveAsTextFile('data_train.csv')

In [110]:
# Turn every test record (id, [date, earlier appearances, seconds until next]) into a CSV row.
lines_test = rdd_test.map(
    lambda record: (
        record[0],                # id
        record[1][0].hour > 5,    # True for hours 6-23
        record[1][0].hour > 22,   # True only for hour 23
        record[1][0].hour > 16,   # True for hours 17-23
        record[1][0].hour,        # hour of the appearance
        record[1][1],             # number of earlier appearances
        record[1][2],             # seconds until the next appearance
    )
).map(toCSVLine)
# Write everything through a single partition.
# Note: Spark's saveAsTextFile creates an output directory with this name (holding part files).
lines_test.repartition(1).saveAsTextFile('data_test.csv')

In [111]:
# Placeholder for the modelling step (left commented out): fit a regressor on the
# train features and predict on the test features.
# NOTE(review): RandomForestRegressor, X_train, Y_train and X_test are not imported or defined in the cells shown.
#clf = RandomForestRegressor()
#clf.fit(X_train, Y_train)  
#y_pred = clf.predict(X_test)
# Inspect a few of the expanded test records.
rdd_test.take(5)

[('2932617030932207332',
  [datetime.datetime(2019, 4, 21, 20, 52, 34, 600491), 0, 5.683044]),
 ('2932617030932207332',
  [datetime.datetime(2019, 4, 21, 20, 52, 40, 283535), 1, 104871.617357]),
 ('2932617030932207332',
  [datetime.datetime(2019, 4, 23, 2, 0, 31, 900892), 2, 87.786292]),
 ('2932617030932207332',
  [datetime.datetime(2019, 4, 23, 2, 1, 59, 687184), 3, 220.280367]),
 ('2932617030932207332',
  [datetime.datetime(2019, 4, 23, 2, 5, 39, 967551), 4, 130.331401])]