In [None]:
from datetime import datetime, timedelta

import pyspark
from pyspark import SparkContext, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import hour, mean
# Reuse the already-running SparkContext if there is one; otherwise start a new one.
sc = SparkContext.getOrCreate()

In [None]:
# Load auctions.csv, treating its first row as the header, and keep an RDD
# view of the resulting DataFrame for the row-wise processing done later.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("auctions")
    .getOrCreate()
)
df_auctions = spark.read.csv(path='auctions.csv', header=True)
rdd_auctions = df_auctions.rdd
# Columns, as listed by the author (NOTE(review): later code reads `device_id`
# and `date`, so the real header spelling may differ from this list - confirm):
# AuctionTypeId, country, date, deviceId, platform, refType, sourceId

In [None]:
# Load events.csv, treating its first row as the header, and keep an RDD view of it.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("events")
    .getOrCreate()
)
df_events = spark.read.csv(path='events.csv', header=True)
rdd_events = df_events.rdd
# Columns, as listed by the author (`deviceOs` appears twice in this list):
# date, eventId, refType, refHash, applicationId, attributed, deviceCountryCode, deviceOs, deviceBrand, deviceModel, deviceCity, sessionUserAgent, transId, userAgent, eventUuid, carrier, kind, deviceOs, wifi, connectionType, ipAddress, deviceLanguage

In [None]:
# Load installs.csv, treating its first row as the header, and keep an RDD view of it.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("installs")
    .getOrCreate()
)
df_installs = spark.read.csv(path='installs.csv', header=True)
rdd_installs = df_installs.rdd
# Columns, as listed by the author:
# created, applicationId, refType, refHash, clickHash, attributed, implicit, deviceCountryCode, deviceBrand, deviceModel, sessionUserAgent, userAgent, eventUuid, kind, wifi, transId, ipAddress, deviceLanguage

In [None]:
# Reduce every auction row to a (device_id, parsed auction timestamp) pair.
# The pairs are derived from df_auctions rather than from rdd_auctions itself,
# so the cell is idempotent: re-running it does not try to re-parse tuples
# that were already converted by a previous run.
rdd_auctions = df_auctions.rdd.map(
    lambda row: (row.device_id, datetime.strptime(row.date, "%Y-%m-%d %H:%M:%S.%f"))
)

In [None]:
# Keep only the first 3 days for training, so the maximum distance is 3 days.
# Per the author's note the data starts at 2019-04-18 00:00:00, which makes
# the last training instant 2019-04-20 23:59:59.999999.
# The 7th datetime() argument is MICROseconds: 999 would stop at
# 23:59:59.000999 and leave out almost the whole last second of the day.
limit_date_train = datetime(2019, 4, 20, 23, 59, 59, 999999)
# The following 3 days are used for testing.
limit_date_test_begin = datetime(2019, 4, 21, 0, 0, 0, 0)
limit_date_test_end = datetime(2019, 4, 23, 23, 59, 59, 999999)

In [None]:
# Split by date: the first 3 days go to training, the following 3 days to testing.
# Both ends of each window are inclusive so that an auction stamped exactly on
# a limit (e.g. midnight of the first test day) is not dropped from both sets.
rdd_train = rdd_auctions.filter(lambda x: x[1] <= limit_date_train)
rdd_test = rdd_auctions.filter(lambda x: limit_date_test_begin <= x[1] <= limit_date_test_end)

In [None]:
# Each ID now maps to the list of all its appearances, sorted by time.
# sorted() consumes the grouped iterable directly and returns a list.
rdd_train = rdd_train.groupByKey().mapValues(lambda dates: sorted(dates))
rdd_test = rdd_test.groupByKey().mapValues(lambda dates: sorted(dates))

In [None]:
def get_all_reappearances(dateList):
    """
    Pair every appearance of an ID with the time it took to show up again.

    dateList: time-sorted list of the datetimes at which one ID appeared.

    Returns a list with one two-element list per appearance that has a
    successor:
        element 0 = date on which the ID appeared in an auction
        element 1 = seconds elapsed until its next appearance
    The last appearance has no successor and is left out, so an input with
    fewer than two dates yields an empty list.
    """
    return [
        [current, (following - current).total_seconds()]
        for current, following in zip(dateList, dateList[1:])
    ]

In [None]:
# After flattening, every record is
# (id, [appearance date, seconds it then took to appear again]).
rdd_train, rdd_test = (
    grouped.flatMapValues(get_all_reappearances)
    for grouped in (rdd_train, rdd_test)
)

In [None]:
def _with_hour(record):
    """Append the hour of the appearance date as a third field of the record."""
    key, reappearance = record
    return (key, reappearance, reappearance[0].hour)


rdd_trainWithHours = rdd_train.map(_with_hour)
rdd_testWithHours = rdd_test.map(_with_hour)

In [None]:
def _features(record):
    """Feature pair of a record: (appearance date, hour of that appearance)."""
    return (record[1][0], record[2])


def _target(record):
    """Regression target of a record: seconds until the next appearance."""
    return record[1][1]


# collect() pulls the full result to the driver as plain Python lists.
X_train = rdd_trainWithHours.map(_features).collect()
Y_train = rdd_trainWithHours.map(_target).collect()

# NOTE(review): only the test features are collected; the test targets
# (record[1][1]) are never extracted in this cell.
X_test = rdd_testWithHours.map(_features).collect()

In [None]:
# pyspark.ml estimators do not follow the scikit-learn fit(X, y) / predict(X)
# API: fit() takes a DataFrame with a vector "features" column and a numeric
# "label" column, and predictions come from transform() on the fitted model.
# The collected samples are therefore turned back into DataFrames, and the
# appearance datetime is encoded as seconds since 1970-01-01 because the
# features must be numeric.
# NOTE(review): the data is collected to the driver and re-distributed here;
# building the DataFrames straight from the RDDs would avoid that round trip.
_EPOCH = datetime(1970, 1, 1)


def _to_feature_row(sample):
    """Convert one (appearance datetime, hour) sample into two floats."""
    appeared_at, hour_of_day = sample
    return ((appeared_at - _EPOCH).total_seconds(), float(hour_of_day))


train_rows = [
    _to_feature_row(sample) + (float(label),)
    for sample, label in zip(X_train, Y_train)
]
test_rows = [_to_feature_row(sample) for sample in X_test]

# Explicit schemas keep the column types stable and also work for empty lists,
# where Spark could not infer the types from the data.
df_train = spark.createDataFrame(train_rows, "appearance_ts double, hour double, label double")
df_test = spark.createDataFrame(test_rows, "appearance_ts double, hour double")

assembler = VectorAssembler(inputCols=["appearance_ts", "hour"], outputCol="features")
clf = RandomForestRegressor(featuresCol="features", labelCol="label", predictionCol="prediction")
model = Pipeline(stages=[assembler, clf]).fit(df_train)

# Predicted seconds until the next appearance, one value per row of X_test.
y_pred = [row.prediction for row in model.transform(df_test).select("prediction").collect()]