In [None]:
from functools import reduce
import threading
import multiprocessing as mp
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error
from ast import literal_eval
import datetime
from datetime import datetime, timedelta
import time
import pytz
import json
import math
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from tick.plot import plot_hawkes_kernels, plot_point_process
from tick.hawkes import HawkesExpKern, HawkesADM4, SimuHawkesExpKernels, SimuPoissonProcess

In [None]:
# Read the CSV path from standard input (no prompt text is shown) and load it.
# input() already returns a str, so it is handed to read_csv directly.
dataset = pd.read_csv(input())
dataset

In [None]:
#dataset.drop(['D'], axis=1, inplace = True)
#dataset

In [None]:
# Names of the dataset columns that are processed as timestamp lists by the
# cells below; derived column names such as 'actual_intensities_<name>' and
# 'intensities_<name>' are built from these entries.
timestampColumns = ['A', 'C', 'L']

In [None]:
# Prompt for the start date. strptime() yields a naive datetime; astimezone()
# treats a naive value as system-local time before converting it to UTC.
startDate = datetime.strptime(
    input("Start Date (Format: YYYY-MM-DD): "),
    "%Y-%m-%d",
).astimezone(pytz.utc)
startDate

In [None]:
# Prompt for the end date. strptime() yields a naive datetime; astimezone()
# treats a naive value as system-local time before converting it to UTC.
endDate = datetime.strptime(
    input("End Date (Format: YYYY-MM-DD): "),
    "%Y-%m-%d",
).astimezone(pytz.utc)
endDate

In [None]:
# Elapsed time between startDate and endDate, expressed in hours (despite the
# variable's name the value is an hour count, not a date).
# Both endpoints are timezone-aware UTC datetimes, so subtracting them gives the
# exact elapsed time. This avoids feeding their UTC time tuples to time.mktime(),
# which interprets a tuple as *local* time and therefore depends on the
# machine's timezone and DST handling.
dayAfterEndDate = (endDate - startDate).total_seconds() / 3600
dayAfterEndDate

In [None]:
def calculateActualIntensities(timestampList):
    """Count how many time points fall into each whole-number bucket.

    Every value of ``timestampList`` is rounded down with ``math.floor`` and
    the occurrences of each resulting integer are tallied.

    Returns:
        dict mapping floored value -> number of occurrences, with keys in
        order of first appearance.
    """
    counts = {}
    for timePoint in timestampList:
        bucket = math.floor(timePoint)
        counts[bucket] = counts.get(bucket, 0) + 1

    return counts

# For every timestamp column: parse each cell with literal_eval, sort the
# parsed values ascending, then tally them per floored bucket.
# NOTE(review): re-running this cell will fail, because literal_eval is then
# applied to already-parsed lists.
for column in timestampColumns:
    dataset[column] = dataset[column].apply(literal_eval).apply(sorted)
    dataset['actual_intensities_' + column] = dataset[column].apply(calculateActualIntensities)

dataset

In [None]:
def calculateAverageIntensityPerHour(intensityList):
    """Return the sum of all values in ``intensityList`` divided by the bucket count.

    ``intensityList`` is a dict (bucket -> count). The divisor is
    ``math.floor(dayAfterEndDate) + 1``, where ``dayAfterEndDate`` is read from
    the notebook-level variable of that name.
    """
    total = 0
    for count in intensityList.values():
        total = total + count

    bucketCount = math.floor(dayAfterEndDate) + 1
    return total / bucketCount

# Derive one average-intensity column per timestamp column from the tallied counts.
for timestampColumn in timestampColumns:
    countsColumn = 'actual_intensities_' + timestampColumn
    dataset['average_intensity_' + timestampColumn] = dataset[countsColumn].apply(calculateAverageIntensityPerHour)

dataset

In [None]:
def calculatePoissonIntensities(df):
    """Simulate a Poisson process for one dataset row and return its tracked intensity.

    The process intensities are the row's 'average_intensity_<name>' values,
    one per entry of ``timestampColumns``; the simulation runs until
    ``dayAfterEndDate`` with an intensity tracking step of 1.

    Returns:
        The simulator's ``tracked_intensity`` attribute after ``simulate()``.
    """
    baselineRates = np.array([df['average_intensity_' + name] for name in timestampColumns])
    simulation = SimuPoissonProcess(intensities=baselineRates, end_time=dayAfterEndDate)
    simulation.track_intensity(intensity_track_step=1)
    simulation.simulate()
    return simulation.tracked_intensity

# One simulation per dataset row.
dataset['intensities'] = dataset.apply(calculatePoissonIntensities, axis=1)
dataset

In [None]:
def calculatePoissonIntensityTimes(df):
    """Simulate a Poisson process for one dataset row and return the tracking times.

    The process intensities are the row's 'average_intensity_<name>' values,
    one per entry of ``timestampColumns``; the simulation runs until
    ``dayAfterEndDate`` with an intensity tracking step of 1.

    Returns:
        The simulator's ``intensity_tracked_times`` attribute after ``simulate()``.
    """
    # NOTE(review): this builds and runs a fresh simulation, independent of the
    # one run by calculatePoissonIntensities for the same row.
    baselineRates = np.array([df['average_intensity_' + name] for name in timestampColumns])
    simulation = SimuPoissonProcess(intensities=baselineRates, end_time=dayAfterEndDate)
    simulation.track_intensity(intensity_track_step=1)
    simulation.simulate()
    return simulation.intensity_tracked_times

# One simulation per dataset row.
dataset['intensity_times'] = dataset.apply(calculatePoissonIntensityTimes, axis=1)
dataset

In [None]:
def splitResultsExpKern(overallResult, dimensions):
    """Split the combined 'intensities' column into one list column per dimension.

    For position ``p`` of ``dimensions`` a column 'intensities_<dimension>' is
    created that holds ``cell[p].tolist()`` for every cell of 'intensities'.
    The 'intensities' column is then removed and every cell of
    'intensity_times' is replaced by its ``.tolist()``.

    The frame is modified in place and the same object is returned.
    """
    stacked = overallResult['intensities']
    for position, dimension in enumerate(dimensions):
        # Bind the position as a default so each lambda keeps its own index.
        overallResult['intensities_' + dimension] = stacked.apply(
            lambda tracks, position=position: tracks[position].tolist()
        )

    del overallResult['intensities']
    overallResult['intensity_times'] = overallResult['intensity_times'].apply(lambda times: times.tolist())

    return overallResult

In [None]:
# Replace the combined 'intensities' column with one 'intensities_<name>'
# column per entry of timestampColumns (splitResultsExpKern edits the frame
# in place and returns it).
dataset = splitResultsExpKern(dataset, timestampColumns)
dataset

In [None]:
# Turn each per-dimension intensity list into a {tracked time: intensity}
# dictionary by pairing it with the row's 'intensity_times'.
for timestampColumn in timestampColumns:
    dataset['intensities_' + timestampColumn] = dataset.apply(
        lambda row: dict(zip(row['intensity_times'], row['intensities_' + timestampColumn])),
        axis=1,
    )

dataset

In [None]:
def fixTimestamps(df, column):
    """Re-key the dictionary stored at ``df[column]`` by floored key.

    Values whose keys floor (``math.floor``) to the same integer are added
    together.

    Returns:
        dict mapping floored key -> summed value, in order of first appearance.
    """
    tracked = df[column]
    merged = {}
    for timePoint, intensity in tracked.items():
        hour = math.floor(timePoint)
        merged[hour] = merged.get(hour, 0) + intensity
    return merged

# Collapse the tracked intensities of every dimension onto whole-number keys.
for dim in timestampColumns:
    hourlyColumn = 'intensities_' + dim
    dataset[hourlyColumn] = dataset.apply(lambda row: fixTimestamps(row, hourlyColumn), axis=1)

dataset

In [None]:
def completeTimestamps(df, colName):
    """Return the dictionary at ``df[colName]`` restricted to and padded over every bucket.

    The result has exactly the keys ``0 .. math.floor(dayAfterEndDate)`` in
    ascending order (``dayAfterEndDate`` is the notebook-level variable);
    buckets missing from ``df[colName]`` get the value 0.
    """
    return {
        t: (df[colName][t] if t in df[colName].keys() else 0)
        for t in range(0, math.floor(dayAfterEndDate) + 1)
    }

# Pad both the observed and the simulated per-bucket dictionaries of every
# dimension so that they share the same, complete set of keys.
for colName in ['actual_intensities_', 'intensities_']:
    for dim in timestampColumns:
        targetColumn = colName + dim
        dataset[targetColumn] = dataset.apply(lambda row: completeTimestamps(row, targetColumn), axis=1)

dataset

In [None]:
# Distinct modality labels, used to drive the per-modality plots.
# Missing values are dropped first: groupby() excludes NaN keys by default, so
# a NaN entry here would select an empty frame in the plotting loop and make
# its .iloc[0] lookup fail.
modalities = dataset['Modality'].dropna().unique()

In [None]:
def combineIntensityDictionaries(x, y):
    """Add two dictionaries key by key.

    Returns a new dict that contains every key of ``x`` followed by the keys
    that only occur in ``y``; values of keys present in both are summed.
    Neither input is modified.
    """
    merged = {}
    for source in (x, y):
        for key in source.keys():
            merged[key] = merged.get(key, 0) + source[key]

    return merged
    

def sumIntensities(series):
    """Fold all dictionaries of ``series`` into one with combineIntensityDictionaries."""
    return reduce(combineIntensityDictionaries, series)

# Use the dictionary-summing aggregation for the observed and the simulated
# intensity column of every dimension.
intensityColumns = {}
for dimension in timestampColumns:
    for prefix in ('actual_intensities_', 'intensities_'):
        intensityColumns[prefix + dimension] = sumIntensities

# One row per modality, holding the key-wise sums over all of its dataset rows.
intensityPlotDataset = (
    dataset
    .groupby(['Modality'])
    .agg(intensityColumns)
    .reset_index()
)
intensityPlotDataset

In [None]:
# Number of non-missing 'course_code' entries per modality, stored as 'total'.
courseStudentCount = (
    dataset
    .groupby(['Modality'])['course_code']
    .count()
    .to_frame('total')
    .reset_index()
)
courseStudentCount

In [None]:
# Attach the per-modality 'total' to the aggregated intensities.
# NOTE(review): re-running this cell merges 'total' a second time, which pandas
# resolves by suffixing the clashing columns.
intensityPlotDataset = pd.merge(intensityPlotDataset, courseStudentCount, on=['Modality'])
intensityPlotDataset

In [None]:
def getAverageIntensities(df, column):
    """Divide every value of the dictionary at ``df[column]`` by ``df['total']``.

    Returns:
        A new dict with the same keys in ascending order.
    """
    summed = df[column]
    return {key: summed[key] / df['total'] for key in sorted(summed.keys())}

# Convert the per-modality sums into per-'total' averages, first for all
# observed columns and then for all simulated ones.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# divides by 'total' again.
averagedColumns = [
    prefix + dimension
    for prefix in ('actual_intensities_', 'intensities_')
    for dimension in timestampColumns
]
for column in averagedColumns:
    intensityPlotDataset[column] = intensityPlotDataset.apply(lambda row: getAverageIntensities(row, column), axis=1)

intensityPlotDataset

In [None]:
# One figure per modality: a stacked subplot per timestamp column comparing the
# observed (blue) and simulated (orange) average intensities, saved as
# '<modality>_Intensities_Poisson.png' in the working directory.
for modality in modalities:
    # squeeze=False always yields a 2-D array of Axes, so the loop below also
    # works when only one timestamp column is configured. The size is set up
    # front so that tight_layout() computes its spacing for the final 10x10
    # inch canvas instead of the default one.
    fig, axs = plt.subplots(len(timestampColumns), 1, figsize=(10, 10), squeeze=False)
    intensityPlotDatasetModality = intensityPlotDataset.loc[intensityPlotDataset['Modality'] == modality]
    modalityRow = intensityPlotDatasetModality.iloc[0]

    for ax, dimension in zip(axs.flat, timestampColumns):
        actual = modalityRow['actual_intensities_' + dimension]
        estimated = modalityRow['intensities_' + dimension]
        ax.plot(list(actual.keys()), list(actual.values()), color='tab:blue')
        ax.plot(list(estimated.keys()), list(estimated.values()), color='tab:orange')
        ax.set_title(dimension)
        ax.set(xlabel='Time', ylabel='Intensity')

    fig.legend(
        handles=[
            Line2D([0], [0], color='tab:blue', label='Actual Intensities'),
            Line2D([0], [0], color='tab:orange', label='Est. Intensities'),
        ],
        loc='lower center',
    )
    fig.tight_layout(pad=1.0)
    fig.savefig(modality + '_Intensities_Poisson.png')
    # Close this specific figure rather than whatever pyplot considers current.
    plt.close(fig)

In [None]:
# Root-mean-square error between the observed and the simulated per-bucket
# values of every dataset row, one 'rmse_<name>' column per dimension.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
for dim in timestampColumns:
    dataset['rmse_' + dim] = dataset.apply(
        lambda row: np.sqrt(
            mean_squared_error(
                list(row['actual_intensities_' + dim].values()),
                list(row['intensities_' + dim].values()),
            )
        ),
        axis=1,
    )

dataset

In [None]:
# Mean RMSE per modality for every dimension.
# The aggregation is named by the string 'mean' rather than by passing
# np.mean: recent pandas versions emit a FutureWarning for the callable form
# and recommend the string to keep the current behaviour.
averageAgg = {'rmse_' + dim: 'mean' for dim in timestampColumns}

rmseModalities = dataset.groupby(['Modality']).agg(averageAgg).reset_index()
rmseModalities

In [None]:
# Write the per-modality RMSE table to 'RMSE_Poisson.csv' in the working
# directory, with a header row and without the index column.
rmseModalities.to_csv('RMSE_Poisson.csv', header = True, index = False)