In [None]:
#!/usr/bin/env python
# coding: utf-8




import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pandas import concat
from tensorflow import keras
from time import localtime, strftime
import tensorflow as tf
import kerastuner as kt
import statistics






def getAllData(targetVar, path):
    """Load a CSV file and return it with the target column placed last.

    Args:
        targetVar: Name of the column to move to the end.
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame with the same columns as the file, reordered so that
        ``targetVar`` is the final column.

    Raises:
        ValueError: If ``targetVar`` is not one of the file's columns.
    """
    frame = pd.read_csv(path)

    # Echo the original column index so the caller can see what was loaded.
    print(frame.columns)

    ordered = list(frame.columns)
    ordered.remove(targetVar)
    ordered.append(targetVar)

    return frame[ordered]


def splitDataOnMonths(df, trainMonths=(0, 287), valMonths=(288, 323), testMonths=(324, 359)):
    """Split a dataframe into train, test and validation frames by month.

    The data is split on month ranges so that the three sets do not overlap
    in time. Each range is an inclusive ``(start, stop)`` pair that is
    compared against the ``"month"`` column.

    The defaults are the ranges previously hard-coded for the Africa data
    set. The ranges noted in this file for the global data set were
    ``(0, 264)``, ``(265, 300)`` and ``(301, 336)``.

    Args:
        df: DataFrame containing a ``"month"`` column.
        trainMonths: Inclusive ``(start, stop)`` months of the training set.
        valMonths: Inclusive ``(start, stop)`` months of the validation set.
        testMonths: Inclusive ``(start, stop)`` months of the test set.

    Returns:
        ``(train_df, test_df, val_df)`` -- note that the test frame comes
        before the validation frame in the returned tuple.
    """

    def getMonthsBetween(frame, start, stop):
        # Series.between is inclusive on both ends by default.
        return frame[frame["month"].between(start, stop)]

    train_df = getMonthsBetween(df, trainMonths[0], trainMonths[1])
    test_df = getMonthsBetween(df, testMonths[0], testMonths[1])
    val_df = getMonthsBetween(df, valMonths[0], valMonths[1])

    print(train_df)
    print(val_df)
    print(test_df)

    return train_df, test_df, val_df

    

 
 #%%
def df_to_timeseriesMatrix(df, windowSize, nShifts, targetVar):
    """Create a time series matrix appropriate for supervised learning.

    Every non-target column is lagged by ``windowSize``, ..., 1 rows and the
    lagged copies are placed side by side. The column for lag ``i`` of
    variable ``j`` is named ``var{j}(t-{i-1})``. The target column is
    appended as ``"Y"`` after being shifted ``nShifts - 1`` rows towards the
    future. Rows containing any NaN (introduced by the shifts) are dropped.

    Args:
        df: DataFrame holding the predictors and the target column.
        windowSize: Number of lagged copies of the predictors to include.
        nShifts: Prediction horizon in rows. Values below 1 are treated
            as 1, i.e. the target is not shifted any further.
        targetVar: Name of the target column in ``df``.

    Returns:
        DataFrame with the lagged predictor columns followed by ``"Y"``.
    """
    # The target is taken out before the predictors are lagged.
    Y = df[targetVar]

    # Remove the target variable from the rest of the data set.
    col = df.columns.tolist()
    col.remove(targetVar)
    predictors = df[col]

    # Number of attributes/variables.
    n_variables = len(predictors.columns)

    cols, names = [], []

    # Build the lagged blocks, oldest lag first.
    for lag in range(windowSize, 0, -1):
        cols.append(predictors.shift(lag))
        names += ['var{0}(t-{1})'.format(j + 1, lag - 1) for j in range(n_variables)]

    # The horizon is at least one step. The previous clamp value of -1 made
    # the shift below positive, which aligned *past* targets with the
    # predictors instead of leaving the target unshifted.
    if nShifts <= 0:
        nShifts = 1

    # Shift Y as many steps into the future as should be predicted.
    Y = Y.shift(-nShifts + 1)

    # Add Y to the matrix.
    cols.append(Y)
    names.append("Y")

    # Put the pieces together.
    agg = concat(cols, axis=1)
    agg.columns = names

    # Drop the rows that have at least one NaN.
    agg = agg.dropna()

    return agg



def createNsplitTimeMatrix(df, windowSize, nShifts, targetVar):
    """Build the time series matrix and separate predictors from targets.

    Args:
        df: DataFrame passed on to ``df_to_timeseriesMatrix``.
        windowSize: Number of lags, passed on unchanged.
        nShifts: Prediction horizon, passed on unchanged.
        targetVar: Name of the target column, passed on unchanged.

    Returns:
        ``(dataX, dataY)`` where ``dataY`` holds the values of the ``"Y"``
        column and ``dataX`` the values of all remaining columns.
    """
    supervised = df_to_timeseriesMatrix(df, windowSize, nShifts, targetVar)

    # pop() removes "Y" from the local frame, leaving only the predictors.
    dataY = supervised.pop("Y").values
    dataX = supervised.values

    return dataX, dataY





def restructureData(df, windowSize, nShifts, targetVar):
    """Split a dataframe into predictors X and target values Y.

    Every country (distinct ``"country_id"`` value) is turned into samples
    on its own, so that no lag window spans two countries. The per-country
    results are stacked in the order the countries are visited.

    Args:
        df: DataFrame with a ``"country_id"`` column plus the predictor and
            target columns.
        windowSize: Number of lags per sample.
        nShifts: Prediction horizon; the target values are shifted
            accordingly by ``createNsplitTimeMatrix``.
        targetVar: Name of the target column.

    Returns:
        ``(X, Y)`` as NumPy arrays. Both are empty arrays when ``df``
        contains no countries.
    """
    countries = list(set(df["country_id"].values))

    # Collect the per-country pieces and concatenate once at the end rather
    # than re-copying the growing arrays on every iteration.
    x_parts, y_parts = [], []
    for c_id in countries:
        df_single_country = df[df["country_id"] == c_id]
        c_X, c_Y = createNsplitTimeMatrix(df_single_country, windowSize, nShifts, targetVar)
        x_parts.append(c_X)
        y_parts.append(c_Y)

    if not x_parts:
        return np.array([]), np.array([])

    return np.concatenate(x_parts), np.concatenate(y_parts)
       
    




def buildModel(hiddenLayers, unitsPerHL, unitsInputLayer, batch_size=32):
    """Build and compile the feed-forward binary classifier.

    Args:
        hiddenLayers: Number of hidden Dense layers after the first layer.
        unitsPerHL: Units in each hidden layer.
        unitsInputLayer: Units in the first Dense layer.
        batch_size: Forwarded as ``batch_size`` to every ReLU Dense layer.

    Returns:
        A compiled ``keras.models.Sequential`` model with a single sigmoid
        output, binary cross-entropy loss and the Adam optimizer.
    """
    # Metrics reported during fit/evaluate; the names are used as history keys.
    tracked_metrics = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auroc', curve='ROC'),
        keras.metrics.AUC(name='aupr', curve='PR'),
    ]

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=10e-6,
        amsgrad=False,
        name='Adam',
    )

    model = keras.models.Sequential()

    # First layer, then the requested number of hidden layers.
    layer_widths = [unitsInputLayer] + [unitsPerHL for _ in range(hiddenLayers)]
    for width in layer_widths:
        model.add(keras.layers.Dense(width, activation='relu', batch_size=batch_size))

    # Output layer: one probability.
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=tracked_metrics)

    return model


def build_tuner_model(hp):
    """Model-building function for the hyperparameter tuner.

    The search ranges are defined here and must be edited in this function:
    ``unitsPerHL`` 4..11 (step 1), ``hiddenLayers`` 1..5 (step 1) and
    ``batch_size`` 16..40 (step 4). The first Dense layer uses the same
    number of units as the hidden layers.

    Args:
        hp: Hyperparameter container supplied by the tuner; only its
            ``Int`` method is used.

    Returns:
        A compiled ``keras.models.Sequential`` model.
    """
    # Arguments are (name, min, max, step).
    width = hp.Int('unitsPerHL', 4, 11, step=1)
    depth = hp.Int('hiddenLayers', 1, 5, step=1)
    # NOTE(review): the batch size is only forwarded to the Dense layers
    # here -- confirm that this has the intended effect on the search.
    bs = hp.Int('batch_size', 16, 40, step=4)

    tracked_metrics = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auroc', curve='ROC'),
        keras.metrics.AUC(name='aupr', curve='PR'),
    ]

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=10e-6,
        amsgrad=False,
        name='Adam',
    )

    model = keras.models.Sequential()

    # One first layer plus `depth` hidden layers, all with `width` units.
    for _ in range(depth + 1):
        model.add(keras.layers.Dense(width, activation='relu', batch_size=bs))

    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=tracked_metrics)

    return model


def fitModel(model, trainX, trainY, valX, valY, epo, batch_size,  shuffle=False):
    """Train ``model`` and return it together with the fit history.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        trainX: Training predictors.
        trainY: Training targets.
        valX: Validation predictors.
        valY: Validation targets.
        epo: Number of epochs.
        batch_size: Batch size used during fitting.
        shuffle: Forwarded to ``fit`` as its ``shuffle`` argument.

    Returns:
        ``(model, history)`` where ``history`` is whatever ``model.fit``
        returned.
    """
    fit_options = dict(
        validation_data=(valX, valY),
        epochs=epo,
        verbose=2,
        shuffle=shuffle,
        batch_size=batch_size,
    )
    history = model.fit(x=trainX, y=trainY, **fit_options)

    return model, history




def predictModel(model, testX, nShifts, batch_size):
    """Run the model on ``testX`` and return its predictions.

    Args:
        model: Object exposing a Keras-style ``predict`` method.
        testX: Input samples; its ``shape`` is printed before predicting.
        nShifts: Accepted for interface compatibility; not used here.
        batch_size: Batch size forwarded to ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns.
    """
    print(testX.shape)
    return model.predict(testX, batch_size=batch_size)



def plotPredict(res, testY, nShifts, windowSize, epochs, hiddenLayers, unitsPerHL, unitsInputLayer, batch_size, timeStamp, shuffle, onlyAfricaTest, onlyAfricaVal, onlyAfricaTrain) :
    """Plot predictions against the actual data.

    The first 205 values of ``testY`` are drawn as the actual series, the
    ``nShifts`` values that follow them as the "future" values, and the
    predictions in ``res`` are drawn starting at position 129. The
    hyperparameters passed in are listed in the figure title.
    """
    # Fixed plotting window (positions into testY).
    window_begin = 0
    window_end = 205
    prediction_begin = 130 - window_begin

    # NaN padding moves each series to its position on the x axis.
    future_values = testY[window_end:window_end + nShifts]
    future_series = np.append(np.full(window_end - window_begin, np.nan), future_values)
    prediction_series = np.append(np.full(prediction_begin - 1, np.nan), res.tolist())

    print(f"Y-shape: {testY.shape}")

    plt.plot(prediction_series, "+r", label='prediction')
    plt.plot(testY[window_begin:window_end], label='actual')
    plt.plot(future_series, label="future val")

    plt.xlabel('Steps (months?)')
    plt.ylabel('Prob. of conflict')

    title = "{0}\n nShifts: {1} windowSize: {2} epochs: {3}\n  hiddenLayers: {4}  unitsPerHL: {5}  unitsInputLayer: {6} \n batch_size: {7} shuffle: {8} \n onlyAfricaTest: {9} onlyAfricaVal: {10} onlyAfricaTrain: {11}".format(
        timeStamp, nShifts, windowSize, epochs,
        hiddenLayers, unitsPerHL, unitsInputLayer,
        batch_size, shuffle,
        onlyAfricaTest, onlyAfricaVal, onlyAfricaTrain,
    )
    plt.title(title)

    plt.legend()
    plt.show()



def plotMetrics(history, timeStamp):
    """Plot training and validation curves recorded during fitting.

    A 2x2 grid is drawn with AUROC (top left), AUPR (top right), Precision
    (bottom left) and Loss (bottom right). Training curves are green; the
    validation curves use the default colour. Only the metrics that are
    actually plotted are read from the history, so a history without an
    ``accuracy`` entry is accepted.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists; must contain ``auroc``, ``precision``,
            ``aupr`` and ``loss`` plus their ``val_`` counterparts.
        timeStamp: Text used as the figure's super-title.

    Raises:
        KeyError: If one of the plotted metrics is missing.
    """
    # (row, column, history key, panel title)
    layout = (
        (0, 0, 'auroc', "AUROC"),
        (1, 0, 'precision', "Precision"),
        (0, 1, 'aupr', "AUPR"),
        (1, 1, 'loss', "Loss"),
    )

    # Read every series up front so a missing key fails before a figure
    # is created.
    recorded = history.history
    series = {key: (recorded[key], recorded['val_' + key]) for _, _, key, _ in layout}

    fig, axs = plt.subplots(2, 2)
    for position, (row, column, key, title) in enumerate(layout):
        training, validation = series[key]
        training_style = {"color": "g"}
        validation_style = {}
        if position == 0:
            # Label only one pair of lines so the figure legend has a
            # single "Training"/"Validation" entry.
            training_style["label"] = "Training"
            validation_style["label"] = "Validation"

        panel = axs[row, column]
        panel.plot(training, **training_style)
        panel.plot(validation, **validation_style)
        panel.set_title(title)

    fig.legend()
    fig.suptitle(timeStamp)

    fig.tight_layout()

def getEvaluateMetric(model, score):
    """Pair the model's metric names with the scores from ``model.evaluate``.

    Args:
        model: Object exposing ``metrics_names``.
        score: Sequence of values in the same order as ``metrics_names``.

    Returns:
        Dictionary mapping each metric name to its value. Pairing stops at
        the shorter of the two sequences.
    """
    return dict(zip(model.metrics_names, score))

def getAverageFitMetric(history):
    """Average every metric recorded in a fit history.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            sequences of values.

    Returns:
        Dictionary mapping each metric name to the ``statistics.mean`` of
        its recorded values.
    """
    return {
        name: statistics.mean(values)
        for name, values in history.history.items()
    }

def calcBrierScore(results, testY):
    """Compute the mean squared difference between predictions and targets.

    Args:
        results: Nested sequence of predictions (for example one
            single-element row per sample); it is flattened row by row.
        testY: Target values, paired with the flattened predictions in
            order.

    Returns:
        The ``statistics.mean`` of ``(prediction - target) ** 2`` over all
        pairs.
    """
    flat_predictions = [p for row in results for p in row]
    squared_errors = [(p - v) ** 2 for p, v in zip(flat_predictions, testY)]

    return statistics.mean(squared_errors)
    
    
    
def onlyAfricaFilter(df, only=False):
    """Optionally restrict a dataframe to the region codes 5 to 9.

    Args:
        df: DataFrame with an ``"e_regiongeo"`` column.
        only: When true, keep only the rows whose ``e_regiongeo`` is 5, 6,
            7, 8 or 9 (presumably the African regions -- confirm against
            the data set's codebook) and renumber the rows from zero.
            When false, ``df`` is returned untouched.

    Returns:
        The filtered DataFrame, or ``df`` itself when ``only`` is false.
    """
    if not only:
        return df

    kept_regions = [5, 6, 7, 8, 9]
    in_region = df['e_regiongeo'].isin(kept_regions)

    # Reset the row indexing after filtering.
    return df.loc[in_region].reset_index(drop=True)
    

#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
def saveModel(model, name):
    """Persist a model as ``<name>.json`` plus ``<name>.h5``.

    The JSON file receives the output of ``model.to_json()`` and the weights
    are written through ``model.save_weights``. A confirmation line is
    printed once both have been written.

    Args:
        model: Object exposing ``to_json`` and ``save_weights``.
        name: Base name (without extension) of the two output files.
    """
    base = str(name)
    architecture = model.to_json()

    with open(base + ".json", "w") as json_file:
        json_file.write(architecture)

    model.save_weights(base + ".h5")
    print("Model saved")



def loadModel(name):
    """Load a model previously written by ``saveModel``.

    The architecture is read from ``<name>.json`` and the weights from
    ``<name>.h5``. The arguments stored by ``saveArgs`` are read as well,
    when the corresponding file exists.

    Args:
        name: Base name (without extension) used when the model was saved.

    Returns:
        ``(model, args_text)`` where ``args_text`` is the content of
        ``<name>_args.txt``, or ``None`` if that file does not exist.
    """
    with open(str(name) + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()

    # model_from_json lives in keras.models; there is no tf.model_from_json.
    model = keras.models.model_from_json(loaded_model_json)

    model.load_weights(str(name) + ".h5")

    # The args file is written separately by saveArgs, so it may be absent
    # for a model that was saved on its own.
    try:
        args_text = loadArgs(name)
    except FileNotFoundError:
        args_text = None

    return model, args_text


def saveArgs(name, args):
    """Write ``str(args)`` to ``<name>_args.txt``, replacing any old file.

    Args:
        name: Base name of the output file.
        args: Object to store; its ``str()`` representation is written.
    """
    # The context manager guarantees the text is flushed and the handle is
    # closed before returning.
    with open("{0}_args.txt".format(name), "w") as f:
        f.write(str(args))
    
def loadArgs(name):
    """Return the text stored in ``<name>_args.txt``.

    Args:
        name: Base name of the file to read.

    Returns:
        The complete file content as a string.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    # Close the handle deterministically instead of leaving it open.
    with open("{0}_args.txt".format(name), "r") as f:
        return f.read()


#%%

def runModel(args):
    """Run one experiment: load data, build samples, then tune or train.

    The CSV location is read from the module-level ``path`` variable.

    Args:
        args: Dictionary with the keys ``targetVar``, ``onlyAfricaTest``,
            ``onlyAfricaVal``, ``onlyAfricaTrain``, ``windowSize``,
            ``nShifts``, ``epochs``, ``hiddenLayers``, ``unitsPerHL``,
            ``unitsInputLayer``, ``shuffle``, ``batch_size``, ``timeStamp``
            and ``kerasTuner``.

    Returns:
        ``(False, False)`` when ``args["kerasTuner"]`` is true (only the
        tuner summary is printed). Otherwise ``(model, metrics)`` where
        ``metrics`` has the keys ``"Brier"`` and ``"Metrics"``, both
        computed on the test set.
    """
    targetVar = args["targetVar"]
    onlyAfricaTest = args["onlyAfricaTest"]
    onlyAfricaVal = args["onlyAfricaVal"]
    onlyAfricaTrain = args["onlyAfricaTrain"]
    windowSize = args["windowSize"]
    nShifts = args["nShifts"]
    epochs = args["epochs"]
    hiddenLayers = args["hiddenLayers"]
    unitsPerHL = args["unitsPerHL"]
    unitsInputLayer = args["unitsInputLayer"]
    shuffle = args["shuffle"]
    batch_size = args["batch_size"]
    timeStamp = args["timeStamp"]

    # Fetch data
    df = getAllData(targetVar, path)

    print(df.columns)

    # Split in train, test, val
    train_df, test_df, val_df = splitDataOnMonths(df)

    # Apply the region filter, each set with its own flag. The validation
    # set previously used the test flag by mistake.
    test_df = onlyAfricaFilter(test_df, onlyAfricaTest)
    val_df = onlyAfricaFilter(val_df, onlyAfricaVal)
    train_df = onlyAfricaFilter(train_df, onlyAfricaTrain)

    # Remove 'month'-column
    train_df = train_df.drop(columns=["month"])
    val_df = val_df.drop(columns=["month"])
    test_df = test_df.drop(columns=["month"])

    # Same scaler for all three data sets: fitted on the training samples
    # only and then applied to validation and test.
    scaler = MinMaxScaler(feature_range=(0, 1))

    # Create training set
    trainX, trainY = restructureData(train_df, windowSize, nShifts, targetVar)
    print("Training set: Done "+ str(trainX.shape))

    # Normalize
    trainX_scaled = scaler.fit_transform(trainX)

    # Create val set
    valX, valY = restructureData(val_df, windowSize, nShifts, targetVar)
    print("Val set: Done "+ str(valX.shape))

    # Normalize
    valX_scaled = scaler.transform(valX)

    # Create test set
    testX, testY = restructureData(test_df, windowSize, nShifts, targetVar)
    print("Test set: Done "+ str(testX.shape))

    # Normalize
    testX_scaled = scaler.transform(testX)

    # Tune network by finding optimal hyperparameters.
    if args["kerasTuner"]:
        max_epochs = 100

        tuner = kt.Hyperband(build_tuner_model,'val_loss',max_epochs,hyperband_iterations=2)

        tuner.search(trainX_scaled, trainY,validation_data=(valX_scaled, valY))

        tuner.results_summary()

        return False, False

    # Normal run
    # Build
    model_1 = buildModel(hiddenLayers, unitsPerHL, unitsInputLayer, batch_size)
    # Fit
    model_1, history = fitModel(model_1, trainX_scaled, trainY, valX_scaled, valY, epochs, batch_size, shuffle=shuffle)

    # Print network structure
    model_1.summary()

    test_eval = model_1.evaluate(testX_scaled, testY, batch_size=batch_size)

    # Predict.
    res1 = predictModel(model_1, testX_scaled, nShifts, batch_size)

    # Plot metrics.
    plotMetrics(history, timeStamp)

    metrics = {"Brier": calcBrierScore(res1, testY), "Metrics": getEvaluateMetric(model_1, test_eval)}

    return model_1, metrics
    










In [None]:
# Location of the CSV file; runModel reads this module-level name when it
# calls getAllData.
path = "full_dataset.csv"




In [None]:
# Notebook check: load the data set and display its "month" column.
getAllData("dummy_type_1", path)["month"]

AFRIKA--------------------------------------------------

In [None]:



# Experiment configuration: target "dummy_type_1", nShifts = 1.
args_t1_sb_A = {
        "targetVar" : "dummy_type_1",
        "onlyAfricaTest" : True,
        "onlyAfricaVal" : False,
        "onlyAfricaTrain" : False,
        # Size of the window to include in one sample.
        "windowSize" : 3,
        # Number of shifts.
        "nShifts" : 1, 
        "epochs" : 7,
        "hiddenLayers" : 2,
        "unitsPerHL" : 6,
        "unitsInputLayer" : 6,
        # Shuffle the samples?
        "shuffle" : True,
        "batch_size" : 40,
        "timeStamp" : strftime("%Y-%m-%d %H:%M:%S", localtime()),
        "kerasTuner": False
}

# Train and evaluate; the bare name on the last line displays the metrics
# in the notebook.
model_t1_sb_A, metrics_t1_sb_A = runModel(args_t1_sb_A)
metrics_t1_sb_A

In [None]:
# Experiment configuration: target "dummy_type_1", nShifts = 6.
args_t6_sb_A = {
        "targetVar" : "dummy_type_1",
        "onlyAfricaTest" : True,
        "onlyAfricaVal" : False,
        "onlyAfricaTrain" : False,
        # Size of the window to include in one sample.
        "windowSize" : 3,
        # Number of shifts.
        "nShifts" : 6, 
        "epochs" : 6,
        "hiddenLayers" : 3,
        "unitsPerHL" : 7,
        "unitsInputLayer" : 6,
        # Shuffle the samples?
        "shuffle" : True,
        "batch_size" : 40,
        "timeStamp" : strftime("%Y-%m-%d %H:%M:%S", localtime()),
        "kerasTuner": False
}

# Train and evaluate; the bare name on the last line displays the metrics
# in the notebook.
model_t6_sb_A, metrics_t6_sb_A = runModel(args_t6_sb_A)
metrics_t6_sb_A

In [None]:
# Experiment configuration: target "dummy_type_1", nShifts = 12.
args_t12_sb_A = {
        "targetVar" : "dummy_type_1",
        "onlyAfricaTest" : True,
        "onlyAfricaVal" : False,
        "onlyAfricaTrain" : False,
        # Size of the window to include in one sample.
        "windowSize" : 3,
        # Number of shifts.
        "nShifts" : 12, 
        "epochs" : 7,
        "hiddenLayers" : 1,
        "unitsPerHL" : 5,
        "unitsInputLayer" : 6,
        # Shuffle the samples?
        "shuffle" : True,
        "batch_size" : 40,
        "timeStamp" : strftime("%Y-%m-%d %H:%M:%S", localtime()),
        "kerasTuner": False
}

# Train and evaluate; the bare name on the last line displays the metrics
# in the notebook.
model_t12_sb_A, metrics_t12_sb_A = runModel(args_t12_sb_A)
metrics_t12_sb_A