In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import math
import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv("../../resource/ModelCreation/worldVaccinesCleanSimple.csv")

Make date into integer of days since the first date in the dataset

In [2]:
data.date = pd.to_datetime(data.date,format="%Y-%m-%d") # Convert date strings to datetime objects
mindate1 = data.date.min() # Save min for when we take in input
data.date = data.date - data.date.min()
data.date = pd.Series([x.days for x in data.date])

Make people_fully_vaccinated_per_hundred between the range of 0 and 1

In [3]:
data["people_fully_vaccinated_per_hundred"] = data["people_fully_vaccinated_per_hundred"]/100

Put data into a dictionary with keys as countries and a dictionary with the data as the value

In [4]:
datadict = dict()
for c in data.country.unique():
    tmpdata = data.loc[data.country==c] # Get the data where the country column equals the certain country
    tmpdata.drop("country",axis=1,inplace=True) # Drop the country column because this value will be the same for this table
    datadict[c] = {"data":tmpdata} # Set the value for the key at country c to be the data we just extracted

First, let's compare how my original Linear Regression model with transformations to turn it into logistic stacks up against a 1 neuron neural network with a sigmoid function.

In [5]:
class RegressionModel:
    # Initialize data with trainX and triany, then find the correct equation to regress on
    def __init__(self,trainX,trainy,testX,testy):
        # By default, save all the train and test X and y as a fully random split of X and y
        self.trainX = trainX
        self.trainy = trainy
        self.testX = testX
        self.testy = testy
        self.fit()
        
    def fit(self):
        trainy = self.transformYFit(self.trainy.copy()) # Transform training x values
        testy = self.transformYFit(self.testy.copy()) # Transform testing x values
        self.model = LinearRegression().fit(self.trainX,trainy) # Make a model fit with the training x and y values
        self.score = self.model.score(self.testX,testy)
        
    def transformYFit(self, Y):
        # Create a lambda with formula to transform values.
        transformation = lambda y: -1*math.log((1/(y+.01))-1)
        return [transformation(y) for y in Y]
    
    def transformYPredict(self, Y):
        transformation = lambda y: (1/(1+math.exp(-1*y)))-.01
        return [transformation(y) for y in Y]
    
    def getScore(self):
        return self.score
        
    def predict(self,X,y):
        return (self.transformYPredict(self.model.predict(tmpX)), self.model.score(tmpX,y)) # Return the predicted y, but transformed to be betweeen 0 and 1, and the score
    
    def predictXOnly(self,X):
        return self.transformYPredict(self.model.predict(X))
    
    def finalFit(self,X,y):
        # Initialize training and test X and y to X and y.
        self.trainX = X
        self.testX = X
        self.trainy = y
        self.testy = y
        self.fit(self.degree)

In [39]:
class PolyLogisticRegressionModel:
    # Initialize data with trainX and triany, then find the correct equation to regress on
    def __init__(self,trainX,trainy,testX,testy):
        # By default, save all the train and test X and y as a fully random split of X and y
        self.trainX = trainX
        self.trainy = trainy
        self.testX = testX
        self.testy = testy
        self.findRegress()
        
    def findRegress(self):
        # Set an initial model and score from fitting our model
        self.model, self.score = self.fit(1)
        self.degree = 1
        # Loop through all possible boolean values
        for d in range(1,10,2):
            # Run the fit function
            tmpmodel, tmpscore = self.fit(d)
            # Check if this new model has a better score
            if tmpscore > self.score:
                # If the new model has a better score, update the model, score, and regress_bool
                self.model = tmpmodel
                self.score = tmpscore
                self.degree = d
        
        
    def fit(self, degree):
        trainX = self.transformX(self.trainX.copy(), degree) # Transform training x values
        testX = self.transformX(self.testX.copy(), degree) # Transform testing x values
        trainy = self.transformYFit(self.trainy.copy()) # Transform training x values
        testy = self.transformYFit(self.testy.copy()) # Transform testing x values
        model = LinearRegression().fit(trainX,trainy) # Make a model fit with the training x and y values
        score = model.score(testX,testy)
        return model, score
        
    def transformYFit(self, Y):
        # Create a lambda with formula to transform values.
        transformation = lambda y: -1*math.log((1/(y+.01))-1)
        return [transformation(y) for y in Y]
    
    def transformYPredict(self, Y):
        transformation = lambda y: (1/(1+math.exp(-1*y)))-.01
        for y in Y:
            try:
                if y > -700:
                    transformation(y)
            except:
                print(y)
        return [transformation(y) if y > -700 else 0 for y in Y]
    
    def transformX(self, X, degree):
        # Create a list of lambdas with formulas to transform values.
        newX = []
        # Loop through each row in the list
        for row in X:
            new_values = []
            # Loop through each possible transformation
            if degree == 9:
                for val in row:
                    new_values.append(math.log(val+2))
                newX.append(np.append(row, new_values))
            else: 
                for p in range(2,degree+1):
                    for val in row:
                        new_values.append(val**p)
                newX.append(np.append(row, new_values))
        return newX
    
    def getScore(self):
        return self.score
    
    def getDegree(self):
        return self.degree
        
    def predict(self,X,y):
        tmpX = self.transformX(X,self.degree)
        return (self.transformYPredict(self.model.predict(tmpX)), self.model.score(tmpX,y)) # Return the predicted y, but transformed to be betweeen 0 and 1, and the score
    
    def predictXOnly(self,X):
        return self.transformYPredict(self.model.predict(self.transformX(X,self.degree)))
    
    def finalFit(self,X,y):
        # Initialize training and test X and y to X and y.
        self.trainX = X
        self.testX = X
        self.trainy = y
        self.testy = y
        self.fit(self.degree)

In [8]:
c = "United States"
X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
y = datadict[c]["data"]["people_fully_vaccinated_per_hundred"] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)
scaler = MinMaxScaler(feature_range=(-1,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
modelReg = RegressionModel(trainX=X_train,testX=X_test,trainy=y_train,testy=y_test)
y_predReg = modelReg.predictXOnly(X_test)
modelNN = keras.Sequential() # Create Model
modelNN.add(layers.Dense(1, input_dim = 1,activation="sigmoid")) # Add Input Layer
modelNN.compile(loss = 'mae', optimizer='sgd')
result = modelNN.fit(X_train,y_train,epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
y_predNN = modelNN.predict(X_test)
mae = tf.keras.losses.MeanAbsoluteError()
datadict[c]["lossNN"] = mae(y_test, y_predNN).numpy()
datadict[c]["lossReg"] = mae(y_test,y_predReg).numpy()
print(c, "NN:",datadict[c]["lossNN"], "Reg:",datadict[c]["lossReg"])

United States NN: 0.18020727 Reg: 0.038589094


In [11]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
datadict[c]["X"] = MinMaxScaler(feature_range=(-1,1)).fit_transform(X)
datadict[c]["y"] = datadict[c]["data"]["people_fully_vaccinated_per_hundred"]
X_train, X_test, y_train, y_test = train_test_split(datadict[c]["X"],datadict[c]["y"],test_size=.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=.2*10/8)
datadict[c]["batch"] = 5
datadict[c]["model"] = keras.Sequential() # Create Model
datadict[c]["model"].add(layers.Dense(1, input_dim = 1,activation="sigmoid")) # Add Input Layer
datadict[c]["model"].compile(loss = 'mae', optimizer='sgd')
datadict[c]["result"] = datadict[c]["model"].fit(X_train,y_train,epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
mae = tf.keras.losses.MeanAbsoluteError()
y_pred = datadict[c]["model"].predict(X_valid)
datadict[c]["loss"] = mae(y_valid, y_pred).numpy()
for b in range(10,55,5):
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
    model = keras.Sequential() # Create Model
    model.add(layers.Dense(1, input_dim = 1,activation="sigmoid")) # Add Input Layer
    model.compile(loss = 'mae', optimizer='sgd')
    result = model.fit(X_train,y_train,epochs=10000,batch_size=b,verbose=0,callbacks=[callback])
    y_pred = datadict[c]["model"].predict(X_valid)
    loss = mae(y_valid, y_pred).numpy()
    if loss < datadict[c]["loss"]:
        datadict[c]["model"] = model
        datadict[c]["result"] = result
        datadict[c]["loss"] = loss
        datadict[c]["batch"] = b
print(c, "Batch:",datadict[c]["batch"])

United States Batch: 5


From this testing, we see that batch_size of 5 is best, so we will just hard code this in from now on.

In [12]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
datadict[c]["X"] = MinMaxScaler(feature_range=(-1,1)).fit_transform(X)
datadict[c]["y"] = datadict[c]["data"]["people_fully_vaccinated_per_hundred"]
X_train, datadict[c]["X_test"], y_train, datadict[c]["y_test"] = train_test_split(datadict[c]["X"],datadict[c]["y"],test_size=.2)
datadict[c]["X_train"], datadict[c]["X_valid"], datadict[c]["y_train"], datadict[c]["y_valid"] = train_test_split(X_train,y_train,test_size=.2*10/8)
datadict[c]["model"] = keras.Sequential() # Create Model
datadict[c]["model"].add(layers.Dense(16, input_dim = 1,activation="sigmoid")) # Add Input Layer
datadict[c]["model"].add(layers.Dense(1,activation="sigmoid"))
datadict[c]["model"].compile(loss = 'mae', optimizer='sgd')
datadict[c]["result"] = datadict[c]["model"].fit(datadict[c]["X_train"],datadict[c]["y_train"],epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
mae = tf.keras.losses.MeanAbsoluteError()
y_pred = datadict[c]["model"].predict(datadict[c]["X_valid"])
datadict[c]["loss"] = mae(datadict[c]["y_valid"], y_pred).numpy()
datadict[c]["hidden"] = 0
datadict[c]["size"] = 1
for h in range(4):
    for s in range(1,17):
        model = keras.Sequential() # Create Model
        model.add(layers.Dense(s, input_dim = 1,activation="sigmoid")) # Add Input Layer
        for _ in range(h):
            model.add(layers.Dense(s,activation="sigmoid"))
        model.add(layers.Dense(1,activation="sigmoid"))
        model.compile(loss = 'mae', optimizer='sgd')
        result = model.fit(datadict[c]["X_train"],datadict[c]["y_train"],epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
        y_pred = datadict[c]["model"].predict(X_valid)
        loss = mae(datadict[c]["y_valid"], y_pred).numpy()
        if loss < datadict[c]["loss"]:
            datadict[c]["model"] = model
            datadict[c]["result"] = result
            datadict[c]["loss"] = loss
            datadict[c]["hidden"] = h
            datadict[c]["size"] = s
print(datadict[c]["hidden"], datadict[c]["size"])

KeyError: 'hidden'

From this, we see that it is better to just have the one input layer and output, and just 16 neurons.

In [42]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
y = datadict[c]["data"]["people_fully_vaccinated_per_hundred"] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)
scaler = MinMaxScaler(feature_range=(-1,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
modelReg = RegressionModel(trainX=X_train,testX=X_test,trainy=y_train,testy=y_test)
y_predReg = modelReg.predictXOnly(X_test)
modelNN = keras.Sequential() # Create Model
modelNN.add(layers.Dense(16, input_dim = 1,activation="sigmoid")) # Add Input Layer
modelNN.add(layers.Dense(1,activation="sigmoid"))
modelNN.compile(loss = 'mae', optimizer='sgd')
result = modelNN.fit(X_train,y_train,epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
y_predNN = modelNN.predict(X_test)
mae = tf.keras.losses.MeanAbsoluteError()
datadict[c]["lossNN"] = mae(y_test, y_predNN).numpy()
datadict[c]["lossReg"] = mae(y_test,y_predReg).numpy()
    
print(c, "NN:",datadict[c]["lossNN"], "Reg:",datadict[c]["lossReg"])

United States NN: 0.17358632 Reg: 0.044319764


In [6]:
"""callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
for c in datadict.keys():
    X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
    y = datadict[c]["data"]["people_fully_vaccinated_per_hundred"] 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    modelReg = RegressionModel(trainX=X_train,testX=X_test,trainy=y_train,testy=y_test)
    y_predReg = modelReg.predictXOnly(X_test)
    modelNN = keras.Sequential() # Create Model
    modelNN.add(layers.Dense(1, input_dim = 1,activation="sigmoid")) # Add Input Layer
    modelNN.compile(loss = 'mae', optimizer='sgd')
    result = modelNN.fit(X_train,y_train,epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
    y_predNN = modelNN.predict(X_test)
    mae = tf.keras.losses.MeanAbsoluteError()
    datadict[c]["lossNN"] = mae(y_test, y_predNN).numpy()
    datadict[c]["lossReg"] = mae(y_test,y_predReg).numpy()

for c in datadict.keys():
    print(c, "NN:",datadict[c]["lossNN"], "Reg:",datadict[c]["lossReg"])"""

KeyboardInterrupt: 

From this we can see that just using a NN to try to create a logistic curve results in far worse mae. However, now we will test on different sizes

Create ANN test what a good batch_size is, using a validation set

In [None]:
"""callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
for c in datadict.keys():
    X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
    datadict[c]["X"] = MinMaxScaler(feature_range=(-1,1)).fit_transform(X)
    datadict[c]["y"] = datadict[c]["data"]["people_fully_vaccinated_per_hundred"]
    X_train, datadict[c]["X_test"], y_train, datadict[c]["y_test"] = train_test_split(datadict[c]["X"],datadict[c]["y"],test_size=.2)
    datadict[c]["X_train"], datadict[c]["X_valid"], datadict[c]["y_train"], datadict[c]["y_valid"] = train_test_split(X_train,y_train,test_size=.2*10/8)
    datadict[c]["batch"] = 5
    datadict[c]["model"] = keras.Sequential() # Create Model
    datadict[c]["model"].add(layers.Dense(1, input_dim = 1,activation="sigmoid")) # Add Input Layer
    datadict[c]["model"].compile(loss = 'mae', optimizer='sgd')
    datadict[c]["result"] = datadict[c]["model"].fit(datadict[c]["X_train"],datadict[c]["y_train"],epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
    mae = tf.keras.losses.MeanAbsoluteError()
    y_pred = datadict[c]["model"].predict(X_valid)
    datadict[c]["loss"] = mae(y_valid, y_pred).numpy()
    for b in range(10,55,5):
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
        model = keras.Sequential() # Create Model
        model.add(layers.Dense(1, input_dim = 1,activation="sigmoid")) # Add Input Layer
        model.compile(loss = 'mae', optimizer='sgd')
        result = model.fit(datadict[c]["X_train"],datadict[c]["y_train"],epochs=10000,batch_size=b,verbose=0,callbacks=[callback])
        y_pred = datadict[c]["model"].predict(X_valid)
        loss = mae(datadict[c]["y_valid"], datadict[c]["y_pred"]).numpy()
        if loss < datadict[c]["loss"]:
            datadict[c]["model"] = model
            datadict[c]["result"] = result
            datadict[c]["loss"] = loss
            datadict[c]["batch"] = b
    print(c, "NN:",datadict[c]["lossNN"], "Reg:",datadict[c]["lossReg"])"""

From this testing, we see that batch_size of 5 is best, so we will just hard code this in from now on.

In [None]:
"""callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
for c in datadict.keys():
    X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
    datadict[c]["X"] = MinMaxScaler(feature_range=(-1,1)).fit_transform(X)
    datadict[c]["y"] = datadict[c]["data"]["people_fully_vaccinated_per_hundred"]
    X_train, datadict[c]["X_test"], y_train, datadict[c]["y_test"] = train_test_split(datadict[c]["X"],datadict[c]["y"],test_size=.2)
    datadict[c]["X_train"], datadict[c]["X_valid"], datadict[c]["y_train"], datadict[c]["y_valid"] = train_test_split(X_train,y_train,test_size=.2*10/8)
    datadict[c]["model"] = keras.Sequential() # Create Model
    datadict[c]["model"].add(layers.Dense(16, input_dim = 1,activation="sigmoid")) # Add Input Layer
    datadict[c]["model"].add(layers.Dense(1,activation="sigmoid"))
    datadict[c]["model"].compile(loss = 'mae', optimizer='sgd')
    datadict[c]["result"] = datadict[c]["model"].fit(datadict[c]["X_train"],datadict[c]["y_train"],epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
    mae = tf.keras.losses.MeanAbsoluteError()
    y_pred = datadict[c]["model"].predict(datadict[c]["X_valid"])
    datadict[c]["loss"] = mae(datadict[c]["y_valid"], y_pred).numpy()
    for h in range(4):
        for s in range(1,17):
            model = keras.Sequential() # Create Model
            model.add(layers.Dense(s, input_dim = 1,activation="sigmoid")) # Add Input Layer
            for _ in range(h):
                model.add(layers.Dense(s,activation="sigmoid"))
            model.add(layers.Dense(1,activation="sigmoid"))
            model.compile(loss = 'mae', optimizer='sgd')
            result = model.fit(datadict[c]["X_train"],datadict[c]["y_train"],epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
            y_pred = datadict[c]["model"].predict(X_valid)
            loss = mae(datadict[c]["y_valid"], y_pred).numpy()
            if loss < datadict[c]["loss"]:
                datadict[c]["model"] = model
                datadict[c]["result"] = result
                datadict[c]["loss"] = loss
                datadict[c]["hidden"] = h
                datadict[c]["size"] = s
    print(datadict[c]["hidden"], datadict[c]["size"])"""

From this, we see that it is better to just have the one input layer and output, and just 16 neurons.

In [None]:
"""callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=100)
for c in datadict.keys():
    X = datadict[c]["data"]["date"].to_numpy().reshape(-1,1)
    y = datadict[c]["data"]["people_fully_vaccinated_per_hundred"] 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    modelReg = RegressionModel(trainX=X_train,testX=X_test,trainy=y_train,testy=y_test)
    y_predReg = modelReg.predictXOnly(X_test)
    modelNN = keras.Sequential() # Create Model
    modelNN.add(layers.Dense(16, input_dim = 1,activation="sigmoid")) # Add Input Layer
    modelNN.add(layers.Dense(1,activation="sigmoid"))
    modelNN.compile(loss = 'mae', optimizer='sgd')
    result = modelNN.fit(X_train,y_train,epochs=10000,batch_size=5,verbose=0,callbacks=[callback])
    y_predNN = modelNN.predict(X_test)
    mae = tf.keras.losses.MeanAbsoluteError()
    datadict[c]["lossNN"] = mae(y_test, y_predNN).numpy()
    datadict[c]["lossReg"] = mae(y_test,y_predReg).numpy()
    
for c in datadict.keys():
    print(c, "NN:",datadict[c]["lossNN"], "Reg:",datadict[c]["lossReg"])"""