#### TestQ

In [91]:
#!/bin/python3

import math
import os
import random
import re
import sys

import pandas as pd

def calcMissing(readings):
    """
    Compute Mercury Levels for missing rows in input data.
    """
    # Read data
    data, index, missing_idx = [], [], []
    for i, r in enumerate(readings):
        seq = r.split(" ")
        date, time, level = seq[0], seq[1], seq[-1]
        date = " ".join([date, time])

        #seq = r.split("\t")
        #date, level = seq

        index.append(date)
        # Remember indices of missing values
        if level.startswith("Missing"):
            missing_idx.append(i)
            level = pd.NA
        else:
            level = float(level)

        data.append(level)

    # Set-up pandas dataframe
    df = pd.DataFrame(data, index=index, columns=["level"])
    df["level"] = pd.to_numeric(df["level"], errors='coerce')
    df.index = pd.to_datetime(df.index)

    # Fill NA simplistic via backfill: Passes 7/10 cases
    #df = df.fillna(value=None, method='backfill', axis=None, limit=None, downcast=None)
    # Fill NA via interpolation: 
    df = df.interpolate(method='time', limit_direction="both")

    # Return
    out = "\n".join([str(df.iloc[x].item()) for x in missing_idx])

    print(out)

    return df

In [92]:
DATA = """
1/3/2012 16:00:00   Missing_1
1/4/2012 16:00:00   27.47
1/5/2012 16:00:00   27.728
1/6/2012 16:00:00   28.19
1/9/2012 16:00:00   28.1
1/10/2012 16:00:00  28.15
12/13/2012 16:00:00 27.52
12/14/2012 16:00:00 Missing_19
12/17/2012 16:00:00 27.215
12/18/2012 16:00:00 27.63
12/19/2012 16:00:00 27.73
12/20/2012 16:00:00 Missing_20
12/21/2012 16:00:00 27.49
12/24/2012 13:00:00 27.25
12/26/2012 16:00:00 27.2
12/27/2012 16:00:00 27.09
12/28/2012 16:00:00 26.9
12/31/2012 16:00:00 26.77
"""

In [93]:
readings = (DATA.split("\n")[1:-1])
df = calcMissing(readings)

27.47
27.44375
27.61


In [94]:
df

Unnamed: 0,level
2012-01-03 16:00:00,27.47
2012-01-04 16:00:00,27.47
2012-01-05 16:00:00,27.73
2012-01-06 16:00:00,28.19
2012-01-09 16:00:00,28.1
2012-01-10 16:00:00,28.15
2012-12-13 16:00:00,27.52
2012-12-14 16:00:00,27.44
2012-12-17 16:00:00,27.21
2012-12-18 16:00:00,27.63


#### QB Q1

In [98]:
#!/bin/python3

import math
import os
import random
import re
import sys


# Complete the 'efficientJanitor' function below.
#
# The function is expected to return an INTEGER.
# The function accepts FLOAT_ARRAY weight as parameter.
#

def efficientJanitor(weight):
    """
    Find minimum num of ways to combine each weight to sum up to 3.

    Args:
      weight: List of weights of bags
    
    Returns:
      int: Number of trips required to bring bags outside

    Example:
      Input:
      n = 5
      weight = [1.01, 1.99, 2.5, 1.5, 1.01]
      
      Output:
      3 (since [1.01 + 1.99, 2.5, 1.5+1.01])
    """
    count = 0
    i, j = 0, len(weight) - 1
    weight.sort()
    # Iterate through weights
    while i <= j:
        count += 1
        if weight[i] + weight[j] <= 3:
            i += 1
        j -= 1
    return count

if __name__ == '__main__':
    print(efficientJanitor([1.01, 1.99, 2.5, 1.5, 1.01]))

3


#### QB Q2

In [14]:
#
# Complete the 'encryptionValidity' function below.
#
# The function is expected to return an INTEGER_ARRAY.
# The function accepts following parameters:
#  1. INTEGER instructionCount
#  2. INTEGER validityPeriod
#  3. INTEGER_ARRAY keys
#

EXP = 10**5

def encryptionValidity(instructionCount, validityPeriod, keys):
    """
    Check if hijacker can crack code in validity period.

    Args:
      instuctionCount: number of keys that can be tried per s
      validityPeriod: how long code is valid
      keys: list of keys hijacker can test

    Returns:
      array consisting of:
        can_solve: if code will be cracked by hijacker
        strength: number of keys required to test to break encryption
    """
    # Compute encryption strength
    # Find num with most divisors
    max_div = 1
    for k in keys:
        num_div = 0
        for div in keys:
            if k % div == 0:
                num_div += 1
        if num_div > max_div:
            max_div = num_div
  
    strength = max_div * EXP
    
    # Check if solvable
    can_test = instructionCount * validityPeriod
    can_solve = int(can_test > strength)

    return [can_solve, strength]

In [16]:
encryptionValidity(1000, 10000, [2, 4, 8, 2])

[1, 400000]

#### QB Q3

In [94]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.9-py2.py3-none-any.whl (12 kB)
Collecting scipy==1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 1.5 MB/s 
[?25hCollecting scikit-learn==0.23.1
  Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 31.8 MB/s 
[?25hCollecting tqdm==4.56.0
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.1 MB/s 
Collecting xgboost==1.1.1
  Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
[K     |████████████████████████████████| 127.6 MB 22 kB/s 
[?25hCollecting lightgbm==2.3.1
  Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 53.5 MB/s 
[?25hCollecting PyYAML==5.3.1
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
[K     |████████████████████████████████|

In [None]:
# dummy inputs (one day)
data=[10.0,11.1,12.3,13.2,14.8,15.6,16.7,17.5,18.9,19.7,20.7,21.1,22.6,23.5,24.9,25.1,26.3,27.8,28.8,29.6,30.2,31.6,32.1,33.7,]
startDate = '2013-01-01'
endDate = '2013-01-01'
p = 1
n = 1

In [80]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDRegressor

from sklearn.kernel_ridge import KernelRidge

# TODO:
# Choose better model by testing on more data
# Improve feature encoding & possibly add additional features like temperature difference

def train_model_cv(X, y, split=.67):
    """
    Find the best model with simple cross validation of just 1 fold.
    """
    X_train, X_test= np.split(X, [int(split *len(data))])
    y_train, y_test= np.split(y, [int(split *len(data))])
    reg = LazyRegressor(predictions=True)
    models, predictions = reg.fit(X_train, X_test, y_train, y_test)
    return models

def prepare_df(data):
    """Prepares time series DF for model input"""
    df = pd.DataFrame(data)
    # Prepare features for model input
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    return df


def predictTemperature(startDate, endDate, temperature, n, debug=False):
    """
    Forecast temperature for next n days based on historic temperature.

    Args:
      startDate: first day of temp data, yyyy-mm-dd
      endDate: last day of temp data, yyyy-mm-dd
      temperature: hourly float temp data points for date range
      n: number of days of temperature to predict in the future

    Returns:
      temp: hourly float temp data for n future days, array of shape: [24 x n]

    """
    num_days = int(len(temperature)/24)

    start_date = pd.to_datetime(startDate)
    end_date = pd.to_datetime(endDate) + pd.offsets.Day()
    # Generate date range and cut off the last hour
    date_range = pd.date_range(start_date, end_date, freq='H')[:-1]

    data = {"date": date_range, "temp": temperature}
    df = prepare_df(data)
    
    features = ["month", "hour"]

    X = df[features].values
    y = df["temp"].values

    # TODO: Use sinusoidal encoding for month & hour
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(X)
    X = enc.transform(X).toarray()

    if debug:
      model_df = train_model_cv(X, y)
      return model_df

    model = KernelRidge()
    model.fit(X, y)

    # Predict on future data
    future_hours = n * 24
    fin_date = df.iloc[-1].date + pd.DateOffset(hours=future_hours)
    future_data = {"date": pd.date_range(df.iloc[-1].date, fin_date, freq='H')[:-1]}

    df = prepare_df(future_data)
    X = df[features].values
    X = enc.transform(X).toarray()

    return model.predict(X).tolist()

In [81]:
df = predictTemperature(startDate, endDate, data, n, debug=True)

100%|██████████| 42/42 [00:01<00:00, 26.76it/s]


In [82]:
df

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KernelRidge,70.03,-176.49,30.1,0.01
GaussianProcessRegressor,69.87,-176.09,30.06,0.01
MLPRegressor,46.64,-116.36,24.47,0.12
KNeighborsRegressor,25.35,-61.61,17.88,0.02
LinearSVR,18.88,-44.99,15.32,0.01
PoissonRegressor,13.52,-31.2,12.82,0.01
GammaRegressor,13.35,-30.76,12.73,0.01
SGDRegressor,12.75,-29.2,12.42,0.01
NuSVR,12.52,-28.63,12.3,0.01
PassiveAggressiveRegressor,12.44,-28.42,12.25,0.01


In [86]:
# necessary import libraries
import numpy as np
import datetime,time
from sklearn.linear_model import LinearRegression

# dummy inputs (one day)
data=[10.0,11.1,12.3,13.2,14.8,15.6,16.7,17.5,18.9,19.7,20.7,21.1,22.6,23.5,24.9,25.1,26.3,27.8,28.8,29.6,30.2,31.6,32.1,33.7,]
startDate = '2013-01-01'
endDate = '2013-01-01'
p = 1
n = 1

# utility function
def predictTemperature(startDate, endDate, temperature, n):

    p = int(len(temperature)/24)
    x = []
    for i in range(1,((24*p)+1)):
        x.append(i)
    y = temperature
    lm = LinearRegression()
    lm.fit(np.asarray(x).reshape(-1,1),y)

    print(x)
    
    f = x[-1]+1
    z = []
    for i in range(24*n):
        z.append(f)
        f += 1
    return(lm.predict(np.asarray(z).reshape(-1,1)).tolist())

In [87]:
print(predictTemperature(startDate, endDate, data, n))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
[34.57862318840579, 35.58557971014493, 36.592536231884054, 37.59949275362318, 38.60644927536231, 39.61340579710144, 40.62036231884058, 41.627318840579704, 42.63427536231883, 43.641231884057966, 44.64818840579709, 45.65514492753623, 46.662101449275355, 47.66905797101449, 48.676014492753616, 49.68297101449275, 50.68992753623188, 51.696884057971005, 52.70384057971014, 53.710797101449266, 54.7177536231884, 55.72471014492753, 56.73166666666666, 57.73862318840579]
