In [2]:
import random

def randomize_in_place(A,n):
    for i in range(n):
        rand_int = random.randint(i,n-1)
        A[i], A[rand_int] = A[rand_int], A[i]
    return A

def random_sample_in_place(n,k):
    A = [i for i in range(n)]
    n = len(A)
    randomize_in_place(A,n)
    return A[:k]

random_sample_in_place(n=100,k=10)

[85, 66, 59, 97, 38, 46, 3, 27, 26, 58]

In [5]:
import numpy as np

def random_sample_rec(n, k):  # from Cormen et al.
    if k==0:
        return set()
    else:
        S = random_sample_rec(n-1, k-1)
        i = np.random.randint(1,n+1)
        if i in S:
            S = S.union([n])
        if i not in S:
            S = S.union([i])
    return S

random_sample_rec(100,10)

{7, 9, 25, 44, 46, 63, 74, 85, 95, 96}

In [6]:
def random_sample_optimal(n,k):
    S = {}
    j = 1
    while j <= k:
        i = np.random.randint(1,n+1)
        if i not in S:
            S[i] = i
            j += 1
    return list(S)

random_sample_optimal(100,10)

[46, 61, 50, 47, 88, 30, 31, 74, 81, 90]

In [7]:
import timeit

%timeit random_sample_rec(100000,1000)
%timeit random_sample_optimal(100000,1000)
%timeit random_sample_in_place(100000,1000)

10.3 ms ± 404 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.4 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
89.8 ms ± 2.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### (a) Write a code that restricts this table to a ~10% random sample of the customers (with all their calls). Build the code as fast as possible, and do not use sample() (or similar) functions.

In [166]:
import numpy as np

def random_sample(filename,percentage):
    with open(filename,"r") as file:
        data = file.read().splitlines()
    n = len(data)
    m = int(n * percentage)
    S = {}
    j = 0
    while j < m:
        i = np.random.randint(0,n)
        if i not in S:
            S[i] = data[i]
            j += 1
    return S

myDict = random_sample("TABLE_A.csv",0.1)
myDict

{1013371: '+6554660811,11/14/21 12:00 AM,1,19,0.19',
 758939: '+6545737971,4/9/12 12:00 AM,0,7,0.07',
 638672: '+6531734644,9/2/12 12:00 AM,1,17,0.17',
 500984: '+6575287790,8/3/13 12:00 AM,0,59,0.59',
 365355: '+6560877577,10/19/18 12:00 AM,1,134,1.34',
 547695: '+6516164308,5/11/11 12:00 AM,1,41,0.41',
 540926: '+6577425493,5/11/22 12:00 AM,0,72,0.72',
 431994: '+6583933896,11/28/19 12:00 AM,0,127,1.27',
 536671: '+6537562702,6/28/17 12:00 AM,0,147,1.47',
 981143: '+6558374175,4/9/19 12:00 AM,0,146,1.46',
 774873: '+6516333351,11/26/16 12:00 AM,0,70,0.7',
 413575: '+6561938097,9/15/11 12:00 AM,1,60,0.6',
 821363: '+6538975868,12/1/14 12:00 AM,0,123,1.23',
 845202: '+6569371355,9/27/15 12:00 AM,0,74,0.74',
 393171: '+6595020341,4/8/12 12:00 AM,1,70,0.7',
 369724: '+6541026081,1/10/12 12:00 AM,1,117,1.17',
 722951: '+6516932427,8/5/16 12:00 AM,1,71,0.71',
 677003: '+6512398105,6/28/21 12:00 AM,0,143,1.43',
 846712: '+6570276651,10/16/17 12:00 AM,0,99,0.99',
 649276: '+6591309196,10/31/

### (b) Write a code to calculate the median duration for every day. Don't use median() as a function. 

In [1451]:
def random_partition(A,p,r):
    rand_num = np.random.randint(p,r+1)
    x = A[rand_num]
    A[r], A[rand_num] = A[rand_num], A[r]
    i = p-1
    for j in range(p,r):
        if A[j] <= x:
            i += 1
            A[j], A[i] = A[i], A[j]
    A[i+1], A[r] = A[r], A[i+1]
    return i+1

def random_select(A,p,r,k):
    if p == r:
        return A[p]
    q = random_partition(A,p,r)
    if k == q:
        return A[k]
    elif k < q:
        return random_select(A,p,q-1,k)
    else:
        return random_select(A,q+1,r,k)

In [1457]:
from datetime import datetime

def call_duration_median(myDict):
    dur = {}
    for e in (myDict.values()):
        x = e.split(",")
        date = datetime.strptime(x[1],'%m/%d/%y %I:%M %p').date()
        # date = datetime.strptime(x[1],'%m-%d-%y %H:%M:%S').date() #exam format
        if date in dur:
            dur[date] += [int(x[3])]
        else:
            dur[date] = [int(x[3])]
    for i,val in dur.items():
        dur[i] = random_select(val,0,len(val)-1,len(val)//2)
    return dur

call_duration_median(myDict)

{datetime.date(2021, 11, 14): 74,
 datetime.date(2012, 4, 9): 76,
 datetime.date(2012, 9, 2): 98,
 datetime.date(2013, 8, 3): 76,
 datetime.date(2018, 10, 19): 104,
 datetime.date(2011, 5, 11): 90,
 datetime.date(2022, 5, 11): 82,
 datetime.date(2019, 11, 28): 94,
 datetime.date(2017, 6, 28): 67,
 datetime.date(2019, 4, 9): 100,
 datetime.date(2016, 11, 26): 78,
 datetime.date(2011, 9, 15): 67,
 datetime.date(2014, 12, 1): 93,
 datetime.date(2015, 9, 27): 62,
 datetime.date(2012, 4, 8): 78,
 datetime.date(2012, 1, 10): 71,
 datetime.date(2016, 8, 5): 71,
 datetime.date(2021, 6, 28): 122,
 datetime.date(2017, 10, 16): 47,
 datetime.date(2011, 10, 31): 77,
 datetime.date(2012, 9, 4): 56,
 datetime.date(2014, 9, 13): 76,
 datetime.date(2020, 3, 17): 78,
 datetime.date(2011, 12, 26): 54,
 datetime.date(2017, 11, 25): 86,
 datetime.date(2022, 1, 31): 89,
 datetime.date(2016, 1, 22): 82,
 datetime.date(2018, 9, 21): 79,
 datetime.date(2014, 4, 14): 68,
 datetime.date(2011, 8, 7): 78,
 dateti