# mypy annotations with None 

This comes from https://mypy.readthedocs.io/en/stable/kinds_of_types.html#the-any-type

You should give a statically typed function an explicit None return type even if it doesn’t return a value, as this lets mypy catch additional type errors:

In [1]:
import time 

def wait(t: float):  # Implicit Any return value
    """Print a notice, then block for ``t`` seconds using ``time.sleep``.

    The return annotation is deliberately left off: this version shows that,
    without it, the return type is implicitly ``Any`` and a type checker does
    not flag code that uses the (actually ``None``) result.
    """
    print('Waiting...')
    time.sleep(t)

if wait(2) > 1:   # Mypy doesn't catch this error!
    ...

Waiting...


TypeError: '>' not supported between instances of 'NoneType' and 'int'

If we had used an explicit None return type, mypy would have caught the error:


In [2]:
def wait(t: float) -> None:
    """Print a notice, then block for ``t`` seconds using ``time.sleep``.

    The explicit ``-> None`` annotation is what allows a type checker to
    report code that tries to use the return value.
    """
    print('Waiting...')
    time.sleep(t)

if wait(2) > 1:   # Error: can't compare None and int
    ...

Waiting...


TypeError: '>' not supported between instances of 'NoneType' and 'int'

# Exceptions & inheritance 

In [3]:
class TimeoutError(Exception):
    """Minimal custom exception derived directly from ``Exception``.

    NOTE(review): this name shadows Python's built-in ``TimeoutError`` for the
    rest of the session; a distinct name would avoid confusion if this is
    reused outside the experiment.
    """
    pass

In [4]:
raise TimeoutError("Oh no I simply can't do that", "bla")

TimeoutError: ("Oh no I simply can't do that", 'bla')

# Series & DataFrame 

In [19]:
import pandas as pd 
import numpy as np 

In [20]:
s = pd.Series(np.random.rand(10), name="MySeries"); s

0    0.092783
1    0.740249
2    0.096381
3    0.682808
4    0.778577
5    0.113946
6    0.171664
7    0.131936
8    0.391649
9    0.916708
Name: MySeries, dtype: float64

In [21]:
df = pd.DataFrame(s); df.head()

Unnamed: 0,MySeries
0,0.092783
1,0.740249
2,0.096381
3,0.682808
4,0.778577


In [22]:
df.columns[0]

'MySeries'

In [23]:
list(df.columns)

['MySeries']

# Inheritance & Abstract Methods 

In [24]:
from abc import ABC, abstractmethod

def object_repr(object) -> str:
    """Return a short, dotted description of the type of ``object``.

    Parameters
    ----------
    object
        Any value, or ``None``.

    Returns
    -------
    str
        The literal string ``"None"`` when ``object`` is ``None``; otherwise
        ``"<module>.<class name>"`` of the value's class.
    """
    if object is not None:
        klass = object.__class__
        module_name, class_name = klass.__module__, klass.__name__
        return f"{module_name}.{class_name}"
    return "None"
    
class Explainer(ABC):
    """Abstract base for explainers that pair a model with an explainer object.

    Both ``__init__`` and ``fit`` are abstract, so a subclass has to override
    both before it can be instantiated. ``__repr__`` reads ``self.model`` and
    ``self.explainer``, so a subclass ``__init__`` that does not set them
    makes ``repr()`` raise ``AttributeError``.
    """

    @abstractmethod
    def __init__(self, model, explainer):
        """Store ``model`` and ``explainer``; concrete subclasses implement this."""

    @abstractmethod
    def fit(self, data) -> None:
        """Fit the explainer on ``data``; concrete subclasses implement this."""

    def __repr__(self) -> str:
        """Return the class name plus the types of the stored model and explainer."""
        model_part = object_repr(self.model)
        explainer_part = object_repr(self.explainer)
        # The trailing newline followed by eight spaces is part of the
        # established output and is kept on purpose.
        return f"{self.__class__.__name__} (model={model_part}, explainer={explainer_part})\n        "

## without implementing fit 

In [25]:
class DiceExplainer(Explainer):
    """Explainer subclass that stores its arguments but does not override ``fit``."""

    def __init__(self, model, explainer):
        # Keep both collaborators on the instance.
        self.model, self.explainer = model, explainer

In [26]:
hi = DiceExplainer(1, 2)

TypeError: Can't instantiate abstract class DiceExplainer with abstract method fit

## without implementing init correctly  

Instantiation itself succeeds; the error appears when the notebook displays the object, because `__repr__` reads `self.model`, which this `__init__` never sets.

In [27]:
class DiceExplainer(Explainer):
    """Explainer subclass whose ``__init__`` sets neither ``model`` nor ``explainer``.

    Instances can be created, but the inherited ``__repr__`` reads
    ``self.model`` and therefore raises ``AttributeError`` when an instance
    is displayed.
    """

    def __init__(self):
        # Only an unrelated attribute is set; ``model``/``explainer`` are
        # intentionally left undefined for this experiment.
        self.hi = 1

    def fit(self, data) -> None:
        """Do nothing; exists only to satisfy the abstract ``fit`` method.

        ``self`` is now declared explicitly. Without it, ``data`` was bound to
        the instance and ``inst.fit(x)`` raised ``TypeError``.
        """
        return None

In [28]:
DiceExplainer()

AttributeError: 'DiceExplainer' object has no attribute 'model'

## implementing fit 

In [53]:
class DiceExplainer(Explainer):
    """Concrete explainer that overrides both abstract methods.

    Parameters
    ----------
    model : optional
        Object to store as ``self.model`` (defaults to ``None``).
    explainer : optional
        Object to store as ``self.explainer`` (defaults to ``None``).
    """

    def __init__(self, model=None, explainer=None):
        self.model = model
        self.explainer = explainer

    def fit(self, data) -> None:
        """Do nothing; placeholder implementation of the abstract ``fit``.

        ``self`` is now declared explicitly so the signature matches the
        abstract method. Without it, ``data`` was bound to the instance and
        ``inst.fit(x)`` raised ``TypeError``.
        """
        return None
    

In [54]:
hi = DiceExplainer(1)

In [55]:
hi

DiceExplainer (model=builtins.int, explainer=None)
        

# Pathlib 

In [61]:
from pathlib import Path 

In [62]:
pth = Path("/my/path/to/heaven.py")

In [63]:
Path(pth)

PosixPath('/my/path/to/heaven.py')

# Indexing into pandas columns 

In [64]:
import pandas as pd, numpy as np 

In [65]:
df = pd.DataFrame(np.random.rand(10,10))

In [66]:
df.columns[0]

0

In [67]:
df.drop(columns=[1,4,5])

Unnamed: 0,0,2,3,6,7,8,9
0,0.791725,0.568045,0.925597,0.020218,0.83262,0.778157,0.870012
1,0.978618,0.461479,0.780529,0.143353,0.944669,0.521848,0.414662
2,0.264556,0.45615,0.568434,0.612096,0.616934,0.943748,0.68182
3,0.359508,0.697631,0.060225,0.210383,0.128926,0.315428,0.363711
4,0.570197,0.988374,0.102045,0.653108,0.253292,0.466311,0.244426
5,0.15897,0.65633,0.138183,0.820993,0.097101,0.837945,0.096098
6,0.976459,0.976761,0.604846,0.282807,0.120197,0.29614,0.118728
7,0.317983,0.064147,0.692472,0.523248,0.093941,0.575946,0.929296
8,0.318569,0.131798,0.716327,0.586513,0.020108,0.82894,0.004695
9,0.677817,0.735194,0.962189,0.592042,0.572252,0.223082,0.952749


# Can you reset `self` reference inside a class? 

In [68]:
class MyClass():
    """Toy class for testing whether a method can replace its instance by rebinding ``self``."""
    def __init__(self, x: int) -> None:
        self.x = x
    def __repr__(self) -> str:
        return str(self.x)
    def reset(self, object) -> None:
        # ``self`` is just a local name inside this method: assigning to it
        # rebinds that local only and leaves the caller's object untouched,
        # so this method has no observable effect.
        self = object 

In [69]:
c = MyClass(3)
c.reset(MyClass(4))

In [70]:
c

3

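Seems you can't naively reassign `self`: inside a method it is just a local name, so rebinding it does not change the object the caller holds. Python doesn't complain either.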

# Pandas indexes 

In [71]:
df = pd.DataFrame(np.random.rand(10,10), index = list(range(0,20,2)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.447125,0.846409,0.699479,0.297437,0.813798,0.396506,0.881103,0.581273,0.881735,0.692532
2,0.725254,0.501324,0.956084,0.64399,0.423855,0.606393,0.019193,0.301575,0.660174,0.290078
4,0.618015,0.428769,0.135474,0.298282,0.569965,0.590873,0.574325,0.653201,0.652103,0.431418
6,0.896547,0.367562,0.435865,0.891923,0.806194,0.703889,0.100227,0.919483,0.714241,0.998847
8,0.149448,0.868126,0.162493,0.61556,0.12382,0.848008,0.807319,0.569101,0.407183,0.069167
10,0.697429,0.453543,0.722056,0.866382,0.975522,0.855803,0.011714,0.359978,0.729991,0.17163
12,0.521037,0.054338,0.199997,0.018522,0.793698,0.223925,0.345352,0.928081,0.704414,0.031839
14,0.164694,0.621478,0.577229,0.237893,0.934214,0.613966,0.535633,0.58991,0.730122,0.311945
16,0.398221,0.209844,0.186193,0.944372,0.739551,0.490459,0.227415,0.254356,0.058029,0.434417
18,0.311796,0.696343,0.377752,0.179604,0.024679,0.06725,0.679393,0.453697,0.536579,0.896671


In [72]:
df.reset_index().sample(5).index

Index([1, 3, 7, 5, 4], dtype='int64')

# Pathlib methods 
what glorious methods are exposed? 

In [73]:
from pathlib import Path 
import os

In [74]:
path = Path("/home/mchristos/code")

In [75]:
files = list(path.glob("*.py")); files 

[]

In [76]:
os.remove(files[0])

IndexError: list index out of range

In [77]:
files = list(path.glob("*.py")); files 

[]

# What does `x[a::b]` do? 

In [78]:
unique = ["chris", "michael", "daniel", "chrisA", "john", "sizwe", "monwabisi"]
n_splits = 4
for i in range(n_splits):
    test_vals = unique[i :: n_splits]
    print(test_vals)

['chris', 'john']
['michael', 'sizwe']
['daniel', 'monwabisi']
['chrisA']


In [79]:
unique[3::4]

['chrisA']

In [80]:
import numpy as np 
class ColumnSplitter:
    """Split by the unique values in a column for cross validation"""

    def __init__(self, column: pd.Series, n_splits: int, shuffle: bool = False, random_state: int | None = None):
        """Split by unique values in a column for cross validation. Independent sets of the unique
        values will be generated for train & test sets.

        Parameters
        ----------
        column : pd.Series
            The column to use for splitting
        n_splits : int
            Number of train/test splits
        shuffle : bool
            Whether to shuffle the list of unique values before
            splitting
        random_state : int, optional
            Seed to use for shuffling if shuffle=True
        """
        if n_splits < 2:
            raise ValueError(f"n_splits must be >= 2, got {n_splits}")
        self.column = column
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def __iter__(self):
        """Iterate over train/test splits, returning the indices for each."""
        unique = self.column.unique()
        if self.shuffle:
            rng = np.random.default_rng(self.random_state)
            rng.shuffle(unique)
        for i in range(self.n_splits):
            test_vals = unique[i :: self.n_splits]
            test_mask = self.column.isin(test_vals)
            test_indices = self.column[test_mask].index
            train_indices = self.column[~test_mask].index
            yield train_indices, test_indices


# cloudpathlib / pathlib `Path` file endings 

In [81]:
# !pip install cloudpathlib 

In [82]:
from cloudpathlib import AnyPath 
from pathlib import Path

In [83]:
path = AnyPath("DiCE/docs/dice_ml.data_interfaces.html")

file name 

In [84]:
path.name

'dice_ml.data_interfaces.html'

file extension

In [85]:
path.suffix

'.html'

In [86]:
path = Path("DiCE/docs/dice_ml.data_interfaces.html")

In [87]:
path.suffix

'.html'

In [88]:
notadir = Path("removeme")

In [89]:
res = notadir.mkdir(exist_ok=True)
print(res)

None


Absolute paths 

In [90]:
notadir.absolute()

PosixPath('/home/mchristos/code/removeme')

parent directories 

In [91]:
notadir.parent.absolute()

PosixPath('/home/mchristos/code')

In [92]:
notadir.exists()

True

# Autoreload 

In [93]:
%reload_ext autoreload
%autoreload 2

# Generating datetimes 

In [94]:
import pandas as pd, numpy as np 

In [95]:
n_examples = 10
n_dates = 2

generator = np.random.default_rng()
start_timestamp = pd.Timestamp('2000-01-01 00:00:00').timestamp()
end_timestamp = pd.Timestamp('2020-12-31 23:59:59').timestamp()
random_timestamps = generator.integers(start_timestamp, end_timestamp, (n_examples, n_dates))
random_datetimes = pd.to_datetime(random_timestamps, unit='s')

In [96]:
list(random_datetimes.astype("str"))

[('2000-10-21 09:56:41', '2008-12-07 14:44:36'),
 ('2013-01-29 19:20:23', '2005-07-12 10:34:12'),
 ('2015-03-10 11:34:47', '2006-06-02 14:37:31'),
 ('2014-04-03 06:44:35', '2015-05-28 04:21:37'),
 ('2007-11-08 16:52:30', '2014-12-08 15:08:41'),
 ('2001-05-03 22:28:28', '2013-11-14 16:14:37'),
 ('2014-06-24 13:13:44', '2002-08-01 09:57:45'),
 ('2010-11-15 21:30:38', '2016-11-15 20:41:27'),
 ('2006-01-21 07:52:15', '2002-07-21 19:57:06'),
 ('2005-09-29 09:22:11', '2020-03-22 15:21:00')]

In [97]:
pd.DataFrame(random_datetimes)

Unnamed: 0,0,1
0,2000-10-21 09:56:41,2008-12-07 14:44:36
1,2013-01-29 19:20:23,2005-07-12 10:34:12
2,2015-03-10 11:34:47,2006-06-02 14:37:31
3,2014-04-03 06:44:35,2015-05-28 04:21:37
4,2007-11-08 16:52:30,2014-12-08 15:08:41
5,2001-05-03 22:28:28,2013-11-14 16:14:37
6,2014-06-24 13:13:44,2002-08-01 09:57:45
7,2010-11-15 21:30:38,2016-11-15 20:41:27
8,2006-01-21 07:52:15,2002-07-21 19:57:06
9,2005-09-29 09:22:11,2020-03-22 15:21:00


In [98]:
rand_timestamps = generator.integers(
    pd.Timestamp("2000-01-01 00:00:00").timestamp(),
    pd.Timestamp("2020-12-31 23:59:59").timestamp(),
    (n_examples, n_dates),
)
dt_features = pd.DataFrame(
    pd.to_datetime(rand_timestamps, unit="s"),
    columns=[f"date_{i}" for i in range(n_dates)],
)
dt_features.shape

(10, 2)

# Generating ids (permutations) 

In [99]:
import numpy as np 

In [100]:
generator = np.random.default_rng()
np.array([generator.permutation(10) for _ in range(3)])

array([[6, 1, 9, 3, 0, 2, 8, 4, 5, 7],
       [6, 8, 7, 5, 0, 1, 4, 3, 2, 9],
       [6, 7, 9, 3, 0, 4, 5, 8, 2, 1]])

In [101]:
n_ids=5
n_examples=10
id_features =  pd.DataFrame(
    np.array([generator.permutation(n_examples) for _ in range(n_ids)]).T,
    columns=[f"id_{i}" for i in range(n_ids)],
)
id_features

Unnamed: 0,id_0,id_1,id_2,id_3,id_4
0,5,5,2,2,2
1,1,4,0,1,1
2,9,8,7,6,4
3,3,3,9,5,7
4,4,2,1,0,9
5,0,7,4,4,6
6,7,0,6,3,5
7,6,6,3,9,8
8,2,1,8,8,3
9,8,9,5,7,0


# pandas masks 😷

In [102]:
import pandas as pd 
import numpy as np 

generator = np.random.default_rng()
nrows =10
ncols = 3
df = pd.DataFrame(np.array([generator.permutation(nrows) for _ in range(ncols)])).T
df

Unnamed: 0,0,1,2
0,7,0,3
1,9,2,4
2,5,5,6
3,6,3,9
4,2,9,5
5,3,7,8
6,4,1,7
7,1,4,1
8,8,6,0
9,0,8,2


In [103]:
df[1]

0    0
1    2
2    5
3    3
4    9
5    7
6    1
7    4
8    6
9    8
Name: 1, dtype: int64

In [104]:
# Start from an all-True mask with one entry per row of ``df``.
mask = [True] * len(df)
# NOTE(review): ``.all()`` collapses the null check to a single boolean, so
# this keeps either every row or none. ``&=`` between a Python list and that
# scalar appears to rely on NumPy broadcasting (the displayed result is an
# ndarray). Presumably a per-row mask (``df[1].notnull()``) was intended — confirm.
mask &= df[1].notnull().all()

In [105]:
mask

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [106]:
df[mask]

Unnamed: 0,0,1,2
0,7,0,3
1,9,2,4
2,5,5,6
3,6,3,9
4,2,9,5
5,3,7,8
6,4,1,7
7,1,4,1
8,8,6,0
9,0,8,2


# Inspect signature 

In [9]:
import inspect 
from pathlib import Path
from cloudpathlib import AnyPath

In [4]:
def myfunc(a: int, b: str):
    """Print ``a`` and ``b`` on one line, separated by a colon and a space."""
    line = f"{a}: {b}"
    print(line)

In [5]:
inspect.signature??

Signature:
inspect.signature(
    obj,
    *,
    follow_wrapped=True,
    globals=None,
    locals=None,
    eval_str=False,
)
Source:   
def signature(obj, *, follow_wrapped=True, globals=None, locals=None, eval_str=False):
    """Get a signature object for the passe

In [6]:
inspect.signature(myfunc)

<Signature (a: int, b: str)>

In [10]:
inspect.signature(AnyPath("/home/mchristos"))

TypeError: PosixPath('/home/mchristos') is not a callable object

# pandas sampling and shuffling 

In [15]:
import pandas as pd 

pd.Series([1,2,3]).sample(5)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [16]:
uris = ["b;a", "bla", "hi"]
max_images=2
if max_images and max_images < len(uris):
    uris = list(pd.Series(uris).sample(max_images, random_state=0))

In [17]:
uris

['hi', 'bla']

In [33]:
uris = ["b;a", "bla", "hi", "gs://", "myuri"]
list(pd.Series(uris).sample(len(uris), random_state=0))[:3]

['hi', 'b;a', 'bla']

In [39]:
import random 
random.shuffle(uris)
uris

['b;a', 'gs://', 'myuri', 'hi', 'bla']

# streaming into a csv file 

In [42]:
import pandas as pd
# Append each record to output.csv as it arrives, one single-row frame per
# write, without a header line or an index column.
for data in (["Row1", 1], ["Row2", 2], ["Row3", 3]):
    single_row = pd.DataFrame([data])
    single_row.to_csv('output.csv', mode='a', header=False, index=False)

# pydantic json serialization 

In [43]:
# The original line was a truncated, syntactically invalid double import.
# Importing the submodule also binds the top-level ``pydantic`` name.
import pydantic.json

In [44]:
pydantic.json.ENCODERS_BY_TYPE

AttributeError: module 'pydantic.json' has no attribute 'ENCODERS_BY_TYPE'

# reversing lists

In [1]:
x = [1,2,3]

In [8]:
list(reversed(x))

[3, 2, 1]

In [9]:
x

[1, 2, 3]

# os.environ 

In [10]:
import os 
os.environ.get??

Signature: os.environ.get(key, default=None)
Source:   
    def get(self, key, default=None):
        'D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None.'
        try:
            return self[key]
        except KeyError:
            return default
File:      ~/.asdf/installs/python/3.10.12/lib/python3.10/_collections_abc.py
Type:      method

In [12]:
myvar = os.environ.get("MYVAR") or "hello!"
myvar

'hello!'

# checking if urls exist 

In [13]:
import requests

def is_url_image(image_url, timeout=10):
    """Report whether ``image_url`` answers a HEAD request with an image content type.

    Parameters
    ----------
    image_url : str
        URL to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so an
        unresponsive host cannot block the call indefinitely.

    Returns
    -------
    bool
        ``True`` if the response's media type is PNG or JPEG, else ``False``
        (including when the server sends no ``Content-Type`` header).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged when the request itself fails.
    """
    image_formats = ("image/png", "image/jpeg", "image/jpg")
    r = requests.head(image_url, timeout=timeout)
    # The header may be absent, or may carry parameters such as
    # "image/jpeg; charset=binary"; compare only the bare media type.
    content_type = r.headers.get("content-type", "")
    media_type = content_type.split(";", 1)[0].strip().lower()
    return media_type in image_formats

In [18]:
im_url = "https://www.princeton.edu/sites/default/files/styles/1x_full_2x_half_crop/public/images/2022/02/KOA_Nassau_2697x1517.jpg?itok=Bg2K7j7J"

In [19]:
is_url_image(im_url)

True

In [15]:
is_url_image("https://www.princeton.edu/fake.png")

False

In [16]:
requests.head("https://www.princeton.edu/fake.png")

<Response [404]>

In [17]:
requests.head??

Signature: requests.head(url, **kwargs)
Source:   
def head(url, **kwargs):
    r"""Sends a HEAD request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes. If
        `allow_redirects` is not provided, it will be set to `False` (as
        opposed to the default :meth:`request` behavior).
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    kwargs.setdefault("allow_redirects", False)

In [22]:
from cloudpathlib import AnyPath

In [24]:
p = AnyPath(im_url)

In [25]:
p

PosixPath('https:/www.princeton.edu/sites/default/files/styles/1x_full_2x_half_crop/public/images/2022/02/KOA_Nassau_2697x1517.jpg?itok=Bg2K7j7J')

In [26]:
p.exists()

False

In [None]:
# NOTE(review): ``p`` is a PosixPath here, and path objects have no
# ``startwith``/``startswith`` method. Presumably a string prefix check such
# as ``str(p).startswith(...)`` was intended — confirm.
p.startwith()

In [6]:
# De-duplicate while preserving first-seen order: dict keys are unique and
# keep insertion order.
lst = list(dict.fromkeys(['a', 'b', 'c', 'a', 'c']))
lst

['a', 'b', 'c']