# Python for data analysis - O'Reilly

Testing / learning of the code in the book.

#### 1. Basic Jupyter notebook tips and tricks

In [1]:
%quickref


IPython -- An enhanced Interactive Python - Quick Reference Card

obj?, obj??      : Get help, or more help for object (also works as
                   ?obj, ??obj).
?foo.*abc*       : List names in 'foo' containing 'abc' in them.
%magic           : Information about IPython's 'magic' % functions.

Magic functions are prefixed by % or %%, and typically take their arguments
without parentheses, quotes or even commas for convenience.  Line magics take a
single % and cell magics are prefixed with two %%.

Example magic function calls:

%alias d ls -F   : 'd' is now an alias for 'ls -F'
alias d ls -F    : Works if 'alias' not a python name
alist = %alias   : Get list of aliases to 'alist'
cd /usr/share    : Obvious. cd -<tab> to choose from visited dirs.
%cd??            : See help AND source for magic %cd
%timeit x=10     : time the 'x=10' statement with high precision.
%%timeit x=2**100
x**100           : time 'x**100' with a setup of 'x=2**100'; setup code is not
                   co

In [2]:
b = []
b.append??

[1;31mSignature:[0m [0mb[0m[1;33m.[0m[0mappend[0m[1;33m([0m[0mobject[0m[1;33m,[0m [1;33m/[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m Append object to the end of the list.
[1;31mType:[0m      builtin_function_or_method

In [3]:
b.*?

b.__add__
b.__class__
b.__class_getitem__
b.__contains__
b.__delattr__
b.__delitem__
b.__dir__
b.__doc__
b.__eq__
b.__format__
b.__ge__
b.__getattribute__
b.__getitem__
b.__getstate__
b.__gt__
b.__hash__
b.__iadd__
b.__imul__
b.__init__
b.__init_subclass__
b.__iter__
b.__le__
b.__len__
b.__lt__
b.__mul__
b.__ne__
b.__new__
b.__reduce__
b.__reduce_ex__
b.__repr__
b.__reversed__
b.__rmul__
b.__setattr__
b.__setitem__
b.__sizeof__
b.__str__
b.__subclasshook__
b.append
b.clear
b.copy
b.count
b.extend
b.index
b.insert
b.pop
b.remove
b.reverse
b.sort

In [7]:
%run test.py

In [8]:
# now result (a variable from test.py) is available in the namespace
result

15

In [5]:
%run? # -i is a useful one. You can pass the notebook's namespace to the file

[1;31mDocstring:[0m
Run the named file inside IPython as a program.

Usage::

  %run [-n -i -e -G]
       [( -t [-N<N>] | -d [-b<N>] | -p [profile options] )]
       ( -m mod | filename ) [args]

The filename argument should be either a pure Python script (with
extension ``.py``), or a file with custom IPython syntax (such as
magics). If the latter, the file can be either a script with ``.ipy``
extension, or a Jupyter notebook with ``.ipynb`` extension. When running
a Jupyter notebook, the output from print statements and other
displayed objects will appear in the terminal (even matplotlib figures
will open, if a terminal-compliant backend is being used). Note that,
at the system command line, the ``jupyter run`` command offers similar
functionality for executing notebooks (albeit currently with some
differences in supported options).

Parameters after the filename are passed as command-line arguments to
the program (put in sys.argv). Then, control returns to IPython's
prompt.

This 

>%load test.py turns itself into a comment and loads the file in the cell

In [9]:
# %load test.py
def f(x,y,z):
    return x + y + z

a, b, c = 4,5,6

result = f(a,b,c)

In [10]:
%paste

UsageError: Line magic function `%cpaste` not found.


In [11]:
%pdb?

[1;31mDocstring:[0m
Control the automatic calling of the pdb interactive debugger.

Call as '%pdb on', '%pdb 1', '%pdb off' or '%pdb 0'. If called without
argument it works as a toggle.

When an exception is triggered, IPython can optionally call the
interactive pdb debugger after the traceback printout. %pdb toggles
this feature on and off.

The initial state of this feature is set in your configuration
file (the option is ``InteractiveShell.pdb``).

If you want to just activate the debugger AFTER an exception has fired,
without having to type '%pdb on' and rerunning your code, you can use
the %debug magic.
[1;31mFile:[0m      c:\users\johan\appdata\roaming\python\python312\site-packages\ipython\core\magics\execution.py

In [2]:
%pdb?

[1;31mDocstring:[0m
Control the automatic calling of the pdb interactive debugger.

Call as '%pdb on', '%pdb 1', '%pdb off' or '%pdb 0'. If called without
argument it works as a toggle.

When an exception is triggered, IPython can optionally call the
interactive pdb debugger after the traceback printout. %pdb toggles
this feature on and off.

The initial state of this feature is set in your configuration
file (the option is ``InteractiveShell.pdb``).

If you want to just activate the debugger AFTER an exception has fired,
without having to type '%pdb on' and rerunning your code, you can use
the %debug magic.
[1;31mFile:[0m      c:\users\johan\appdata\roaming\python\python312\site-packages\ipython\core\magics\execution.py

In [6]:
# show the variables defined in the namespace
%who

matplotlib	 


In [7]:
%hist

%pdb>
%pdb?
%who
%whos
%who_ls
# show the variables defined in the namespace
%who
%hist


In [None]:
%matplotlib inline

In [58]:
%xmode?

[1;31mDocstring:[0m
Switch modes for the exception handlers.

Valid modes: Plain, Context, Verbose, and Minimal.

If called without arguments, acts as a toggle.

When in verbose mode the value `--show` (and `--hide`)
will respectively show (or hide) frames with ``__tracebackhide__ =
True`` value set.
[1;31mFile:[0m      c:\users\johan\appdata\roaming\python\python312\site-packages\ipython\core\magics\basic.py

### 2. Advanced list operations

In [60]:
b = [1,2,3,4]
c = b.copy()
c

[1, 2, 3, 4]

In [61]:
c.insert(3, 'a')
c

[1, 2, 3, 'a', 4]

In [26]:
import bisect

bisect.insort(b, 3)
b

[1, 2, 3, 3, 3, 3, 4]

#### 2.1 dict operations

To avoid *having to check* whether a key exists before adding to it.

1. setdefault

In [35]:
from collections import defaultdict
words = ['hello', 'world']

In [29]:
by_letter = {}

for word in words:
    letter = word[0]
    by_letter.setdefault(letter, []).append(word)
    
by_letter

{'h': ['hello'], 'w': ['world']}

2. defaultdict

In [32]:
by_letter = defaultdict(list)   # instead of just and empty dict

for word in words:
    letter = word[0]
    by_letter['letter'].append(word)
    
by_letter

defaultdict(list, {})

### 3. set operations

In [36]:
a = {1,2,3,4}
b = {4,5,6,7}

a.union(b)  # a | b

{1, 2, 3, 4, 5, 6, 7}

In [45]:
a |= b  # set as the union (a.update(b))
a

{1, 2, 3, 4, 5, 6, 7}

In [37]:
a

{1, 2, 3, 4}

In [39]:
a & b   # a.intersection(b)

{4}

In [40]:
a ^ b

{1, 2, 3, 5, 6, 7}

In [46]:
a - b   # a.difference(b)     # elements in a that are not in b (or a - b)


{1, 2, 3}

In [42]:
a.issubset(b)   # a <= b

False

In [43]:
c = {4,5}
c.issubset(b)

True

In [48]:
b >= c  # b.issuperset(c)

True

### 4. itertools

`groupby` takes any sequence and a function

In [55]:
from itertools import groupby

# a function
letter = lambda x: x[0]

# a sequence
names = ['me', 'you', 'your guitar']

for l, n in groupby(names, letter):
    print(l, list(n))   # n is a generator

m ['me']
y ['you', 'your guitar']


itertools also has `combinations` and `permutations`, which are similar to SQL `CROSS JOIN`

### 5. Numpy

Also see [this (path)](</C:/\Users\johan\OneDrive\/_Msc Data Science\ML\Topic 1 - introduction\1.3 Introduction to NumPy.ipynb>) notebook from the ML module.

In [3]:
import numpy as np

In [65]:
data = np.array([1,2,3])

data.shape, data.dtype

((3,), dtype('int32'))

In [66]:
data * 2

array([2, 4, 6])

In [68]:
data2 = np.array([3,4,5])
data2

array([3, 4, 5])

In [70]:
data2 < data

array([False, False, False])

In [71]:
data + data2

array([4, 6, 8])

In [73]:
data_comb = np.append(data, data2)
data_comb

array([1, 2, 3, 3, 4, 5])

In [75]:
data_comb[1] = 8
data_comb

array([1, 8, 3, 3, 4, 5])

In [77]:
data3 = np.ones((2,3))
data3

array([[1., 1., 1.],
       [1., 1., 1.]])

In [79]:
data3[:,1]

array([1., 1.])

In [85]:
np.linspace(1,10,6)

array([ 1. ,  2.8,  4.6,  6.4,  8.2, 10. ])

In [103]:
data4 = np.linspace(0,10,6)

data5 = np.arange(5)

np.append(data4, data5)

array([ 0.,  2.,  4.,  6.,  8., 10.,  0.,  1.,  2.,  3.,  4.])

In [88]:
np.random.randint?

[1;31mDocstring:[0m
randint(low, high=None, size=None, dtype=int)

Return random integers from `low` (inclusive) to `high` (exclusive).

Return random integers from the "discrete uniform" distribution of
the specified dtype in the "half-open" interval [`low`, `high`). If
`high` is None (the default), then results are from [0, `low`).

.. note::
    New code should use the `~numpy.random.Generator.integers`
    method of a `~numpy.random.Generator` instance instead;
    please see the :ref:`random-quick-start`.

Parameters
----------
low : int or array-like of ints
    Lowest (signed) integers to be drawn from the distribution (unless
    ``high=None``, in which case this parameter is one above the
    *highest* such integer).
high : int or array-like of ints, optional
    If provided, one above the largest (signed) integer to be drawn
    from the distribution (see above for behavior if ``high=None``).
    If array-like, must contain integer values
size : int or tuple of ints, option

In [121]:
np.random.randint([(500, 2000, 10), (500, 2000, 10), (500, 2000, 10)])

array([[  26, 1238,    9],
       [ 384,  148,    9],
       [ 144,  651,    7]])

In [116]:
np.random.random((6,4))

array([[0.65244259, 0.20139562, 0.32477412, 0.2438529 ],
       [0.69962823, 0.09467057, 0.72148272, 0.4088294 ],
       [0.02970666, 0.96752921, 0.64924763, 0.20751615],
       [0.25852565, 0.66913919, 0.1595365 , 0.77841073],
       [0.02307273, 0.12778814, 0.02210501, 0.95007693],
       [0.45824661, 0.10936593, 0.33580529, 0.38414686]])

In [127]:
data6 = np.arange(5)
data7 = np.arange(5, 10)
assert data6.shape == data7.shape, 'oops'
np.concatenate((data6, data7))

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

`((,))` in `np.concatenate((a, b))` or get `TypeError: only integer scalar arrays can be converted to a scalar index`

In [131]:
condition = data6 == data7
~condition

array([ True,  True,  True,  True,  True])

In [132]:
data7[~condition]

array([5, 6, 7, 8, 9])

In [129]:
data6[(data6 != data7) & (data6 < data7)]   # similarly use | as `or`

array([0, 1, 2, 3, 4])

In [134]:
# useful: set all negative values to 0
data6[data6 < 0] = 0 
data6

array([0, 1, 2, 3, 4])

In [136]:
# create a new matrix
matrix = np.random.randn(5,3)
matrix

array([[-1.25740346, -1.4664107 ,  0.59698801],
       [ 0.05073633, -0.77984465, -0.2145418 ],
       [ 0.58265328, -1.27225461,  0.07018141],
       [ 0.20705036,  0.64431369, -1.39768209],
       [-1.2198101 ,  0.7407631 ,  1.76050149]])

In [137]:
# reshape into other dimensions
matrix.reshape((3,5))

array([[-1.25740346, -1.4664107 ,  0.59698801,  0.05073633, -0.77984465],
       [-0.2145418 ,  0.58265328, -1.27225461,  0.07018141,  0.20705036],
       [ 0.64431369, -1.39768209, -1.2198101 ,  0.7407631 ,  1.76050149]])

In [138]:
matrix.T

array([[-1.25740346,  0.05073633,  0.58265328,  0.20705036, -1.2198101 ],
       [-1.4664107 , -0.77984465, -1.27225461,  0.64431369,  0.7407631 ],
       [ 0.59698801, -0.2145418 ,  0.07018141, -1.39768209,  1.76050149]])

In [139]:
matrix ** 2

array([[1.58106347e+00, 2.15036035e+00, 3.56394685e-01],
       [2.57417566e-03, 6.08157678e-01, 4.60281860e-02],
       [3.39484841e-01, 1.61863180e+00, 4.92543065e-03],
       [4.28698523e-02, 4.15140125e-01, 1.95351521e+00],
       [1.48793668e+00, 5.48729974e-01, 3.09936551e+00]])

In [140]:
np.maximum(matrix)

TypeError: maximum() takes from 2 to 3 positional arguments but 1 were given

In [142]:
matrix

array([[-1.25740346, -1.4664107 ,  0.59698801],
       [ 0.05073633, -0.77984465, -0.2145418 ],
       [ 0.58265328, -1.27225461,  0.07018141],
       [ 0.20705036,  0.64431369, -1.39768209],
       [-1.2198101 ,  0.7407631 ,  1.76050149]])

In [148]:
np.where(matrix > 0, 'foo', 'bar')     # if then logic

array([['bar', 'bar', 'foo'],
       ['foo', 'bar', 'bar'],
       ['foo', 'bar', 'foo'],
       ['foo', 'foo', 'bar'],
       ['bar', 'foo', 'foo']], dtype='<U3')

In [155]:
foobar = np.where(matrix > 0, 'foo', 'bar')
np.unique(foobar)

array(['bar', 'foo'], dtype='<U3')

In [152]:
(matrix > 0).sum()  # number of positive values

8

In [151]:
np.abs(matrix).mean()

0.8174090066390911

In [153]:
np.argmax(matrix)   # index of max

14

In [4]:
# easiest way to create a matrix with unique numbers
np.arange(9).reshape((3,3))

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])