In [1]:
# The following is to know when this notebook has been run and with which python version.
import time, sys
print(time.ctime())
print(sys.version.split('|')[0])

Mon Mar 24 17:49:03 2025
3.12.9 


In [14]:
import numpy as np
import scipy 

# F Some useful libraries that make life easier

This is part of the Python lecture given by Christophe Morisset at IA-UNAM.

This lecture will give some insights to the most useful python libraries. It is NOT exhaustive, you have to read the corresponding manual pages to find the best use you can have of them. The list of all python-included libraries is here: https://docs.python.org/3/library/

### time, datetime and timeit

*  https://docs.python.org/3/library/time.html
*  https://docs.python.org/3/library/datetime.html

In [3]:
import time
import datetime

In [4]:
print(time.ctime()) # current time, in a string format

Mon Mar 24 17:49:05 2025


In [5]:
lt = time.localtime()
print(lt)

time.struct_time(tm_year=2025, tm_mon=3, tm_mday=24, tm_hour=17, tm_min=49, tm_sec=6, tm_wday=0, tm_yday=83, tm_isdst=0)


In [6]:
time.strftime("%a, %d %b %Y %H:%M:%S",lt)

'Mon, 24 Mar 2025 17:49:06'

In [7]:
print(datetime.datetime.today())

2025-03-24 17:49:11.377235


In [8]:
today = datetime.date.today()
eclipse_total = datetime.date(2024, 4, 8)

In [9]:
time_to_eclipse = eclipse_total - today

In [10]:
if eclipse_total > today:
    print('Eclipse in {} days.'.format(time_to_eclipse.days))
else:
    print('Eclipse {} days ago.'.format(-time_to_eclipse.days))

Eclipse 350 days ago.


In [17]:
start = time.time()
for i in np.arange(1000):
    t = scipy.special.factorial(i)
end = time.time()
print('1000 factorials done in {0:.3f} secs.'.format(end-start))

1000 factorials done in 2.750 secs.


In [21]:
from timeit import Timer
command = """\
for i in range(1000):
    t = scipy.special.factorial(i)
"""
t = Timer(command, setup='import scipy')
print(t.timeit(number=10))

0.38237258698791265


in ipython, one can use the magic timeit function:

In [22]:
%timeit scipy.special.factorial(50)

15.2 μs ± 488 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


or for the whole cell:

In [23]:
%%timeit
for i in np.arange(1000):
    t =  scipy.special.factorial(i)

12.3 ms ± 67.8 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### os

https://docs.python.org/3/library/os.html 
This module provides a portable way of using operating system dependent functionality. If you just want to read or write a file see open(), if you want to manipulate paths, see the os.path module, and if you want to read all the lines in all the files on the command line see the fileinput module. For creating temporary files and directories see the tempfile module, and for high-level file and directory handling see the shutil module.

In [24]:
import os

In [25]:
# os.environ is a dictionnary containing the environment variables and values
if 'HOME' in os.environ:
    print(os.environ['HOME'])
else:
    print('This OS is too limited to understand HOME')

/Users/christophemorisset


In [26]:
# if variable doesn't exists, getenv returns None
print(os.getenv('HOMMMMMME'))

None


In [27]:
# get current working directory
cwd = os.getcwd()
print(cwd)
print(cwd.split('/')[-1])

/Users/christophemorisset/Google Drive/Pro/Python-MySQL/Python-lectures-Notebooks/Notebooks
Notebooks


In [28]:
new_dir = '/tmp/test3457'
if not os.path.exists(new_dir):
    os.mkdir(new_dir)
    print('Dir {} created'.format(new_dir))
else:
    print('Dir {} exists'.format(new_dir))

Dir /tmp/test3457 created


In [29]:
# list all the files from a directory
os.listdir(new_dir)

[]

In [30]:
os.chdir(new_dir)
os.getcwd()

'/private/tmp/test3457'

In [35]:
# Renaming files in a directory
os.chdir('/tmp')
open('a.txt', 'a').close()
open('b.txt', 'a').close()
open('c.text', 'a').close()
for filename in os.listdir('./'):
    #print(filename)
    base_file, ext = os.path.splitext(filename)
    #print(base_file, ext)
    if ext == '.txt':
        newname = base_file + '.exe'
        try:
            os.rename(filename, newname)
            print('file {0} renamed to {1}'.format(filename, newname))
        except:
            print('file {0} NOT renamed'.format(filename))
print('-'*20)
!ls
os.chdir(cwd)

file b.txt renamed to b.exe
file a.txt renamed to a.exe
--------------------
[32mAcrobat-684e4b52b2b80a7b410a0f9d67cc3b9b[m[m
[34mFirefox-Cache[m[m
MozillaUpdateLock-2656FF1E876E9973
UpdateLock-712DAF5A23ABECDC
a.exe
b.exe
c.text
[34mcom.adobe.Acrobat.tmp.sbx[m[m
[34mcom.apple.launchd.SHwYMhf4LH[m[m
[32mdrivefs_ipc.501[m[m
[32mdrivefs_ipc.501_shell[m[m
[34mpowerlog[m[m
[34mtest3457[m[m


### glob

https://docs.python.org/3/library/glob.html

No tilde expansion is done, but *, ?, and character ranges expressed with [] will be correctly matched.

In [36]:
from glob import glob

In [37]:
os.chdir(cwd)

In [38]:
cwd

'/Users/christophemorisset/Google Drive/Pro/Python-MySQL/Python-lectures-Notebooks/Notebooks'

In [39]:
ipynb_files = glob('*.ipynb')
print(ipynb_files)

['Calling Fortran.ipynb', 'Galaxies_classification.ipynb', 'Test-Install.ipynb', 'Using_SQLalchemy.ipynb', 'intro_Scipy.ipynb', 'ANN.ipynb', 'Ex1_with_res.ipynb', 'Parallel.ipynb', 'OOP.ipynb', 'Using_astropy.ipynb', 'Tarea_1.ipynb', 'Interact with files.ipynb', 'Useful_libraries.ipynb', 'intro_numpy.ipynb', 'intro_Python.ipynb', 'Redshifts_ai4neb.ipynb', 'Ex1_done.ipynb', 'Ex1.ipynb', 'intro_Matplotlib.ipynb', 'Optimization.ipynb', 'Evaluacion.ipynb', 'Redshifts.ipynb', 'Using_astroquery.ipynb']


glob is also included in pathlib, better use it from there:

### pathlib

In [42]:
from pathlib import Path

In [43]:
new_dir = Path('/tmp/test1236')
print(new_dir)
print(new_dir.name)
print(new_dir.parent)

/tmp/test1236
test1236
/tmp


In [44]:
new_dir = Path('/tmp') / Path('test1236')
print(new_dir)
print(new_dir.name)
print(new_dir.parent)

/tmp/test1236
test1236
/tmp


In [45]:
if not new_dir.exists():
    new_dir.mkdir() # may use exist_ok=True
    print('Dir {} created'.format(new_dir))
else:
    print('Dir {} exists'.format(new_dir))

Dir /tmp/test1236 created


In [46]:
current = Path('.')
for f in current.glob('*.ipynb'):
    print(f"{f} is {f.stem} + {f.suffix}")

Calling Fortran.ipynb is Calling Fortran + .ipynb
Galaxies_classification.ipynb is Galaxies_classification + .ipynb
Test-Install.ipynb is Test-Install + .ipynb
Using_SQLalchemy.ipynb is Using_SQLalchemy + .ipynb
intro_Scipy.ipynb is intro_Scipy + .ipynb
ANN.ipynb is ANN + .ipynb
Ex1_with_res.ipynb is Ex1_with_res + .ipynb
Parallel.ipynb is Parallel + .ipynb
OOP.ipynb is OOP + .ipynb
Using_astropy.ipynb is Using_astropy + .ipynb
Tarea_1.ipynb is Tarea_1 + .ipynb
Interact with files.ipynb is Interact with files + .ipynb
Useful_libraries.ipynb is Useful_libraries + .ipynb
intro_numpy.ipynb is intro_numpy + .ipynb
intro_Python.ipynb is intro_Python + .ipynb
Redshifts_ai4neb.ipynb is Redshifts_ai4neb + .ipynb
Ex1_done.ipynb is Ex1_done + .ipynb
Ex1.ipynb is Ex1 + .ipynb
intro_Matplotlib.ipynb is intro_Matplotlib + .ipynb
Optimization.ipynb is Optimization + .ipynb
Evaluacion.ipynb is Evaluacion + .ipynb
Redshifts.ipynb is Redshifts + .ipynb
Using_astroquery.ipynb is Using_astroquery + .ipyn

In [65]:
new_dir = Path('/tmp') / Path('test1236')
for f in new_dir.glob('test*.txt*'):
    f.unlink() # remove file

f = new_dir / Path('test1234.txt')
f.touch()
f = new_dir / Path('test5678.txt')
f.touch()
print([f.name for f in new_dir.glob('test*.txt*')])

for f in new_dir.glob('test*.txt'):
    new_f = f.with_suffix('.txt2')
    if not new_f.exists():
        f.rename(new_f)
    else:
        print('{} exists'.format(new_f))

print([f.name for f in new_dir.glob('test*.txt*')])

['test5678.txt', 'test1234.txt']
['test1234.txt2', 'test5678.txt2']


### sys

https://docs.python.org/3/library/sys.html
This module provides access to some variables used or maintained by the interpreter and to functions that interact strongly with the interpreter. It is always available.

The sys.argv list contains the arguments passed to the script, when the interpreter was started. The first item contains the name of the script itself.

In [66]:
%%writefile sys_text.py
import sys
for arg in sys.argv:
    print(arg)


Overwriting sys_text.py


In [67]:
!cat sys_text.py

import sys
for arg in sys.argv:
    print(arg)


In [68]:
! python sys_text.py tralala trololo

sys_text.py
tralala
trololo


In [73]:
%%writefile fact.py
import sys
import scipy
print(scipy.special.factorial(int(sys.argv[1])))

Overwriting fact.py


In [74]:
! python fact.py 60

8.32098711274139e+81


A more complete managment of the argument (especially those like --v) is available using the argparse library. https://docs.python.org/3/library/argparse.html

### Talking to the OS subprocess

The os.popen method is deprecated, subprocess is prefered. A good tutorial is here: http://pymotw.com/3/subprocess/

In [75]:
import subprocess

In [84]:
var0 = subprocess.run(['ls', '-l'])

total 134136
-rw-------  1 christophemorisset  staff    735972 Feb 11 09:43 ANN.ipynb
-rw-------  1 christophemorisset  staff    173771 Feb 11 09:43 ANN_1_2.png
-rw-------  1 christophemorisset  staff     20137 Feb 11 09:43 Calling Fortran.ipynb
-rw-------  1 christophemorisset  staff     86815 Feb 11 09:43 Calling Fortran.pdf
-rw-------  1 christophemorisset  staff   3600128 Feb 11 09:43 DR9_photo-z-small.npy
-rw-------  1 christophemorisset  staff       217 Feb 25 12:46 Demo.pickle
-rw-------  1 christophemorisset  staff       194 Feb 25 12:45 Demo.pklz
-rw-------  1 christophemorisset  staff       445 Feb 11 09:43 Evaluacion.ipynb
-rw-------  1 christophemorisset  staff      7717 Feb 11 09:43 Ex1.ipynb
-rw-------  1 christophemorisset  staff    381664 Feb 11 09:43 Ex1_done.ipynb
-rw-------  1 christophemorisset  staff    315341 Feb 11 09:43 Ex1_with_res.ipynb
-rw-------  1 christophemorisset  staff    120910 Mar 11 11:13 Fig1.pdf
-rw-------  1 christophemorisset  staff     23342 Feb

In [85]:
var0 = subprocess.run(['ls', '-l'], capture_output=True, text=True)

In [86]:
print(var0.stdout)

total 134120
-rw-------  1 christophemorisset  staff    735972 Feb 11 09:43 ANN.ipynb
-rw-------  1 christophemorisset  staff    173771 Feb 11 09:43 ANN_1_2.png
-rw-------  1 christophemorisset  staff     20137 Feb 11 09:43 Calling Fortran.ipynb
-rw-------  1 christophemorisset  staff     86815 Feb 11 09:43 Calling Fortran.pdf
-rw-------  1 christophemorisset  staff   3600128 Feb 11 09:43 DR9_photo-z-small.npy
-rw-------  1 christophemorisset  staff       217 Feb 25 12:46 Demo.pickle
-rw-------  1 christophemorisset  staff       194 Feb 25 12:45 Demo.pklz
-rw-------  1 christophemorisset  staff       445 Feb 11 09:43 Evaluacion.ipynb
-rw-------  1 christophemorisset  staff      7717 Feb 11 09:43 Ex1.ipynb
-rw-------  1 christophemorisset  staff    381664 Feb 11 09:43 Ex1_done.ipynb
-rw-------  1 christophemorisset  staff    315341 Feb 11 09:43 Ex1_with_res.ipynb
-rw-------  1 christophemorisset  staff    120910 Mar 11 11:13 Fig1.pdf
-rw-------  1 christophemorisset  staff     23342 Feb

If you do not want to wait untill the command finishes, use Popen. It starts the command, and communicate() is transmiting the response of the command.

In [88]:
command = 'ls -l'
popen = subprocess.Popen(command, shell=True)
print(popen)

<Popen: returncode: None args: 'ls -l'>


In [89]:
popen = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
res = popen.communicate()[0]
print(type(res))
print('-'*20)
print(res)
print('-'*20)
print(res.decode())

<class 'bytes'>
--------------------
b'total 134064\n-rw-------  1 christophemorisset  staff    735972 Feb 11 09:43 ANN.ipynb\n-rw-------  1 christophemorisset  staff    173771 Feb 11 09:43 ANN_1_2.png\n-rw-------  1 christophemorisset  staff     20137 Feb 11 09:43 Calling Fortran.ipynb\n-rw-------  1 christophemorisset  staff     86815 Feb 11 09:43 Calling Fortran.pdf\n-rw-------  1 christophemorisset  staff   3600128 Feb 11 09:43 DR9_photo-z-small.npy\n-rw-------  1 christophemorisset  staff       217 Feb 25 12:46 Demo.pickle\n-rw-------  1 christophemorisset  staff       194 Feb 25 12:45 Demo.pklz\n-rw-------  1 christophemorisset  staff       445 Feb 11 09:43 Evaluacion.ipynb\n-rw-------  1 christophemorisset  staff      7717 Feb 11 09:43 Ex1.ipynb\n-rw-------  1 christophemorisset  staff    381664 Feb 11 09:43 Ex1_done.ipynb\n-rw-------  1 christophemorisset  staff    315341 Feb 11 09:43 Ex1_with_res.ipynb\n-rw-------  1 christophemorisset  staff    120910 Mar 11 11:13 Fig1.pdf\n-

In [97]:
popen = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
res = popen.communicate()[0]
for line in res.decode().split('\n'):
    lspl = line.split()
    if len(lspl)> 4:
        if int(lspl[4]) > 1000000:
            print(f"{lspl[8]:30s} -> {lspl[4]}")

DR9_photo-z-small.npy          -> 3600128
HII.dat                        -> 1177040
Machine                        -> 1477306
MySQL                          -> 1698962
OOP.ipynb                      -> 1166839
cosmo-early.npy                -> 14794880
cosmo-late.npy                 -> 14794880
intro_Matplotlib.ipynb         -> 3729108
intro_Matplotlib.pdf           -> 1677612
n10017o.fits                   -> 8830080
scatter.pdf                    -> 1538520
swp04345.mxhi                  -> 1059840


### re

The re module provides regular expression tools for advanced string processing. For complex matching and manipulation, regular expressions offer succinct, optimized solutions:

In [105]:
import re
text = 'whifch foOt or hand fell f fastest'
# Search the words starting with "f"
matches = re.findall(r'\bf[a-z]*', text)
print(matches)
matches = re.findall(r'\b[fF]\w*', text)
print(matches)

['fo', 'fell', 'f', 'fastest']
['foOt', 'fell', 'f', 'fastest']


In [108]:
# Remove duplicate words
text = 'cat in the the hat the'
print(re.sub(r'(\b[a-z]+) \1', r'\1', text))

cat in the hat the


More in https://docs.python.org/3/library/re.html

### urllib

In [109]:
from urllib.request import urlopen

In [110]:
for line in urlopen('http://dev.on-rev.com/myip.irev'):
    print(line)
    if 'IP' in str(line):
        IP = str(line).split(':')[1][:-7]
print('-------')
print(IP)

b'<html>\n'
b'<body>\n'
b'<p>\n'
b'Remote IP Address:187.225.196.161</p>\n'
b'</body>\n'
b'</html>'
-------
187.225.196.161


### Tables with power: pandas at https://pandas.pydata.org/

Tutorials: 
* https://pandas.pydata.org/docs/user_guide/10min.html
* https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/

In [196]:
import pandas as pd

In [246]:
%%writefile data2.dat
# The following data are for test purpose
N    f   x   y type
1   2.3  6   8 star
22   3.5  7   9 galaxy
3  -4.2  5   7 cluster
5  4.55 5 7 cluster
4  -10.5  5  7 star
4  -10.55  5  7 star
#4  -10.5  5  7 test

Overwriting data2.dat


In [283]:
# The N column is used as index. It is not mandatory to have an index defined, if not, pandas will create one.
df = pd.read_csv('data2.dat', comment='#', sep='\\s+', index_col = 'N')

In [284]:
df

Unnamed: 0_level_0,f,x,y,type
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.3,6,8,star
22,3.5,7,9,galaxy
3,-4.2,5,7,cluster
5,4.55,5,7,cluster
4,-10.5,5,7,star
4,-10.55,5,7,star


In [285]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 1 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f       6 non-null      float64
 1   x       6 non-null      int64  
 2   y       6 non-null      int64  
 3   type    6 non-null      object 
dtypes: float64(1), int64(2), object(1)
memory usage: 240.0+ bytes


In [286]:
df.describe()

Unnamed: 0,f,x,y
count,6.0,6.0,6.0
mean,-2.483333,5.5,7.5
std,6.934455,0.83666,0.83666
min,-10.55,5.0,7.0
25%,-8.925,5.0,7.0
50%,-0.95,5.0,7.0
75%,3.2,5.75,7.75
max,4.55,7.0,9.0


In [287]:
df['R'] = np.sqrt(df['x']**2 + df.y**2)

In [288]:
# index no needs to be unique
df.index

Index([1, 22, 3, 5, 4, 4], dtype='int64', name='N')

In [289]:
# Access rows by the index
df.loc[4]

Unnamed: 0_level_0,f,x,y,type,R
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,-10.5,5,7,star,8.602325
4,-10.55,5,7,star,8.602325


In [290]:
# Access rows by position
df.iloc[1:3]

Unnamed: 0_level_0,f,x,y,type,R
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22,3.5,7,9,galaxy,11.401754
3,-4.2,5,7,cluster,8.602325


In [291]:
df.iloc[4]['type']

'star'

In [292]:
mask = df['type'] == 'star'
mask_arr = mask.to_numpy()
print(mask)
print(mask_arr)

N
1      True
22    False
3     False
5     False
4      True
4      True
Name: type, dtype: bool
[ True False False False  True  True]


In [294]:
df[mask]['type'] = 'estrella'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[mask]['type'] = 'estrella'


In [304]:
df.loc[mask,'type'] = 'estrella'
# df[mask]['type'] = 'estrella' # this will not work
# df[mask_arr]['type'] = 'estrella' # this will not work

In [305]:
df

Unnamed: 0_level_0,f,x,y,type,R
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.3,6,8,estrella,10.0
22,3.5,7,9,galaxy,11.401754
3,-4.2,5,7,cluster,8.602325
5,4.55,5,7,cluster,8.602325
4,-10.5,5,7,estrella,8.602325
4,-10.55,5,7,estrella,8.602325


In [306]:
df[mask]['R']

N
1    10.000000
4     8.602325
4     8.602325
Name: R, dtype: float64

In [307]:
df.loc[mask,'R'].to_numpy()

array([10.        ,  8.60232527,  8.60232527])

In [308]:
coord_cols = ['x', 'y']
df.loc[mask, coord_cols].mean()

x    5.333333
y    7.333333
dtype: float64

In [317]:
df.columns

Index(['f', 'x', 'y', 'type', 'R'], dtype='object')

In [318]:
df.groupby('type')[df.columns].apply(print)

      f  x  y     type         R
N                               
3 -4.20  5  7  cluster  8.602325
5  4.55  5  7  cluster  8.602325
       f  x  y      type          R
N                                  
1   2.30  6  8  estrella  10.000000
4 -10.50  5  7  estrella   8.602325
4 -10.55  5  7  estrella   8.602325
      f  x  y    type          R
N                               
22  3.5  7  9  galaxy  11.401754


In [312]:
df.groupby('type').mean()

Unnamed: 0_level_0,f,x,y,R
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cluster,0.175,5.0,7.0,8.602325
estrella,-6.25,5.333333,7.333333,9.068217
galaxy,3.5,7.0,9.0,11.401754
