In [1]:
import numpy as np
import pandas as pd
from django.utils import timezone
from project.models import Project

In [2]:
# create a serie
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Create 12 months range with pandas date_range
start_year = f"{timezone.now().year}0101"
print(start_year)
dates = pd.date_range(start_year, periods=12)
dates

20230101


DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12'],
              dtype='datetime64[ns]', freq='D')

In [4]:
np.random.randn(12, 4)

array([[ 0.45617107,  0.0494366 ,  0.66882714,  0.06348142],
       [ 0.41644779,  1.92421611,  2.27004679,  1.35928577],
       [-0.86144533, -0.64345315,  1.97016184, -0.39148387],
       [-0.96122902,  1.01867434,  0.28380381,  0.11490189],
       [-0.79903376, -0.15898906,  0.21268186, -0.69630612],
       [-0.02772689,  1.55195656, -0.70922196, -2.21707534],
       [ 1.31359025, -0.29240146,  1.43764078, -0.27051164],
       [-0.04099131,  2.21628912,  0.40728345, -1.2428867 ],
       [-0.82653359,  0.81464229, -0.78536703,  0.13841154],
       [ 1.84927664, -0.35395997, -0.99791291,  1.08663304],
       [-0.51959152, -1.41194527, -0.47834136, -0.7454151 ],
       [-0.38187073,  0.57273535, -1.29850255,  0.70269689]])

In [5]:
# create a dataframe
# index = lignes or rows
df = pd.DataFrame(np.random.randn(12, 8), index=dates, columns=list("ABCDEFGH"))
df

Unnamed: 0,A,B,C,D,E,F,G,H
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954,1.594993,-0.865541,-0.219944,0.115406
2023-01-02,-0.222175,0.440893,-0.666219,1.711034,0.047138,0.61188,-1.9991,0.062626
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,0.101729,0.039638,-1.569193,1.336126
2023-01-04,-0.677812,1.143132,0.237951,-0.694721,-0.740539,1.796461,0.084031,-0.379196
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,-1.022939,1.572719,1.203805,-2.421146
2023-01-06,0.910326,-0.637671,-0.598733,-1.742797,-0.638165,0.061818,0.323624,1.091594
2023-01-07,-0.422921,-0.321069,-0.455514,-0.361284,0.438306,0.3175,1.198392,0.501664
2023-01-08,-0.719987,1.308499,0.793471,-1.399481,-0.156834,0.377707,-0.467302,0.174029
2023-01-09,-0.072706,-0.118497,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398
2023-01-10,-1.151191,0.627199,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116


In [7]:
# select october avec le label
# attention au format US qui inverse mois et jour :)
df.loc["20230110"]

A   -1.151191
B    0.627199
C   -0.773136
D   -0.280737
E   -0.334952
F   -2.398680
G    2.378589
H   -0.060116
Name: 2023-01-10 00:00:00, dtype: float64

In [8]:
# sélection de lignes en utilisant un index iloc
df.iloc[8:10]

Unnamed: 0,A,B,C,D,E,F,G,H
2023-01-09,-0.072706,-0.118497,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398
2023-01-10,-1.151191,0.627199,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116


In [9]:
# df["B":"E"] ==> error, can't slice by columns
# slicing rows (indexes)
df["20210102":"20210104"]

Unnamed: 0,A,B,C,D,E,F,G,H


In [10]:
# subset avec les mois juillet à octobre et uniquement les colonnes A, B et E
df.loc["20210107":"20210110", ['B', 'A', 'E']]

Unnamed: 0,B,A,E


In [11]:
# seulement juillet à otobre et toutes les colonnes de B à E
df.loc[["20210107","20210110"], "B":"E"]

KeyError: "None of [Index(['20210107', '20210110'], dtype='object')] are in the [index]"

In [12]:
# accès à une colonne en utilisant un attribut
df.D

2023-01-01   -0.527954
2023-01-02    1.711034
2023-01-03   -0.386486
2023-01-04   -0.694721
2023-01-05   -0.314119
2023-01-06   -1.742797
2023-01-07   -0.361284
2023-01-08   -1.399481
2023-01-09   -0.162207
2023-01-10   -0.280737
2023-01-11   -0.412176
2023-01-12   -0.932114
Freq: D, Name: D, dtype: float64

In [13]:
# accès à un élément précis
df.D["20210108"]

KeyError: '20210108'

In [14]:
# identique à :
df["D"]["20210108"]

KeyError: '20210108'

In [15]:
df["D"]["20210108"] = "Hello"
df["D"]["20210108"]

KeyError: '20210108'

In [16]:
df

Unnamed: 0,A,B,C,D,E,F,G,H
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954,1.594993,-0.865541,-0.219944,0.115406
2023-01-02,-0.222175,0.440893,-0.666219,1.711034,0.047138,0.61188,-1.9991,0.062626
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,0.101729,0.039638,-1.569193,1.336126
2023-01-04,-0.677812,1.143132,0.237951,-0.694721,-0.740539,1.796461,0.084031,-0.379196
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,-1.022939,1.572719,1.203805,-2.421146
2023-01-06,0.910326,-0.637671,-0.598733,-1.742797,-0.638165,0.061818,0.323624,1.091594
2023-01-07,-0.422921,-0.321069,-0.455514,-0.361284,0.438306,0.3175,1.198392,0.501664
2023-01-08,-0.719987,1.308499,0.793471,-1.399481,-0.156834,0.377707,-0.467302,0.174029
2023-01-09,-0.072706,-0.118497,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398
2023-01-10,-1.151191,0.627199,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116


In [17]:
# add a new colum
df['NEW'] = list(range(len(df.index)))
df

Unnamed: 0,A,B,C,D,E,F,G,H,NEW
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954,1.594993,-0.865541,-0.219944,0.115406,0
2023-01-02,-0.222175,0.440893,-0.666219,1.711034,0.047138,0.61188,-1.9991,0.062626,1
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,0.101729,0.039638,-1.569193,1.336126,2
2023-01-04,-0.677812,1.143132,0.237951,-0.694721,-0.740539,1.796461,0.084031,-0.379196,3
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,-1.022939,1.572719,1.203805,-2.421146,4
2023-01-06,0.910326,-0.637671,-0.598733,-1.742797,-0.638165,0.061818,0.323624,1.091594,5
2023-01-07,-0.422921,-0.321069,-0.455514,-0.361284,0.438306,0.3175,1.198392,0.501664,6
2023-01-08,-0.719987,1.308499,0.793471,-1.399481,-0.156834,0.377707,-0.467302,0.174029,7
2023-01-09,-0.072706,-0.118497,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398,8
2023-01-10,-1.151191,0.627199,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116,9


In [18]:
# copy
dfa = df.copy()
# swap B and NEW values
dfa.loc[:, ['B', 'NEW']] = dfa[['NEW', 'B']].to_numpy()
dfa

Unnamed: 0,A,B,C,D,E,F,G,H,NEW
2023-01-01,-0.527266,0.0,-0.17846,-0.527954,1.594993,-0.865541,-0.219944,0.115406,-0.694275
2023-01-02,-0.222175,1.0,-0.666219,1.711034,0.047138,0.61188,-1.9991,0.062626,0.440893
2023-01-03,0.166347,2.0,-3.334114,-0.386486,0.101729,0.039638,-1.569193,1.336126,-1.566303
2023-01-04,-0.677812,3.0,0.237951,-0.694721,-0.740539,1.796461,0.084031,-0.379196,1.143132
2023-01-05,-0.373964,4.0,1.10199,-0.314119,-1.022939,1.572719,1.203805,-2.421146,0.24861
2023-01-06,0.910326,5.0,-0.598733,-1.742797,-0.638165,0.061818,0.323624,1.091594,-0.637671
2023-01-07,-0.422921,6.0,-0.455514,-0.361284,0.438306,0.3175,1.198392,0.501664,-0.321069
2023-01-08,-0.719987,7.0,0.793471,-1.399481,-0.156834,0.377707,-0.467302,0.174029,1.308499
2023-01-09,-0.072706,8.0,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398,-0.118497
2023-01-10,-1.151191,9.0,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116,0.627199


In [19]:
# check df is unchanged
df

Unnamed: 0,A,B,C,D,E,F,G,H,NEW
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954,1.594993,-0.865541,-0.219944,0.115406,0
2023-01-02,-0.222175,0.440893,-0.666219,1.711034,0.047138,0.61188,-1.9991,0.062626,1
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,0.101729,0.039638,-1.569193,1.336126,2
2023-01-04,-0.677812,1.143132,0.237951,-0.694721,-0.740539,1.796461,0.084031,-0.379196,3
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,-1.022939,1.572719,1.203805,-2.421146,4
2023-01-06,0.910326,-0.637671,-0.598733,-1.742797,-0.638165,0.061818,0.323624,1.091594,5
2023-01-07,-0.422921,-0.321069,-0.455514,-0.361284,0.438306,0.3175,1.198392,0.501664,6
2023-01-08,-0.719987,1.308499,0.793471,-1.399481,-0.156834,0.377707,-0.467302,0.174029,7
2023-01-09,-0.072706,-0.118497,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398,8
2023-01-10,-1.151191,0.627199,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116,9


In [20]:
# sélectionner uniquement les lignes pour lesquelles la valeur de la cellule en 1 est supérieur à 1
df[df["A"] > 1]

Unnamed: 0,A,B,C,D,E,F,G,H,NEW


In [21]:
# on change la valeur "hello" pour une valeur numérique
# on utilise .at
df.at["20210108", "D"] = 1000
# on ne sélectionne que les valeurs qui sont plus grandes que 0
df[df > 0]

Unnamed: 0,A,B,C,D,E,F,G,H,NEW
2023-01-01,,,,,1.594993,,,0.115406,
2023-01-02,,0.440893,,1.711034,0.047138,0.61188,,0.062626,1.0
2023-01-03,0.166347,,,,0.101729,0.039638,,1.336126,2.0
2023-01-04,,1.143132,0.237951,,,1.796461,0.084031,,3.0
2023-01-05,,0.24861,1.10199,,,1.572719,1.203805,,4.0
2023-01-06,0.910326,,,,,0.061818,0.323624,1.091594,5.0
2023-01-07,,,,,0.438306,0.3175,1.198392,0.501664,6.0
2023-01-08,,1.308499,0.793471,,,0.377707,,0.174029,7.0
2023-01-09,,,,,0.522082,1.122719,,1.233398,8.0
2023-01-10,,0.627199,,,,,2.378589,,9.0


In [22]:
# transpose la matrice
df.T

Unnamed: 0,2023-01-01,2023-01-02,2023-01-03,2023-01-04,2023-01-05,2023-01-06,2023-01-07,2023-01-08,2023-01-09,2023-01-10,2023-01-11,2023-01-12,2021-01-08
A,-0.527266,-0.222175,0.166347,-0.677812,-0.373964,0.910326,-0.422921,-0.719987,-0.072706,-1.151191,0.109129,-0.259272,
B,-0.694275,0.440893,-1.566303,1.143132,0.24861,-0.637671,-0.321069,1.308499,-0.118497,0.627199,0.564366,-0.324706,
C,-0.17846,-0.666219,-3.334114,0.237951,1.10199,-0.598733,-0.455514,0.793471,-1.010448,-0.773136,0.035396,-1.330759,
D,-0.527954,1.711034,-0.386486,-0.694721,-0.314119,-1.742797,-0.361284,-1.399481,-0.162207,-0.280737,-0.412176,-0.932114,1000.0
E,1.594993,0.047138,0.101729,-0.740539,-1.022939,-0.638165,0.438306,-0.156834,0.522082,-0.334952,-1.185931,-0.450189,
F,-0.865541,0.61188,0.039638,1.796461,1.572719,0.061818,0.3175,0.377707,1.122719,-2.39868,3.019937,-0.196375,
G,-0.219944,-1.9991,-1.569193,0.084031,1.203805,0.323624,1.198392,-0.467302,-0.452163,2.378589,-0.340804,0.904138,
H,0.115406,0.062626,1.336126,-0.379196,-2.421146,1.091594,0.501664,0.174029,1.233398,-0.060116,0.818458,0.530046,
NEW,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,


In [23]:
# copy only a subset of the dataframe (6 first lines)
df2 = df.iloc[0:6].copy()
# replace column value
df2["E"] = ["one", "one", "two", "three", "four", "three"]
# select only rows if value in column E is two or four
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E,F,G,H,NEW
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,two,0.039638,-1.569193,1.336126,2.0
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,four,1.572719,1.203805,-2.421146,4.0


In [24]:
# sélection basée sur une valeur booléenne
df2["E"].isin(["two", "four"])

2023-01-01    False
2023-01-02    False
2023-01-03     True
2023-01-04    False
2023-01-05     True
2023-01-06    False
Name: E, dtype: bool

In [25]:
# sélectionne les lignes impaires
df2[[i%2 == 0 for i in range(6)]]

Unnamed: 0,A,B,C,D,E,F,G,H,NEW
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954,one,-0.865541,-0.219944,0.115406,0.0
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,two,0.039638,-1.569193,1.336126,2.0
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,four,1.572719,1.203805,-2.421146,4.0


In [26]:
# créer une série qui ne couvre que partiellement les index de df
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20210102", periods=6))
s1

2021-01-02    1
2021-01-03    2
2021-01-04    3
2021-01-05    4
2021-01-06    5
2021-01-07    6
Freq: D, dtype: int64

In [27]:
# ajouter cette colonne à df, remarquez que c'est placé correctement dans le tableau
df["I"] = s1
df

Unnamed: 0,A,B,C,D,E,F,G,H,NEW,I
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954,1.594993,-0.865541,-0.219944,0.115406,0.0,
2023-01-02,-0.222175,0.440893,-0.666219,1.711034,0.047138,0.61188,-1.9991,0.062626,1.0,
2023-01-03,0.166347,-1.566303,-3.334114,-0.386486,0.101729,0.039638,-1.569193,1.336126,2.0,
2023-01-04,-0.677812,1.143132,0.237951,-0.694721,-0.740539,1.796461,0.084031,-0.379196,3.0,
2023-01-05,-0.373964,0.24861,1.10199,-0.314119,-1.022939,1.572719,1.203805,-2.421146,4.0,
2023-01-06,0.910326,-0.637671,-0.598733,-1.742797,-0.638165,0.061818,0.323624,1.091594,5.0,
2023-01-07,-0.422921,-0.321069,-0.455514,-0.361284,0.438306,0.3175,1.198392,0.501664,6.0,
2023-01-08,-0.719987,1.308499,0.793471,-1.399481,-0.156834,0.377707,-0.467302,0.174029,7.0,
2023-01-09,-0.072706,-0.118497,-1.010448,-0.162207,0.522082,1.122719,-0.452163,1.233398,8.0,
2023-01-10,-1.151191,0.627199,-0.773136,-0.280737,-0.334952,-2.39868,2.378589,-0.060116,9.0,


In [28]:
# in place modification of the data
# with "where" sélection
df2 = df.iloc[0:4, 0:4].copy().fillna(0)
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2023-01-01,-0.527266,-0.694275,-0.17846,-0.527954
2023-01-02,-0.222175,-0.440893,-0.666219,-1.711034
2023-01-03,-0.166347,-1.566303,-3.334114,-0.386486
2023-01-04,-0.677812,-1.143132,-0.237951,-0.694721


In [34]:
project = Project.objects.all().first()
data = [[k,v] for k, v in project.get_pop_change_per_year("pop").items()]
df = pd.DataFrame(data, columns=["year", "pop"]).fillna("")
df

Unnamed: 0,year,pop
0,2011,443.0
1,2012,200.0
2,2013,65.0
3,2014,-32.0
4,2015,152.0
5,2016,-314.0
6,2017,298.0
7,2018,220.0
8,2019,-37.0
9,2020,


In [37]:
df.pivot(columns=["year"], inex=["pop"], values="pop")

TypeError: pivot() got an unexpected keyword argument 'inex'

In [32]:
project.get_pop_change_per_year("pop")

{'2011': 443,
 '2012': 200,
 '2013': 65,
 '2014': -32,
 '2015': 152,
 '2016': -314,
 '2017': 298,
 '2018': 220,
 '2019': -37,
 '2020': None}