In [225]:
import numpy as np
import pandas as pd

In [226]:
s = pd.Series([1, 3, 4, np.nan, 6, 8])
s

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [227]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [228]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.349259,0.873853,-0.341679,0.087976
2013-01-02,-0.71843,0.397931,0.447592,-0.038003
2013-01-03,1.140414,0.178683,-1.558841,-0.653753
2013-01-04,-0.543959,0.155067,-0.65033,0.454258
2013-01-05,-0.47626,0.277233,0.850048,-0.443929
2013-01-06,-0.695178,-0.449937,0.533456,0.866784


In [229]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

df2, df2.shape

(     A          B    C  D      E    F
 0  1.0 2013-01-02  1.0  3   test  foo
 1  1.0 2013-01-02  1.0  3  train  foo
 2  1.0 2013-01-02  1.0  3   test  foo
 3  1.0 2013-01-02  1.0  3  train  foo,
 (4, 6))

In [230]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [231]:
df2.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [232]:
df2[df2.index >= 2]

Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [233]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [234]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.349259,0.873853,-0.341679,0.087976
2013-01-02,-0.71843,0.397931,0.447592,-0.038003
2013-01-03,1.140414,0.178683,-1.558841,-0.653753
2013-01-04,-0.543959,0.155067,-0.65033,0.454258
2013-01-05,-0.47626,0.277233,0.850048,-0.443929
2013-01-06,-0.695178,-0.449937,0.533456,0.866784


In [235]:
df.to_numpy()

array([[ 0.34925895,  0.87385279, -0.34167924,  0.08797556],
       [-0.71843036,  0.39793101,  0.44759235, -0.03800342],
       [ 1.14041401,  0.17868322, -1.55884143, -0.65375323],
       [-0.5439592 ,  0.15506664, -0.65033037,  0.4542576 ],
       [-0.47626004,  0.27723256,  0.85004797, -0.44392938],
       [-0.69517756, -0.44993663,  0.53345583,  0.86678443]])

In [236]:
df2_numpy = df2.to_numpy()
df2_numpy

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [237]:
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [238]:
df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
C,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
D,4.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0


In [239]:
df2.describe().T['mean']

A    1.0
C    1.0
D    3.0
Name: mean, dtype: float64

In [240]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.087976,-0.341679,0.873853,0.349259
2013-01-02,-0.038003,0.447592,0.397931,-0.71843
2013-01-03,-0.653753,-1.558841,0.178683,1.140414
2013-01-04,0.454258,-0.65033,0.155067,-0.543959
2013-01-05,-0.443929,0.850048,0.277233,-0.47626
2013-01-06,0.866784,0.533456,-0.449937,-0.695178


In [241]:
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2013-01-02,-0.71843,0.397931,0.447592,-0.038003
2013-01-06,-0.695178,-0.449937,0.533456,0.866784
2013-01-04,-0.543959,0.155067,-0.65033,0.454258
2013-01-05,-0.47626,0.277233,0.850048,-0.443929
2013-01-01,0.349259,0.873853,-0.341679,0.087976
2013-01-03,1.140414,0.178683,-1.558841,-0.653753


In [242]:
round(df.at["2013-01-01", 'A'], ndigits=3)

0.349

In [243]:
round(df.loc['2013-01-01'].at['A'], ndigits=3)

0.349

In [244]:
round(df.iloc[0].at['A'], ndigits=3)

0.349

In [245]:
df.loc[:, ["A", "B"]].min()

A   -0.718430
B   -0.449937
dtype: float64

In [246]:
df.iloc[[1, 2, 3], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.71843,0.447592
2013-01-03,1.140414,-1.558841
2013-01-04,-0.543959,-0.65033


In [247]:
round(df.iat[1, 1], ndigits=3)

0.398

In [248]:
df[(df["A"] > 0) & (df["B"] < 0)]

Unnamed: 0,A,B,C,D


In [249]:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.349259,0.873853,-0.341679,0.087976,one
2013-01-02,-0.71843,0.397931,0.447592,-0.038003,one
2013-01-03,1.140414,0.178683,-1.558841,-0.653753,two
2013-01-04,-0.543959,0.155067,-0.65033,0.454258,three
2013-01-05,-0.47626,0.277233,0.850048,-0.443929,four
2013-01-06,-0.695178,-0.449937,0.533456,0.866784,three


In [250]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.140414,0.178683,-1.558841,-0.653753,two
2013-01-05,-0.47626,0.277233,0.850048,-0.443929,four


In [251]:
s1 = pd.Series(np.arange(6, dtype=np.int16) + 1, index=pd.date_range("20130102", periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int16

In [252]:
df.at[dates[0], "A"] = 0
round(df["A"], ndigits=3)

2013-01-01    0.000
2013-01-02   -0.718
2013-01-03    1.140
2013-01-04   -0.544
2013-01-05   -0.476
2013-01-06   -0.695
Freq: D, Name: A, dtype: float64

In [253]:
df.loc[:, "D"] = 5.0
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.873853,-0.341679,5.0
2013-01-02,-0.71843,0.397931,0.447592,5.0
2013-01-03,1.140414,0.178683,-1.558841,5.0
2013-01-04,-0.543959,0.155067,-0.65033,5.0
2013-01-05,-0.47626,0.277233,0.850048,5.0
2013-01-06,-0.695178,-0.449937,0.533456,5.0


In [254]:
df2 = df.copy()
df2 = df2.iloc[:, :-1]
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C
2013-01-01,0.0,-0.873853,-0.341679
2013-01-02,-0.71843,-0.397931,-0.447592
2013-01-03,-1.140414,-0.178683,-1.558841
2013-01-04,-0.543959,-0.155067,-0.65033
2013-01-05,-0.47626,-0.277233,-0.850048
2013-01-06,-0.695178,-0.449937,-0.533456


In [255]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.873853,-0.341679,5.0
2013-01-02,-0.71843,0.397931,0.447592,5.0
2013-01-03,1.140414,0.178683,-1.558841,5.0
2013-01-04,-0.543959,0.155067,-0.65033,5.0
2013-01-05,-0.47626,0.277233,0.850048,5.0
2013-01-06,-0.695178,-0.449937,0.533456,5.0


In [256]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns))
df1.loc[dates[0] : dates[1], "E"] = 1
df1


Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.873853,-0.341679,5.0,1.0
2013-01-02,-0.71843,0.397931,0.447592,5.0,1.0
2013-01-03,1.140414,0.178683,-1.558841,5.0,
2013-01-04,-0.543959,0.155067,-0.65033,5.0,


In [257]:
df1.loc[df.index[0:2]] = np.nan
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,1.140414,0.178683,-1.558841,5.0,
2013-01-04,-0.543959,0.155067,-0.65033,5.0,


In [258]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,E


In [259]:
df1.fillna(value=round(np.exp(1), ndigits=2))

Unnamed: 0,A,B,C,D,E
2013-01-01,2.72,2.72,2.72,2.72,2.72
2013-01-02,2.72,2.72,2.72,2.72,2.72
2013-01-03,1.140414,0.178683,-1.558841,5.0,2.72
2013-01-04,-0.543959,0.155067,-0.65033,5.0,2.72


In [260]:
pd.isna(df1), df1

(                A      B      C      D     E
 2013-01-01   True   True   True   True  True
 2013-01-02   True   True   True   True  True
 2013-01-03  False  False  False  False  True
 2013-01-04  False  False  False  False  True,
                    A         B         C    D   E
 2013-01-01       NaN       NaN       NaN  NaN NaN
 2013-01-02       NaN       NaN       NaN  NaN NaN
 2013-01-03  1.140414  0.178683 -1.558841  5.0 NaN
 2013-01-04 -0.543959  0.155067 -0.650330  5.0 NaN)

In [261]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [262]:
s = s.shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [263]:
df = df.loc[:, df.columns[:-1]]

In [264]:
df.agg(lambda x: np.mean(x) * 5.6)

A   -1.207186
B    1.337308
C   -0.671771
dtype: float64

In [265]:
df.transform(lambda x: x * 101.2)

Unnamed: 0,A,B,C
2013-01-01,0.0,88.433902,-34.577939
2013-01-02,-72.705152,40.270618,45.296346
2013-01-03,115.409898,18.082742,-157.754752
2013-01-04,-55.048671,15.692744,-65.813434
2013-01-05,-48.197517,28.055935,86.024854
2013-01-06,-70.351969,-45.533587,53.98573


In [266]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    3
1    0
2    6
3    6
4    4
5    6
6    0
7    5
8    1
9    5
dtype: int64

In [267]:
s.value_counts()

6    3
0    2
5    2
3    1
4    1
1    1
dtype: int64

In [268]:
s.index.value_counts().max()

1

In [269]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [270]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.101925,0.322456,-0.118361,0.758116
1,-1.110965,-0.122456,-0.408571,0.083418
2,-0.765494,0.062764,0.067708,0.031744
3,0.686876,-0.81254,0.127698,0.410308
4,1.782814,-0.737929,0.371302,-0.90583
5,0.748135,0.169411,1.126844,0.355573
6,1.188972,1.475464,-1.942922,0.297821
7,0.208282,0.161867,0.134245,-1.936593
8,0.780768,-0.987693,-1.648126,-0.529154
9,0.495181,-1.615918,0.864469,0.840031


In [271]:
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces) == df

Unnamed: 0,0,1,2,3
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,True,True
4,True,True,True,True
5,True,True,True,True
6,True,True,True,True
7,True,True,True,True
8,True,True,True,True
9,True,True,True,True


In [272]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [273]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [274]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [275]:
right.key[1] = "bar"
right

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  right.key[1] = "bar"


Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [276]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,2,4


In [277]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.676328,-0.089425
1,bar,one,-1.541544,-0.824094
2,foo,two,0.25207,0.531858
3,bar,three,1.577457,0.799073
4,foo,two,1.785854,1.363295
5,bar,two,0.145288,-0.122515
6,foo,one,-0.437098,0.685838
7,foo,three,1.091068,0.678976


In [278]:
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.541544,-0.824094
bar,three,1.577457,0.799073
bar,two,0.145288,-0.122515
foo,one,0.23923,0.596413
foo,three,1.091068,0.678976
foo,two,2.037924,1.895153


In [279]:
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.1812,-0.147536
foo,3.368222,3.170543


In [280]:
df.loc[df['A'] == 'bar', 'C'].sum(), df.loc[df['A'] == 'bar', 'D'].sum()

(0.18119997694909773, -0.14753561882666055)

In [281]:
arrays = [
   ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
   ["one", "two", "one", "two", "one", "two", "one", "two"],
]

index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df2 = df[:4]
df2, df2.shape

(                     A         B
 first second                    
 bar   one     1.321138  0.125570
       two    -2.072234  0.626598
 baz   one     2.104498  0.459939
       two    -1.438718  2.291296,
 (4, 2))

In [282]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.321138,0.12557
bar,two,-2.072234,0.626598
baz,one,2.104498,0.459939
baz,two,-1.438718,2.291296
foo,one,-0.329586,0.002381
foo,two,0.858364,0.595937
qux,one,-0.962487,-1.092419
qux,two,-1.314339,1.032282


In [283]:
stacked = df2.stack()
stacked, stacked.shape

(first  second   
 bar    one     A    1.321138
                B    0.125570
        two     A   -2.072234
                B    0.626598
 baz    one     A    2.104498
                B    0.459939
        two     A   -1.438718
                B    2.291296
 dtype: float64,
 (8,))

In [284]:
unstacked = stacked.unstack()
unstacked == df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,True,True
bar,two,True,True
baz,one,True,True
baz,two,True,True


In [285]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.554994,0.946801
1,one,B,foo,-0.854316,0.034718
2,two,C,foo,0.968049,0.898913
3,three,A,bar,-0.314569,-0.548005
4,one,B,bar,-2.195957,1.129785
5,one,C,bar,-0.134497,-0.524102
6,two,A,foo,1.469156,-0.855606
7,three,B,foo,-1.096829,1.368888
8,one,C,foo,0.340912,-0.962386
9,one,A,bar,-0.249265,-1.12394


In [286]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.249265,-0.554994
one,B,-2.195957,-0.854316
one,C,-0.134497,0.340912
three,A,-0.314569,
three,B,,-1.096829
three,C,0.18277,
two,A,,1.469156
two,B,2.48239,
two,C,,0.968049


In [288]:
try:
    from pandas.tseries.frequencies import FreqGroup, get_freq_code
except ImportError:
    from pandas._libs.tslibs.dtypes import FreqGroup
    from pandas.tests.tslibs.test_period_asfreq import get_freq_code

# Generate datetime index with 100 seconds frequency
rng = pd.date_range("1/1/2012", periods=100, freq="s")

# Create a time series with random integer values
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

# Resample the series to sum values over 5-minute intervals
sum_5min = ts.resample("5Min").sum()

print(sum_5min)

ModuleNotFoundError: No module named 'pandas.tests.tslibs.test_period_asfreq'

In [None]:
print(pd.__version__)

1.5.3
