##  📚 Essential Basic Functionality

> Pandas is a foundational library in Python for working with structured data. It provides fast, flexible, and expressive tools designed to make data analysis and manipulation easy and intuitive. There are several essential functionalities that are fundamental to using Pandas effectively.

In [1]:
import pandas as pd
import numpy as np

### Head and tail

- Head shows the first 5 rows by default.
- Tail shows the last 5 rows by default.

In [2]:
long_series = pd.Series(np.random.randn(10))
long_series

0    0.361703
1    0.531152
2   -0.729626
3   -0.399157
4    0.146409
5    1.663781
6   -0.502446
7   -2.369366
8   -1.808889
9   -0.055850
dtype: float64

In [3]:
# Head
long_series.head()

0    0.361703
1    0.531152
2   -0.729626
3   -0.399157
4    0.146409
dtype: float64

In [4]:
# Tail
long_series.tail()

5    1.663781
6   -0.502446
7   -2.369366
8   -1.808889
9   -0.055850
dtype: float64

### Attributes and underlying data

- shape: gives the axis dimensions of the object, consistent with ndarray
- Axis labels:
    - Series: index (only axis)
    - DataFrame: index and columns


In [5]:
index = pd.date_range('20230101', periods=10)
df = pd.DataFrame(np.random.randn(10, 4), index=index, columns=list ('ABCD'))
df

Unnamed: 0,A,B,C,D
2023-01-01,0.195951,-0.867703,0.658863,1.732854
2023-01-02,1.719817,1.521598,1.10796,-0.686281
2023-01-03,-0.375715,-2.488942,-0.111073,1.889208
2023-01-04,-0.535895,-1.27156,-0.86332,1.522724
2023-01-05,-0.46632,0.297649,-1.268906,0.051341
2023-01-06,1.353237,0.767641,0.803081,1.434357
2023-01-07,0.528853,-1.080959,0.066283,-0.469669
2023-01-08,-0.057852,-0.138649,2.171557,-1.373868
2023-01-09,-1.286524,-0.82289,-1.384414,1.632486
2023-01-10,0.48817,-1.144366,-0.088328,-0.951186


In [6]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,a,b,c,d
2023-01-01,0.195951,-0.867703,0.658863,1.732854
2023-01-02,1.719817,1.521598,1.10796,-0.686281
2023-01-03,-0.375715,-2.488942,-0.111073,1.889208
2023-01-04,-0.535895,-1.27156,-0.86332,1.522724
2023-01-05,-0.46632,0.297649,-1.268906,0.051341
2023-01-06,1.353237,0.767641,0.803081,1.434357
2023-01-07,0.528853,-1.080959,0.066283,-0.469669
2023-01-08,-0.057852,-0.138649,2.171557,-1.373868
2023-01-09,-1.286524,-0.82289,-1.384414,1.632486
2023-01-10,0.48817,-1.144366,-0.088328,-0.951186


#### Numpy

- It is a reliable and consistent method to convert pandas objects to NumPy arrays, offering better control over data types and compatibility with extension types compared other methods.

In [7]:
df_numpy = df.to_numpy()
df_numpy

array([[ 0.19595065, -0.86770305,  0.65886291,  1.73285379],
       [ 1.71981727,  1.52159782,  1.10795966, -0.68628149],
       [-0.37571487, -2.48894164, -0.11107342,  1.88920798],
       [-0.53589531, -1.27156034, -0.86331967,  1.52272447],
       [-0.46632014,  0.29764864, -1.26890564,  0.05134071],
       [ 1.35323735,  0.76764109,  0.80308148,  1.43435682],
       [ 0.52885318, -1.08095872,  0.06628292, -0.46966863],
       [-0.05785166, -0.13864885,  2.17155736, -1.3738685 ],
       [-1.28652362, -0.82288953, -1.38441419,  1.63248582],
       [ 0.4881698 , -1.14436556, -0.08832809, -0.95118638]])

## Matching / broadcasting behavior

In [8]:
df_mathing_broadcasting = pd.DataFrame(
    {
        "one": pd.Series(np.random.randint(0, 10, 3), index=["a", "b", "c"]),
        "two": pd.Series(np.random.randint(0, 10, 3), index=["a", "b", "c"]),
        "three": pd.Series(np.random.randint(0, 10, 4), index=["a", "b", "c", "d"]),

    }
)
df_mathing_broadcasting

Unnamed: 0,one,two,three
a,0.0,5.0,3
b,9.0,5.0,2
c,1.0,9.0,8
d,,,2


### Sub

- It is a method used to perform subtraction between Series or DataFrames.

In [9]:
df_sub_l1 = df_mathing_broadcasting.iloc[2]
df_sub_l1

one      1.0
two      9.0
three    8.0
Name: c, dtype: float64

In [10]:
df_mathing_broadcasting.sub(df_sub_l1 , axis="columns")

Unnamed: 0,one,two,three
a,-1.0,-4.0,-5.0
b,8.0,-4.0,-6.0
c,0.0,0.0,0.0
d,,,-6.0


## Missing data / operations with fill values

In [11]:
df_filldata = df.copy()
df_filldata.iloc[2, 2] = np.nan
df_filldata.head()

Unnamed: 0,a,b,c,d
2023-01-01,0.195951,-0.867703,0.658863,1.732854
2023-01-02,1.719817,1.521598,1.10796,-0.686281
2023-01-03,-0.375715,-2.488942,,1.889208
2023-01-04,-0.535895,-1.27156,-0.86332,1.522724
2023-01-05,-0.46632,0.297649,-1.268906,0.051341


In [12]:
df_filldata.fillna(np.random.rand())
df_filldata.head()

Unnamed: 0,a,b,c,d
2023-01-01,0.195951,-0.867703,0.658863,1.732854
2023-01-02,1.719817,1.521598,1.10796,-0.686281
2023-01-03,-0.375715,-2.488942,,1.889208
2023-01-04,-0.535895,-1.27156,-0.86332,1.522724
2023-01-05,-0.46632,0.297649,-1.268906,0.051341


## Boolean reductions

- It is a way to summarize a boolean result.
-  <code>empty, any(), all(), and bool()</code>

In [13]:
df_bool_reduction = pd.DataFrame({
    "A": [1, 2, 3],
    "B": [0, 0, 5],
    "C": [9, 9, 9]
})

df_bool_reduction

Unnamed: 0,A,B,C
0,1,0,9
1,2,0,9
2,3,5,9


In [14]:
(df_bool_reduction > 0).all()

A     True
B    False
C     True
dtype: bool

In [15]:
(df_bool_reduction > 0).any()

A    True
B    True
C    True
dtype: bool

## Descriptive statistics

### Index of min/max values

- idxmin = it return index of first occurrence of minimum over requested axis.
- idxmax = it return index of first occurrence of maximum over requested axis.

In [18]:
df_max_min = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
df_max_min

Unnamed: 0,A,B,C
0,0.228394,0.578994,-0.361929
1,-1.101265,-0.881872,0.468541
2,-0.934096,-0.72348,1.503736
3,-0.190544,0.089454,0.002275
4,2.111255,0.043604,-0.505388


In [22]:
df_max_min.idxmax(axis=0)

A    4
B    0
C    2
dtype: int64

In [24]:
df_max_min.idxmin(axis=1)

0    C
1    A
2    A
3    A
4    C
dtype: object