# pandas in 10 minutes
- This is a short introduction to pandas; for more advanced recipes, see the [Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook).


In [1]:
# Import the libraries under their conventional aliases
import numpy as np
import pandas as pd

## Basic data structures in pandas
- pandas provides two types of classes for handling data:
1. Series: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc
2. DataFrame: a two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

In [2]:
# Build the row labels: a DatetimeIndex of 10 consecutive daily dates beginning 2014-01-01.
dates = pd.date_range(start='20140101', periods=10)
dates

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-08',
               '2014-01-09', '2014-01-10'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Draw from a seeded generator so that Restart & Run All reproduces the same values
# (the stored outputs must be regenerated once after this change).
rng = np.random.default_rng(42)
# 10 rows indexed by `dates`, 4 columns labeled 'A'-'D', filled with standard-normal draws.
df = pd.DataFrame(rng.standard_normal((10, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2014-01-01,1.569233,-0.678559,0.06705,0.819036
2014-01-02,-0.481405,-0.192512,0.947463,0.80632
2014-01-03,-0.864873,-1.100285,-0.093128,0.494835
2014-01-04,-0.198615,0.34425,-1.704664,1.265533
2014-01-05,0.138431,0.729905,-2.103799,-0.38012
2014-01-06,0.024959,0.416609,1.561498,-0.088178
2014-01-07,-0.32438,-0.401447,-0.039363,-1.063489
2014-01-08,-1.844305,0.490536,1.218289,-0.304004
2014-01-09,-0.092354,-0.800876,0.492006,-1.14021
2014-01-10,0.623552,0.286183,1.184754,0.574648


#### Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [4]:
# Each keyword becomes a column label. The scalar values ('A', 'B', 'F') are
# repeated on every row; the array-like values ('C', 'D', 'E') supply four rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20130202'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['train', 'test', 'train', 'test']),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-02-02,1.0,3,train,foo
1,1.0,2013-02-02,1.0,3,test,foo
2,1.0,2013-02-02,1.0,3,train,foo
3,1.0,2013-02-02,1.0,3,test,foo


- The columns of the resulting DataFrame have different dtypes:

In [5]:
# Each column keeps its own dtype; the result lists one dtype per column label.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

## Viewing data
- Use `DataFrame.head()` and `DataFrame.tail()` to view the first and last rows of the data (five rows by default).

In [6]:
df.head()  # Shows the first five rows (no row count is passed, so the default is used).

Unnamed: 0,A,B,C,D
2014-01-01,1.569233,-0.678559,0.06705,0.819036
2014-01-02,-0.481405,-0.192512,0.947463,0.80632
2014-01-03,-0.864873,-1.100285,-0.093128,0.494835
2014-01-04,-0.198615,0.34425,-1.704664,1.265533
2014-01-05,0.138431,0.729905,-2.103799,-0.38012


In [7]:
df.tail()  # Shows the last five rows (no row count is passed, so the default is used).

Unnamed: 0,A,B,C,D
2014-01-06,0.024959,0.416609,1.561498,-0.088178
2014-01-07,-0.32438,-0.401447,-0.039363,-1.063489
2014-01-08,-1.844305,0.490536,1.218289,-0.304004
2014-01-09,-0.092354,-0.800876,0.492006,-1.14021
2014-01-10,0.623552,0.286183,1.184754,0.574648


- `DataFrame.index` shows the row labels of the data, and `DataFrame.columns` shows the column labels.

In [8]:
df.index
# The column labels are available the same way through df.columns.

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-08',
               '2014-01-09', '2014-01-10'],
              dtype='datetime64[ns]', freq='D')

- Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels:

In [9]:
# Returns the values as a NumPy array; the index and column labels are not included.
df.to_numpy()

array([[ 1.56923275, -0.67855872,  0.06705007,  0.81903623],
       [-0.48140508, -0.19251191,  0.94746284,  0.80631952],
       [-0.86487309, -1.10028492, -0.09312838,  0.49483545],
       [-0.19861477,  0.34424954, -1.70466447,  1.26553349],
       [ 0.13843142,  0.7299049 , -2.10379851, -0.38011958],
       [ 0.02495902,  0.41660904,  1.56149777, -0.08817842],
       [-0.32438039, -0.40144735, -0.03936315, -1.06348899],
       [-1.84430481,  0.49053585,  1.21828917, -0.30400428],
       [-0.09235413, -0.80087643,  0.4920063 , -1.14020981],
       [ 0.62355243,  0.28618318,  1.18475423,  0.574648  ]])

### Note:
- NumPy arrays have one dtype for the entire array while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. If the common data type is object, DataFrame.to_numpy() will require copying data.

In [10]:
# Print the per-column dtypes, a separator line, and the single mixed-type array
# that to_numpy() builds for this frame.
print(df2.dtypes, "-------------------------", df2.to_numpy(), sep="\n")

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object
-------------------------
[[1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'test' 'foo']]


- Use `describe()` for a quick statistical summary of the data; by default only numeric columns are included in the output.

In [11]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,-0.144976,-0.09062,0.153011,0.098437
std,0.894137,0.630338,1.228294,0.821685
min,-1.844305,-1.100285,-2.103799,-1.14021
25%,-0.442149,-0.609281,-0.079687,-0.361091
50%,-0.145484,0.046836,0.279528,0.203329
75%,0.110063,0.398519,1.125431,0.748402
max,1.569233,0.729905,1.561498,1.265533


- Transposing the data with `DataFrame.T` swaps the rows and columns:

In [12]:
# Transpose: the dates become the column labels and 'A'-'D' become the row labels.
df.T

- Use `sort_index()` to sort by the labels of an axis, in ascending or descending order.

In [13]:
df.sort_index(axis=1, ascending=False) # Sorts the column labels in descending order (D, C, B, A) and rearranges the DataFrame accordingly.
# sort_index() sorts by labels: the row labels when axis=0, the column labels when axis=1.
# ascending=False sorts in descending order (Z to A, or highest to lowest); use ascending=True for ascending order (A to Z).

Unnamed: 0,D,C,B,A
2014-01-01,0.819036,0.06705,-0.678559,1.569233
2014-01-02,0.80632,0.947463,-0.192512,-0.481405
2014-01-03,0.494835,-0.093128,-1.100285,-0.864873
2014-01-04,1.265533,-1.704664,0.34425,-0.198615
2014-01-05,-0.38012,-2.103799,0.729905,0.138431
2014-01-06,-0.088178,1.561498,0.416609,0.024959
2014-01-07,-1.063489,-0.039363,-0.401447,-0.32438
2014-01-08,-0.304004,1.218289,0.490536,-1.844305
2014-01-09,-1.14021,0.492006,-0.800876,-0.092354
2014-01-10,0.574648,1.184754,0.286183,0.623552


- Use `sort_values()` to sort by the values of a column:

In [14]:
# Reorder the rows by the values in column 'B', smallest first.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2014-01-03,-0.864873,-1.100285,-0.093128,0.494835
2014-01-09,-0.092354,-0.800876,0.492006,-1.14021
2014-01-01,1.569233,-0.678559,0.06705,0.819036
2014-01-07,-0.32438,-0.401447,-0.039363,-1.063489
2014-01-02,-0.481405,-0.192512,0.947463,0.80632
2014-01-10,0.623552,0.286183,1.184754,0.574648
2014-01-04,-0.198615,0.34425,-1.704664,1.265533
2014-01-06,0.024959,0.416609,1.561498,-0.088178
2014-01-08,-1.844305,0.490536,1.218289,-0.304004
2014-01-05,0.138431,0.729905,-2.103799,-0.38012


# Selection
### Note
- While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, the optimized pandas data access accessors `DataFrame.at`, `DataFrame.iat`, `DataFrame.loc` and `DataFrame.iloc` are recommended for production code.
#### Getitem([])
- For a DataFrame, passing a single label selects a column and yields a Series equivalent to `df.A`:

In [15]:
# Selecting a single column by its label returns a Series named after that column.
df['A']

2014-01-01    1.569233
2014-01-02   -0.481405
2014-01-03   -0.864873
2014-01-04   -0.198615
2014-01-05    0.138431
2014-01-06    0.024959
2014-01-07   -0.324380
2014-01-08   -1.844305
2014-01-09   -0.092354
2014-01-10    0.623552
Freq: D, Name: A, dtype: float64

- For a DataFrame, passing a slice (`:`) selects the matching rows:

In [16]:
df[0:3] # Positional slice: just like my_list[0:3], it returns the first 3 rows.

Unnamed: 0,A,B,C,D
2014-01-01,1.569233,-0.678559,0.06705,0.819036
2014-01-02,-0.481405,-0.192512,0.947463,0.80632
2014-01-03,-0.864873,-1.100285,-0.093128,0.494835


#### Selection by label
- Selecting a row matching a label:

In [17]:
df.loc[dates[0]]  # Selects the row of df whose index label equals dates[0].
# .loc[] accesses data by label (not by position); dates[0] is the first date in the DatetimeIndex.

A    1.569233
B   -0.678559
C    0.067050
D    0.819036
Name: 2014-01-01 00:00:00, dtype: float64

- Selecting all rows (`:`) with selected column labels:

In [18]:
# ':' keeps every row; the list keeps only the columns labeled 'A' and 'B'.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2014-01-01,1.569233,-0.678559
2014-01-02,-0.481405,-0.192512
2014-01-03,-0.864873,-1.100285
2014-01-04,-0.198615,0.34425
2014-01-05,0.138431,0.729905
2014-01-06,0.024959,0.416609
2014-01-07,-0.32438,-0.401447
2014-01-08,-1.844305,0.490536
2014-01-09,-0.092354,-0.800876
2014-01-10,0.623552,0.286183


- For label slicing, both endpoints are included:

In [19]:
# Label slices include both endpoints, so the row for 2014-01-04 is part of the result.
df.loc['20140101':'20140104',['A','B']]

Unnamed: 0,A,B
2014-01-01,1.569233,-0.678559
2014-01-02,-0.481405,-0.192512
2014-01-03,-0.864873,-1.100285
2014-01-04,-0.198615,0.34425


- Selecting a single row and column label returns a scalar:

In [20]:
# A row label plus a column label addresses exactly one cell, so a scalar is returned.
df.loc[dates[0],'A']

np.float64(1.5692327537268844)

- For getting fast access to a scalar (equivalent to the prior method):

In [21]:
# .at is the label-based scalar accessor; it returns the same value as df.loc[dates[0], 'A'].
df.at[dates[0], 'A']

np.float64(1.5692327537268844)

#### Selection by position
- Select via the position of the passed integers:

In [22]:
df.iloc[3]  # Selects the 4th row (index position 3) of df using integer-based indexing.
# .iloc[] stands for "integer location": it accesses rows/columns by position, not by label.

A   -0.198615
B    0.344250
C   -1.704664
D    1.265533
Name: 2014-01-04 00:00:00, dtype: float64

- Integer slices act similarly to NumPy/Python:

In [23]:
df.iloc[3:5, 0:2]  # Selects the rows at positions 3 and 4 and the columns at positions 0 and 1, using integer-based indexing.
# 3:5 → rows at positions 3 and 4 (5 is excluded); 0:2 → columns at positions 0 and 1 (2 is excluded).

Unnamed: 0,A,B
2014-01-04,-0.198615,0.34425
2014-01-05,0.138431,0.729905


- Lists of integer position locations:

In [24]:
df.iloc[[1,2,3], [0,2]]  # Selects the rows at positions 1, 2, 3 and the columns at positions 0 and 2, using integer-based indexing.
# rows → [1, 2, 3] (2nd, 3rd and 4th rows); columns → [0, 2] (1st and 3rd columns).

Unnamed: 0,A,C
2014-01-02,-0.481405,0.947463
2014-01-03,-0.864873,-0.093128
2014-01-04,-0.198615,-1.704664


- For slicing rows explicitly:

In [25]:
# ':' in the column position keeps all columns for the rows at positions 1 and 2.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2014-01-02,-0.481405,-0.192512,0.947463,0.80632
2014-01-03,-0.864873,-1.100285,-0.093128,0.494835


- For slicing columns explicitly:

In [26]:
# ':' in the row position keeps all rows for the columns at positions 1 and 2.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2014-01-01,-0.678559,0.06705
2014-01-02,-0.192512,0.947463
2014-01-03,-1.100285,-0.093128
2014-01-04,0.34425,-1.704664
2014-01-05,0.729905,-2.103799
2014-01-06,0.416609,1.561498
2014-01-07,-0.401447,-0.039363
2014-01-08,0.490536,1.218289
2014-01-09,-0.800876,0.492006
2014-01-10,0.286183,1.184754


- For getting a value explicitly:

In [27]:
# Row position 1 and column position 1 address a single cell, so a scalar is returned.
df.iloc[1,1]

np.float64(-0.1925119146456864)

- For getting fast access to a scalar (equivalent to the prior method):

In [28]:
# .iat is the position-based scalar accessor; it returns the same value as df.iloc[1,1].
df.iat[1,1]

np.float64(-0.1925119146456864)

## Boolean indexing
- Select rows where `df.A` is greater than 2 (values that large are uncommon in standard-normal data, so the result may be empty):

In [29]:
df[df['A']>2] # Keeps only the rows where column 'A' is greater than 2; the result is empty when no row qualifies.
# df['A'] > 2 builds a boolean Series (True/False for each row); df[...] keeps the rows where it is True.

Unnamed: 0,A,B,C,D


- Selecting values from a DataFrame where a boolean condition is met:

In [30]:
# Element-wise mask: values that are not greater than 0 come back as missing (NaN).
df[df>0]

Unnamed: 0,A,B,C,D
2014-01-01,1.569233,,0.06705,0.819036
2014-01-02,,,0.947463,0.80632
2014-01-03,,,,0.494835
2014-01-04,,0.34425,,1.265533
2014-01-05,0.138431,0.729905,,
2014-01-06,0.024959,0.416609,1.561498,
2014-01-07,,,,
2014-01-08,,0.490536,1.218289,
2014-01-09,,,0.492006,
2014-01-10,0.623552,0.286183,1.184754,0.574648


- Using isin() method for filtering:

In [31]:
# Build a new frame from `df` with an extra text column 'E'; `df` itself is left unchanged.
df2 = df.assign(
    E=['one', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
)
df2

Unnamed: 0,A,B,C,D,E
2014-01-01,1.569233,-0.678559,0.06705,0.819036,one
2014-01-02,-0.481405,-0.192512,0.947463,0.80632,one
2014-01-03,-0.864873,-1.100285,-0.093128,0.494835,two
2014-01-04,-0.198615,0.34425,-1.704664,1.265533,three
2014-01-05,0.138431,0.729905,-2.103799,-0.38012,four
2014-01-06,0.024959,0.416609,1.561498,-0.088178,five
2014-01-07,-0.32438,-0.401447,-0.039363,-1.063489,six
2014-01-08,-1.844305,0.490536,1.218289,-0.304004,seven
2014-01-09,-0.092354,-0.800876,0.492006,-1.14021,eight
2014-01-10,0.623552,0.286183,1.184754,0.574648,nine


In [32]:
# isin() is True for the rows whose 'E' value appears in the list; only those rows are kept.
df2[df2['E'].isin(['one', 'nine'])]

Unnamed: 0,A,B,C,D,E
2014-01-01,1.569233,-0.678559,0.06705,0.819036,one
2014-01-02,-0.481405,-0.192512,0.947463,0.80632,one
2014-01-10,0.623552,0.286183,1.184754,0.574648,nine


#### Setting
- Setting a new column automatically aligns the data by the indexes:

In [33]:
# Six integers labeled with six consecutive daily dates starting 2014-01-01.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20140101', periods=6),
)
s1

2014-01-01    1
2014-01-02    2
2014-01-03    3
2014-01-04    4
2014-01-05    5
2014-01-06    6
Freq: D, dtype: int64

In [34]:
# s1 is aligned on the index: dates that are missing from s1 get NaN in the new column 'F'.
df['F'] = s1

- Setting values by label:

In [35]:
# Overwrite the single cell at row label dates[0], column 'A'.
df.at[dates[0], 'A'] = 0

- Setting values by position:

In [36]:
# Overwrite the single cell at row position 1, column position 2 (column 'C').
df.iat[1,2] = 0

- Setting by assigning with a NumPy array:

In [37]:
# Replace every value in column 'D' with 5, using an array with one entry per row.
df.loc[:, 'D']  = np.array([5] * len(df))


- The result of the prior setting operations:

In [38]:
# Display the frame after the assignments made in the preceding cells.
df

Unnamed: 0,A,B,C,D,F
2014-01-01,0.0,-0.678559,0.06705,5.0,1.0
2014-01-02,-0.481405,-0.192512,0.0,5.0,2.0
2014-01-03,-0.864873,-1.100285,-0.093128,5.0,3.0
2014-01-04,-0.198615,0.34425,-1.704664,5.0,4.0
2014-01-05,0.138431,0.729905,-2.103799,5.0,5.0
2014-01-06,0.024959,0.416609,1.561498,5.0,6.0
2014-01-07,-0.32438,-0.401447,-0.039363,5.0,
2014-01-08,-1.844305,0.490536,1.218289,5.0,
2014-01-09,-0.092354,-0.800876,0.492006,5.0,
2014-01-10,0.623552,0.286183,1.184754,5.0,


- A where operation with setting:

In [39]:
# Work on a copy, then negate every strictly positive value so that nothing in df2 is greater than zero.
df2 = df.copy()
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2014-01-01,0.0,-0.678559,-0.06705,-5.0,-1.0
2014-01-02,-0.481405,-0.192512,0.0,-5.0,-2.0
2014-01-03,-0.864873,-1.100285,-0.093128,-5.0,-3.0
2014-01-04,-0.198615,-0.34425,-1.704664,-5.0,-4.0
2014-01-05,-0.138431,-0.729905,-2.103799,-5.0,-5.0
2014-01-06,-0.024959,-0.416609,-1.561498,-5.0,-6.0
2014-01-07,-0.32438,-0.401447,-0.039363,-5.0,
2014-01-08,-1.844305,-0.490536,-1.218289,-5.0,
2014-01-09,-0.092354,-0.800876,-0.492006,-5.0,
2014-01-10,-0.623552,-0.286183,-1.184754,-5.0,
