# pandas in 10 minutes
- This is a short introduction to pandas. For more complex recipes, see the [Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook).


In [37]:
# Import the libraries under their conventional aliases
import numpy as np
import pandas as pd

## Basic data structures in pandas
- pandas provides two types of classes for handling data:
1. Series: a one-dimensional labeled array holding data of any type,
such as integers, strings, Python objects, etc.
2. DataFrame: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

In [38]:
# Build a DatetimeIndex of 10 consecutive days beginning on 2014-01-01.
# It serves as the row index of the DataFrame `df` created in the next cell.
dates = pd.date_range(start='20140101', periods=10)
dates

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-08',
               '2014-01-09', '2014-01-10'],
              dtype='datetime64[ns]', freq='D')

In [39]:
# Use a dedicated, seeded generator so the random values (and every output
# derived from them) are identical on each "Restart Kernel -> Run All".
# NOTE: outputs saved from earlier unseeded runs will differ until the
# notebook is re-executed.
rng = np.random.default_rng(42)

# 10 rows indexed by `dates`, 4 columns labelled 'A'..'D',
# filled with samples from the standard normal distribution.
df = pd.DataFrame(rng.standard_normal((10, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2014-01-01,1.187559,0.482007,-0.122165,-0.037711
2014-01-02,1.002335,1.121448,-1.068504,-0.341395
2014-01-03,0.396296,-0.281746,0.636223,-0.473842
2014-01-04,0.283642,-0.029515,1.27047,-1.58127
2014-01-05,-0.403508,-0.396598,-1.759281,0.331228
2014-01-06,0.0999,0.655618,-0.559479,0.440636
2014-01-07,2.297743,-0.437457,-0.108546,-0.157699
2014-01-08,0.289907,2.018889,-1.379746,0.842604
2014-01-09,1.057921,0.855279,0.590098,-0.599102
2014-01-10,2.494401,-0.094579,0.370613,-0.200282


#### Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [40]:
# Build a DataFrame from a mapping of column label -> column values.
# The scalar entries ('A', 'B', 'F') are repeated for every row, while the
# array-like entries ('C', 'D', 'E') supply one value per row.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20130202'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['train', 'test'] * 2),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-02-02,1.0,3,train,foo
1,1.0,2013-02-02,1.0,3,test,foo
2,1.0,2013-02-02,1.0,3,train,foo
3,1.0,2013-02-02,1.0,3,test,foo


- The columns of the resulting DataFrame have different dtypes:

In [41]:
# Each column keeps its own dtype; `.dtypes` lists them as a Series indexed by column label.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

## Viewing data
- Use `DataFrame.head()` and `DataFrame.tail()` to view the first and last rows of the data (five rows by default).

In [42]:
df.head()  # with no argument, shows the first (top) five rows

Unnamed: 0,A,B,C,D
2014-01-01,1.187559,0.482007,-0.122165,-0.037711
2014-01-02,1.002335,1.121448,-1.068504,-0.341395
2014-01-03,0.396296,-0.281746,0.636223,-0.473842
2014-01-04,0.283642,-0.029515,1.27047,-1.58127
2014-01-05,-0.403508,-0.396598,-1.759281,0.331228


In [43]:
df.tail()  # with no argument, shows the last (bottom) five rows

Unnamed: 0,A,B,C,D
2014-01-06,0.0999,0.655618,-0.559479,0.440636
2014-01-07,2.297743,-0.437457,-0.108546,-0.157699
2014-01-08,0.289907,2.018889,-1.379746,0.842604
2014-01-09,1.057921,0.855279,0.590098,-0.599102
2014-01-10,2.494401,-0.094579,0.370613,-0.200282


- `DataFrame.index` displays the row labels, while `DataFrame.columns` displays the column names of the dataset.

In [44]:
# Row labels of the frame (here the DatetimeIndex built from `dates`).
# Use `df.columns` instead to get the column labels.
df.index

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-08',
               '2014-01-09', '2014-01-10'],
              dtype='datetime64[ns]', freq='D')

- Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels:

In [45]:
# Returns the values as a plain 2-D NumPy array; the index and column labels are dropped.
df.to_numpy()

array([[ 1.18755862,  0.48200658, -0.12216533, -0.03771105],
       [ 1.00233536,  1.12144785, -1.06850375, -0.34139516],
       [ 0.39629598, -0.28174628,  0.63622259, -0.47384193],
       [ 0.28364198, -0.02951529,  1.27046975, -1.58126977],
       [-0.40350762, -0.39659848, -1.75928109,  0.33122811],
       [ 0.0999002 ,  0.6556181 , -0.55947874,  0.44063567],
       [ 2.29774314, -0.43745675, -0.10854591, -0.15769936],
       [ 0.28990719,  2.01888932, -1.37974585,  0.84260361],
       [ 1.05792119,  0.85527892,  0.5900983 , -0.59910163],
       [ 2.49440088, -0.09457942,  0.37061299, -0.20028188]])

### Note:
- NumPy arrays have one dtype for the entire array while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. If the common data type is object, DataFrame.to_numpy() will require copying data.

In [46]:
# df2 mixes several column dtypes, so to_numpy() has to fall back to a single
# object-dtype array. Print the dtypes, a separator line, and that array.
print(df2.dtypes, "-------------------------", df2.to_numpy(), sep="\n")

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object
-------------------------
[[1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-02-02 00:00:00') 1.0 3 'test' 'foo']]


- Use `describe()` for a quick statistical summary of the data. By default, only numeric columns are included in the output.

In [47]:
# Per-column summary: count, mean, std, min, quartiles (25%/50%/75%) and max.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.87062,0.389334,-0.213032,-0.177683
std,0.939881,0.792273,0.973585,0.66374
min,-0.403508,-0.437457,-1.759281,-1.58127
25%,0.285208,-0.234955,-0.941247,-0.44073
50%,0.699316,0.226246,-0.115356,-0.178991
75%,1.155149,0.805364,0.535227,0.238993
max,2.494401,2.018889,1.27047,0.842604


- Transposing the data swaps its rows and columns, i.e. rows become columns and columns become rows.

In [48]:
# Transpose the frame: the dates become the columns and 'A'..'D' become the rows.
df.T

- Use `sort_index()` to sort the data by its axis labels (row index or column names) in ascending or descending order.

In [49]:
# sort_index() sorts by axis labels: row labels when axis=0, column labels when axis=1.
# Here axis=1 with ascending=False reorders the columns in descending label order
# (D, C, B, A); the data moves along with its column. Use ascending=True for A to Z.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2014-01-01,-0.037711,-0.122165,0.482007,1.187559
2014-01-02,-0.341395,-1.068504,1.121448,1.002335
2014-01-03,-0.473842,0.636223,-0.281746,0.396296
2014-01-04,-1.58127,1.27047,-0.029515,0.283642
2014-01-05,0.331228,-1.759281,-0.396598,-0.403508
2014-01-06,0.440636,-0.559479,0.655618,0.0999
2014-01-07,-0.157699,-0.108546,-0.437457,2.297743
2014-01-08,0.842604,-1.379746,2.018889,0.289907
2014-01-09,-0.599102,0.590098,0.855279,1.057921
2014-01-10,-0.200282,0.370613,-0.094579,2.494401


- Use `sort_values()` to sort by the values of a column:

In [50]:
# Reorder the rows by the values in column 'B' (smallest first, as the output shows).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2014-01-07,2.297743,-0.437457,-0.108546,-0.157699
2014-01-05,-0.403508,-0.396598,-1.759281,0.331228
2014-01-03,0.396296,-0.281746,0.636223,-0.473842
2014-01-10,2.494401,-0.094579,0.370613,-0.200282
2014-01-04,0.283642,-0.029515,1.27047,-1.58127
2014-01-01,1.187559,0.482007,-0.122165,-0.037711
2014-01-06,0.0999,0.655618,-0.559479,0.440636
2014-01-09,1.057921,0.855279,0.590098,-0.599102
2014-01-02,1.002335,1.121448,-1.068504,-0.341395
2014-01-08,0.289907,2.018889,-1.379746,0.842604


# Selection
### Note
- While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code we recommend the optimized pandas data access accessors `DataFrame.at`, `DataFrame.iat`, `DataFrame.loc` and `DataFrame.iloc` (used with square brackets, e.g. `df.loc[...]`).
#### Getitem([])
- For a DataFrame, passing a single label selects a column and yields a Series equivalent to df.A:

In [51]:
# A single column label inside [] returns that column as a Series.
df['A']

2014-01-01    1.187559
2014-01-02    1.002335
2014-01-03    0.396296
2014-01-04    0.283642
2014-01-05   -0.403508
2014-01-06    0.099900
2014-01-07    2.297743
2014-01-08    0.289907
2014-01-09    1.057921
2014-01-10    2.494401
Freq: D, Name: A, dtype: float64

- For a DataFrame, passing a slice (`:`) selects the matching rows:

In [52]:
df[0:3]  # a slice inside [] selects rows: like my_list[0:3], this gives the first 3 rows

Unnamed: 0,A,B,C,D
2014-01-01,1.187559,0.482007,-0.122165,-0.037711
2014-01-02,1.002335,1.121448,-1.068504,-0.341395
2014-01-03,0.396296,-0.281746,0.636223,-0.473842


#### Selection by label
- Selecting a row matching a label:

In [60]:
# .loc[] accesses data by label, not by position.
# dates[0] is the first date of the index, so this returns that row as a Series.
df.loc[dates[0]]

A    1.187559
B    0.482007
C   -0.122165
D   -0.037711
Name: 2014-01-01 00:00:00, dtype: float64

- Selecting all rows (`:`) with selected column labels:

In [61]:
# ':' keeps every row; the list picks columns 'A' and 'B' by label.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2014-01-01,1.187559,0.482007
2014-01-02,1.002335,1.121448
2014-01-03,0.396296,-0.281746
2014-01-04,0.283642,-0.029515
2014-01-05,-0.403508,-0.396598
2014-01-06,0.0999,0.655618
2014-01-07,2.297743,-0.437457
2014-01-08,0.289907,2.018889
2014-01-09,1.057921,0.855279
2014-01-10,2.494401,-0.094579


- For label slicing, both endpoints are included:

In [63]:
# Label slices include both endpoints: rows 2014-01-01 through 2014-01-04 are returned.
df.loc['20140101':'20140104',['A','B']]

Unnamed: 0,A,B
2014-01-01,1.187559,0.482007
2014-01-02,1.002335,1.121448
2014-01-03,0.396296,-0.281746
2014-01-04,0.283642,-0.029515


- Selecting a single row and column label returns a scalar:

In [66]:
# One row label plus one column label returns a single scalar value.
df.loc[dates[0],'A']

np.float64(1.1875586204428668)

- For getting fast access to a scalar (equivalent to the prior method):

In [67]:
# .at is the scalar-only counterpart of .loc: same label-based lookup, same value.
df.at[dates[0], 'A']

np.float64(1.1875586204428668)

#### Selection by position
- Select via the position of the passed integers:

In [69]:
# .iloc stands for integer-location: it selects rows/columns by position, not by label.
# Position 3 is the 4th row, which is returned here as a Series.
df.iloc[3]

A    0.283642
B   -0.029515
C    1.270470
D   -1.581270
Name: 2014-01-04 00:00:00, dtype: float64

- Integer slices act similarly to NumPy/Python:

In [None]:
# Positional slices exclude the stop value, as in Python/NumPy:
# 3:5 -> rows at positions 3 and 4; 0:2 -> columns at positions 0 and 1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2014-01-04,0.283642,-0.029515
2014-01-05,-0.403508,-0.396598


- Lists of integer position locations:

In [72]:
# Lists of positions: rows [1, 2, 3] are the 2nd, 3rd and 4th rows;
# columns [0, 2] are the 1st and 3rd columns ('A' and 'C').
df.iloc[[1,2,3], [0,2]]

Unnamed: 0,A,C
2014-01-02,1.002335,-1.068504
2014-01-03,0.396296,0.636223
2014-01-04,0.283642,1.27047


- For slicing rows explicitly:

In [73]:
# Rows at positions 1 and 2; ':' keeps all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2014-01-02,1.002335,1.121448,-1.068504,-0.341395
2014-01-03,0.396296,-0.281746,0.636223,-0.473842


- For slicing columns explicitly:

In [74]:
# ':' keeps all rows; columns at positions 1 and 2 ('B' and 'C').
df.iloc[:, 1:3]

Unnamed: 0,B,C
2014-01-01,0.482007,-0.122165
2014-01-02,1.121448,-1.068504
2014-01-03,-0.281746,0.636223
2014-01-04,-0.029515,1.27047
2014-01-05,-0.396598,-1.759281
2014-01-06,0.655618,-0.559479
2014-01-07,-0.437457,-0.108546
2014-01-08,2.018889,-1.379746
2014-01-09,0.855279,0.590098
2014-01-10,-0.094579,0.370613


- For getting a value explicitly:

In [75]:
# Single row position and single column position -> one scalar (2nd row, 2nd column).
df.iloc[1,1]

np.float64(1.1214478478964671)

- For getting fast access to a scalar (equivalent to the prior method):

In [76]:
# .iat is the scalar-only counterpart of .iloc: same positional lookup, same value.
df.iat[1,1]

np.float64(1.1214478478964671)

## Boolean indexing
- Select rows where df.A is greater than 2.

In [None]:
# df['A'] > 2 builds a boolean Series (one True/False per row);
# indexing df with it keeps only the rows where the value is True.
df[df['A']>2]

Unnamed: 0,A,B,C,D
2014-01-07,2.297743,-0.437457,-0.108546,-0.157699
2014-01-10,2.494401,-0.094579,0.370613,-0.200282


- Selecting values from a DataFrame where a boolean condition is met:

In [79]:
# A boolean DataFrame used as a mask keeps the original shape;
# entries where the condition is False come back as missing values (NaN).
df[df>0]

Unnamed: 0,A,B,C,D
2014-01-01,1.187559,0.482007,,
2014-01-02,1.002335,1.121448,,
2014-01-03,0.396296,,0.636223,
2014-01-04,0.283642,,1.27047,
2014-01-05,,,,0.331228
2014-01-06,0.0999,0.655618,,0.440636
2014-01-07,2.297743,,,
2014-01-08,0.289907,2.018889,,0.842604
2014-01-09,1.057921,0.855279,0.590098,
2014-01-10,2.494401,,0.370613,


- Using isin() method for filtering:

In [None]:
# Derive a new frame from df with an extra string column 'E' (one label per row);
# assign() returns a new DataFrame, so df itself is not modified.
df2 = df.assign(
    E=['one', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
)
df2

Unnamed: 0,A,B,C,D,E
2014-01-01,1.187559,0.482007,-0.122165,-0.037711,one
2014-01-02,1.002335,1.121448,-1.068504,-0.341395,one
2014-01-03,0.396296,-0.281746,0.636223,-0.473842,two
2014-01-04,0.283642,-0.029515,1.27047,-1.58127,three
2014-01-05,-0.403508,-0.396598,-1.759281,0.331228,four
2014-01-06,0.0999,0.655618,-0.559479,0.440636,five
2014-01-07,2.297743,-0.437457,-0.108546,-0.157699,six
2014-01-08,0.289907,2.018889,-1.379746,0.842604,seven
2014-01-09,1.057921,0.855279,0.590098,-0.599102,eight
2014-01-10,2.494401,-0.094579,0.370613,-0.200282,nine


In [88]:
# isin() marks the rows whose 'E' value appears in the given list;
# the resulting boolean mask then filters df2.
df2[df2['E'].isin(['one', 'nine'])]

Unnamed: 0,A,B,C,D,E
2014-01-01,1.187559,0.482007,-0.122165,-0.037711,one
2014-01-02,1.002335,1.121448,-1.068504,-0.341395,one
2014-01-10,2.494401,-0.094579,0.370613,-0.200282,nine


#### Setting
- Setting a new column automatically aligns the data by the indexes:

In [90]:
# A Series holding 1..6, labelled with the six consecutive days starting 2014-01-01.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start='20140101', periods=6),
)
s1

2014-01-01    1
2014-01-02    2
2014-01-03    3
2014-01-04    4
2014-01-05    5
2014-01-06    6
Freq: D, dtype: int64

In [91]:
# Values are matched on the index: s1 only covers the first six dates,
# so the remaining rows of the new column 'F' are left as NaN.
df['F'] = s1

- Setting values by label:

In [93]:
df.at[dates[0], 'A'] = 0  # label-based: row dates[0], column 'A'

- Setting values by position:

In [94]:
df.iat[1,2] = 0  # position-based: 2nd row, 3rd column ('C')

- Setting by assigning with a NumPy array:

In [None]:
# Overwrite every row of column 'D' with the value 5, supplied as a NumPy array
# with one element per row of df.
df.loc[:, 'D'] = np.full(len(df), 5)


- The result of the prior setting operations:

In [96]:
# Show the frame after the assignments above: new column 'F', the two zeroed cells, and 'D' set to 5.
df

Unnamed: 0,A,B,C,D,F
2014-01-01,0.0,0.482007,-0.122165,5.0,1.0
2014-01-02,1.002335,1.121448,0.0,5.0,2.0
2014-01-03,0.396296,-0.281746,0.636223,5.0,3.0
2014-01-04,0.283642,-0.029515,1.27047,5.0,4.0
2014-01-05,-0.403508,-0.396598,-1.759281,5.0,5.0
2014-01-06,0.0999,0.655618,-0.559479,5.0,6.0
2014-01-07,2.297743,-0.437457,-0.108546,5.0,
2014-01-08,0.289907,2.018889,-1.379746,5.0,
2014-01-09,1.057921,0.855279,0.590098,5.0,
2014-01-10,2.494401,-0.094579,0.370613,5.0,


- A where operation with setting:

In [None]:
# Work on a copy so that df itself stays untouched.
df2 = df.copy()
# Wherever a value is positive, overwrite it with its negation;
# non-positive values and NaN are left as they are.
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2014-01-01,0.0,-0.482007,-0.122165,-5.0,-1.0
2014-01-02,-1.002335,-1.121448,0.0,-5.0,-2.0
2014-01-03,-0.396296,-0.281746,-0.636223,-5.0,-3.0
2014-01-04,-0.283642,-0.029515,-1.27047,-5.0,-4.0
2014-01-05,-0.403508,-0.396598,-1.759281,-5.0,-5.0
2014-01-06,-0.0999,-0.655618,-0.559479,-5.0,-6.0
2014-01-07,-2.297743,-0.437457,-0.108546,-5.0,
2014-01-08,-0.289907,-2.018889,-1.379746,-5.0,
2014-01-09,-1.057921,-0.855279,-0.590098,-5.0,
2014-01-10,-2.494401,-0.094579,-0.370613,-5.0,
