# pandas

[official docs](https://pandas.pydata.org/)


In [1]:
import pandas as pd

# Relative path to the Atlantis population dataset.
atlantis_csv_path = '../data/atlantis.csv'
atlantis_df = pd.read_csv(atlantis_csv_path)
# Ending the cell on the bare name renders the frame as the cell output.
atlantis_df

Unnamed: 0,year,population
0,2000,12400
1,2001,12800
2,2002,13800
3,2003,13600
4,2004,14200
5,2005,15600
6,2006,17600
7,2007,19200
8,2008,20300
9,2009,20800


In [2]:
# Preview the first five rows (head() defaults to n=5).
atlantis_df.head()


Unnamed: 0,year,population
0,2000,12400
1,2001,12800
2,2002,13800
3,2003,13600
4,2004,14200


In [3]:

# Print a summary of the frame: index range, columns, non-null counts, dtypes and memory usage.
atlantis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   year        22 non-null     int64
 1   population  22 non-null     int64
dtypes: int64(2)
memory usage: 480.0 bytes


In [4]:
# Data type of each column.
atlantis_df.dtypes

year          int64
population    int64
dtype: object

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
atlantis_df.describe()

Unnamed: 0,year,population
count,22.0,22.0
mean,2010.5,22418.181818
std,6.493587,7100.411548
min,2000.0,12400.0
25%,2005.25,16100.0
50%,2010.5,21800.0
75%,2015.75,27750.0
max,2021.0,33800.0


In [6]:
# Dimensions of the frame as a (rows, columns) tuple.
atlantis_df.shape

(22, 2)

In [7]:
# Column labels of the frame.
atlantis_df.columns

Index(['year', 'population'], dtype='object')

In [8]:
# Selecting a single column by name yields a Series; show its first five values.
atlantis_df['year'].head()

0    2000
1    2001
2    2002
3    2003
4    2004
Name: year, dtype: int64

In [9]:
# Column name -> column data; the scalar given for 'remarks' is repeated on every row.
column_data = {
    'year': atlantis_df['year'],
    'population': atlantis_df['population'],
    'remarks': 1,
}
new_df = pd.DataFrame(column_data)
new_df.head()

Unnamed: 0,year,population,remarks
0,2000,12400,1
1,2001,12800,1
2,2002,13800,1
3,2003,13600,1
4,2004,14200,1


In [10]:
# Passing a list of names selects several columns and yields a DataFrame.
new_df[['year', 'population']].head()

Unnamed: 0,year,population
0,2000,12400
1,2001,12800
2,2002,13800
3,2003,13600
4,2004,14200


## query rows by loc or iloc

In [11]:
# Label-based lookup: the row whose index label is 0, returned as a Series.
atlantis_df.loc[0]



year           2000
population    12400
Name: 0, dtype: int64

In [12]:
# A row label together with a column name returns a single scalar value.
atlantis_df.loc[0, 'year']

2000

In [13]:
# Position-based lookup: the first row (position 0), returned as a Series.
atlantis_df.iloc[0]

year           2000
population    12400
Name: 0, dtype: int64

In [14]:
# Row position 0 and column position 0 together return a single scalar value.
atlantis_df.iloc[0, 0]

2000

## query by condition

In [15]:
# Boolean mask: True for the rows whose population exceeds 30000.
over_30k = atlantis_df['population'] > 30000
# Indexing with the mask keeps only the rows marked True.
atlantis_df[over_30k]

Unnamed: 0,year,population
18,2018,32100
19,2019,32500
20,2020,33200
21,2021,33800


In [16]:
# One boolean mask per condition, combined element-wise with & (logical AND).
high_population = atlantis_df['population'] > 30000
up_to_2020 = atlantis_df['year'] <= 2020
atlantis_df[high_population & up_to_2020]

Unnamed: 0,year,population
18,2018,32100
19,2019,32500
20,2020,33200


## modify a value

In [17]:
# Overwrite a single cell, addressed by row label 20 and column name.
atlantis_df.loc[20, 'population'] = 20000
# Show the row again to confirm the new value.
atlantis_df.loc[20]

year           2020
population    20000
Name: 20, dtype: int64

## insert column

In [18]:
# Check the current state of new_df before adding a column.
new_df.head()

Unnamed: 0,year,population,remarks
0,2000,12400,1
1,2001,12800,1
2,2002,13800,1
3,2003,13600,1
4,2004,14200,1


In [19]:
# Label every row 'High' when its population is above 30000, otherwise 'Low'.
is_high = new_df['population'] > 30000
new_df['remarks1'] = is_high.map({True: 'High', False: 'Low'})
new_df.head()

Unnamed: 0,year,population,remarks,remarks1
0,2000,12400,1,Low
1,2001,12800,1,Low
2,2002,13800,1,Low
3,2003,13600,1,Low
4,2004,14200,1,Low


## add new row

In [22]:
# .iloc is strictly positional and cannot create new rows: position 22 does not
# exist yet, so the assignment below is left commented out.
# new_df.iloc[22] = [2022, 45000, 1, 'High']

# .loc is label-based: assigning to a label that is not in the index yet adds a new row.
new_df.loc[22] = [2022, 45000, 1, 'High']
new_df.tail()


Unnamed: 0,year,population,remarks,remarks1
18,2018,32100,1,High
19,2019,32500,1,High
20,2020,33200,1,High
21,2021,33800,1,High
22,2022,45000,1,High


In [None]:
# Preferred in pipelines: put the extra row in its own one-row DataFrame and
# concatenate; the result is a new frame and new_df is not modified.
new_row = pd.DataFrame([[2023, 46000, 1, 'High']], columns=new_df.columns)
extended_df = pd.concat([new_df, new_row], ignore_index=True)
extended_df.tail()

Unnamed: 0,year,population,remarks,remarks1
19,2019,32500,1,High
20,2020,33200,1,High
21,2021,33800,1,High
22,2022,45000,1,High
23,2023,46000,1,High


## drop column or row

In [None]:
# Returns a new frame without the 'remarks' column; new_df itself is not modified.
new_df.drop(columns=['remarks']).head()

In [None]:
# drop() returns a new DataFrame and leaves new_df unchanged, so its result must
# be used directly (or assigned) for the removal to have any visible effect.
# Describing the returned frame shows the statistics without the row labelled 0.
new_df.drop(index=0).describe()

## handling na values

In [None]:
# Boolean frame of the same shape: True wherever a value is missing.
new_df.isna().head()

In [None]:
# dropna() returns a copy without the rows that contain missing values.
# inplace=True is avoided: it would silently change new_df for every later cell
# and brings no benefit over using the returned frame.
new_df.dropna().head()

In [None]:
# Add a column made up entirely of missing values: one None per existing row.
new_df['na_col'] = [None] * len(new_df)
new_df.head()


In [None]:
# fillna(0) returns a copy in which every missing value is replaced by 0.
# A bare expression (rather than print) gives the rich table rendering.
new_df.fillna(0).head()

In [None]:
# Fill NA/NaN values by propagating the last valid observation forward to the next valid one.
# https://pandas.pydata.org/pandas-docs/version/2.1/reference/api/pandas.DataFrame.ffill.html#pandas.DataFrame.ffill
# A bare expression (rather than print) gives the rich table rendering.
new_df.ffill().head()

## sort, deduplicate and group

In [None]:
# Rows ordered from largest to smallest population; returns a new, sorted frame.
new_df.sort_values(by='population', ascending=False).head()

In [None]:
# Returns a new frame in which rows that duplicate an earlier row are removed.
new_df.drop_duplicates().head()

In [None]:
# Mean population per 'remarks1' group; the double brackets keep the result a DataFrame.
new_df.groupby('remarks1')[['population']].mean()

## apply and map

In [None]:
# Series.map applies the function to every element and returns a new Series.
# The result is shown without being written back to new_df['population']:
# overwriting the column here would add another 1000 each time the cell is re-run.
new_df['population'].map(lambda x: x + 1000).head()

In [None]:
def double_if_high(row):
    """Return the row's population, doubled when its 'remarks1' label is 'High'."""
    if row['remarks1'] == 'High':
        return row['population'] * 2
    return row['population']


# axis=1 passes each row to the function, one row at a time.
new_df.apply(double_if_high, axis=1).tail()

In [None]:
# String methods are reached through .str; upper() returns a new upper-cased
# Series and the 'remarks1' column itself is left as it was.
new_df['remarks1'].str.upper().head()


In [None]:
# str.contains is case-sensitive by default and the stored labels are
# 'High'/'Low', so 'HIGH' would never match; case=False ignores letter case.
# tail() is shown because the 'High' rows sit at the end of the frame.
new_df['remarks1'].str.contains('HIGH', case=False).tail()


In [None]:
# The labels are stored as 'High'/'Low', so upper-case them first; otherwise
# the pattern 'HIGH' matches nothing and the values come back unchanged.
new_df['remarks1'].str.upper().str.replace('HIGH', 'VERY HIGH').tail()

## dates and times

In [None]:
# Convert the 'year' column to datetime values, parsing each entry with the
# year-only format '%Y', and store the result in a new 'date' column.
new_df['date'] = pd.to_datetime(new_df['year'], format='%Y')
new_df.head()

In [None]:
# The .dt accessor exposes datetime components; extract the year from 'date'.
new_df['real_year'] = new_df['date'].dt.year
new_df.head()

In [None]:
# Extract the month component of 'date' into its own column.
new_df['real_month'] = new_df['date'].dt.month
new_df.head()

## save to files

In [None]:
# Write the frame to disk in three formats.
# index=False leaves the row index out of the written file.
new_df.to_csv('../data/atlantis_modified.csv', index=False)
# No orient is given, so pandas' default JSON layout is used.
new_df.to_json('../data/atlantis_modified.json')
# NOTE(review): to_excel needs an Excel writer engine (e.g. openpyxl) installed — confirm the environment provides one.
new_df.to_excel('../data/atlantis_modified.xlsx', index=False)

## some DataFrame attributes and methods

[DataFrame](https://pandas.pydata.org/pandas-docs/version/2.1/reference/frame.html)

In [27]:
# The frame's data as a NumPy array; mixed column types give dtype=object.
# The pandas documentation recommends DataFrame.to_numpy() over .values in new code.
new_df.values

array([[2000, 12400, 1, 'Low'],
       [2001, 12800, 1, 'Low'],
       [2002, 13800, 1, 'Low'],
       [2003, 13600, 1, 'Low'],
       [2004, 14200, 1, 'Low'],
       [2005, 15600, 1, 'Low'],
       [2006, 17600, 1, 'Low'],
       [2007, 19200, 1, 'Low'],
       [2008, 20300, 1, 'Low'],
       [2009, 20800, 1, 'Low'],
       [2010, 21200, 1, 'Low'],
       [2011, 22400, 1, 'Low'],
       [2012, 23400, 1, 'Low'],
       [2013, 24500, 1, 'Low'],
       [2014, 25800, 1, 'Low'],
       [2015, 26100, 1, 'Low'],
       [2016, 28300, 1, 'Low'],
       [2017, 29600, 1, 'Low'],
       [2018, 32100, 1, 'High'],
       [2019, 32500, 1, 'High'],
       [2020, 33200, 1, 'High'],
       [2021, 33800, 1, 'High'],
       [2022, 45000, 1, 'High']], dtype=object)

In [28]:
# List holding the row index followed by the column index.
new_df.axes

[Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22],
       dtype='int64'),
 Index(['year', 'population', 'remarks', 'remarks1'], dtype='object')]

In [29]:
# Number of dimensions; 2 for a DataFrame.
new_df.ndim

2

In [32]:
# Total number of elements, i.e. rows x columns.
new_df.size

92

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
new_df.shape

(23, 4)

In [49]:
# Bytes used by the index and by each column. With deep=False the contents of
# object columns are not inspected, so string columns are under-reported.
new_df.memory_usage(index=True, deep=False)

Index         740
year          184
population    184
remarks       184
remarks1      184
dtype: int64

In [50]:
# Bytes used by the index and by each column. With deep=True the contents of
# object columns are measured as well, giving the real footprint of the strings.
new_df.memory_usage(index=True, deep=True)


Index          740
year           184
population     184
remarks        184
remarks1      1385
dtype: int64

In [51]:
# True when the DataFrame contains no elements (any axis has length 0).
new_df.empty

False