<a href="https://colab.research.google.com/github/JakubPac/kurs_data_science/blob/main/01_pandas_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Pandas
>Strona biblioteki: [https://pandas.pydata.org/](https://pandas.pydata.org/)  
>Dokumentacja: [https://pandas.pydata.org/pandas-docs/stable/](https://pandas.pydata.org/pandas-docs/stable/)
>
>Podstawowa biblioteka do analizy danych w języku Python.
>
>Aby zainstalować bibliotekę Pandas użyj polecenia poniżej:
```
pip install pandas
```
### Spis treści:
1. [Podstawowe struktury danych: pd.Series](#a1)
2. [Podstawowe struktury danych: pd.DataFrame](#a2)
3. [Selekcja kolumn](#a3)



### <a name='a1'></a>  Podstawowe struktury danych: pd.Series

In [None]:
import pandas as pd
pd.__version__

'2.2.2'

In [None]:
s = pd.Series(data = [3, 2, 4, 6])
s

Unnamed: 0,0
0,3
1,2
2,4
3,6


In [None]:
s = pd.Series(data = [3, 2, 4, 6], index = ['a', 'b', 'c', 'd'], name = 'sample')
s

Unnamed: 0,sample
a,3
b,2
c,4
d,6


In [None]:
s = pd.Series(data = [3., 2, 4, 6], index = ['a', 'b', 'c', 'd'], name = 'sample')
s

Unnamed: 0,sample
a,3.0
b,2.0
c,4.0
d,6.0


In [None]:
import numpy as np

In [None]:
np.nan

nan

In [None]:
s = pd.Series(data = [3., np.nan, 4, 6], index = ['a', 'b', 'c', 'd'], name = 'sample')
s

Unnamed: 0,sample
a,3.0
b,
c,4.0
d,6.0


In [None]:
s = pd.Series(data = [True, False, False])
s

Unnamed: 0,0
0,True
1,False
2,False


In [None]:
s = pd.Series(data = np.arange(10, 20), index = pd.date_range('20200101', periods = 10))
s

Unnamed: 0,0
2020-01-01,10
2020-01-02,11
2020-01-03,12
2020-01-04,13
2020-01-05,14
2020-01-06,15
2020-01-07,16
2020-01-08,17
2020-01-09,18
2020-01-10,19


In [None]:
list(s.index)

[Timestamp('2020-01-01 00:00:00'),
 Timestamp('2020-01-02 00:00:00'),
 Timestamp('2020-01-03 00:00:00'),
 Timestamp('2020-01-04 00:00:00'),
 Timestamp('2020-01-05 00:00:00'),
 Timestamp('2020-01-06 00:00:00'),
 Timestamp('2020-01-07 00:00:00'),
 Timestamp('2020-01-08 00:00:00'),
 Timestamp('2020-01-09 00:00:00'),
 Timestamp('2020-01-10 00:00:00')]

In [None]:
s = pd.Series(data = ['python', 'java', 'sql'], name = 'languages')
s

Unnamed: 0,languages
0,python
1,java
2,sql


In [None]:
type(s)

In [None]:
s.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10'],
              dtype='datetime64[ns]', freq='D')

In [None]:
s.values

array(['python', 'java', 'sql'], dtype=object)

In [None]:
s.dtype

dtype('O')

In [None]:
s.shape

(3,)

In [None]:
price = pd.Series({'Apple':200, 'CDP': 60, 'Amazon':1900, 'KGHM' : np.nan})
price

Unnamed: 0,0
Apple,200.0
CDP,60.0
Amazon,1900.0
KGHM,


In [None]:
price['CDP']

np.int64(60)

In [None]:
price[1]

  price[1]


np.int64(60)

In [None]:
price.count()

np.int64(3)

In [None]:
price.value_counts(dropna=False)

Unnamed: 0,count
200.0,1
60.0,1
1900.0,1
,1


In [None]:
price.sum()

np.int64(2160)

In [None]:
price.min()

60.0

In [None]:
price.max()

1900.0

In [None]:
price.std()

1024.3046421841502

In [None]:
price.describe()

Unnamed: 0,0
count,3.0
mean,720.0
std,1024.304642
min,60.0
25%,130.0
50%,200.0
75%,1050.0
max,1900.0


In [None]:
price.nlargest(1)

Unnamed: 0,0
Amazon,1900.0


In [None]:
price.nsmallest(3)

Unnamed: 0,0
CDP,60.0
Apple,200.0
Amazon,1900.0


In [None]:
price.rank()

Unnamed: 0,0
Apple,2.0
CDP,1.0
Amazon,3.0
KGHM,


In [None]:
price.sort_values()

Unnamed: 0,0
CDP,60.0
Apple,200.0
Amazon,1900.0
KGHM,


In [None]:
price.sort_values(ascending = False)

Unnamed: 0,0
Amazon,1900.0
Apple,200.0
CDP,60.0
KGHM,


In [None]:
price_pln = price.apply(lambda x: x * 3.8)

In [None]:
price_pln

Unnamed: 0,0
Apple,760.0
CDP,228.0
Amazon,7220.0
KGHM,


In [None]:
price

Unnamed: 0,0
Apple,200.0
CDP,60.0
Amazon,1900.0
KGHM,


### <a name='a2'></a>  Podstawowe struktury danych: pd.DataFrame


In [None]:
df = pd.DataFrame(data = [12, 12, 32])
df

Unnamed: 0,0
0,12
1,12
2,32


In [None]:
df = pd.DataFrame(data = [12, 12, 32], index = ['first', 'second', ' third'], columns = ['col1'])
df

Unnamed: 0,col1
first,12
second,12
third,32


In [None]:
df = pd.DataFrame(data = {'WIG20': ['PKN ORLEN', 'PKO BP'], 'mWIG40': ['Amica', 'Playway']})
df

Unnamed: 0,WIG20,mWIG40
0,PKN ORLEN,Amica
1,PKO BP,Playway


In [68]:
df = pd.DataFrame(data = [[10, 12, 13], [23, 12, 10]], index = ['first', 'second'], columns = ['col1', 'col2', 'col3'])
df

Unnamed: 0,col1,col2,col3
first,10,12,13
second,23,12,10


In [70]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [72]:
df.index

Index(['first', 'second'], dtype='object')

In [71]:
df.values

array([[10, 12, 13],
       [23, 12, 10]])

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, first to second
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col1    2 non-null      int64
 1   col2    2 non-null      int64
 2   col3    2 non-null      int64
dtypes: int64(3)
memory usage: 172.0+ bytes


In [75]:
df.describe()

Unnamed: 0,col1,col2,col3
count,2.0,2.0,2.0
mean,16.5,12.0,11.5
std,9.192388,0.0,2.12132
min,10.0,12.0,10.0
25%,13.25,12.0,10.75
50%,16.5,12.0,11.5
75%,19.75,12.0,12.25
max,23.0,12.0,13.0


In [76]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
col1,2.0,16.5,9.192388,10.0,13.25,16.5,19.75,23.0
col2,2.0,12.0,0.0,12.0,12.0,12.0,12.0,12.0
col3,2.0,11.5,2.12132,10.0,10.75,11.5,12.25,13.0


### <a name='a3'> </a> Selekcja kolumn

In [77]:
df

Unnamed: 0,col1,col2,col3
first,10,12,13
second,23,12,10


In [78]:
df['col1']

Unnamed: 0,col1
first,10
second,23


In [79]:
type(df['col1'])

In [80]:
df[['col1']]

Unnamed: 0,col1
first,10
second,23


In [81]:
type(df[['col1']])

In [90]:
df.columns = ['a', 'sprzedaz_grudzien', 'c']
df

Unnamed: 0,a,sprzedaz_grudzien,c
first,10,12,13
second,23,12,10


In [85]:
df.a

Unnamed: 0,a
first,10
second,23


In [91]:
df.sprzedaz_grudzien

Unnamed: 0,sprzedaz_grudzien
first,12
second,12


In [92]:
df['d'] = df.a + df.c

In [93]:
df

Unnamed: 0,a,sprzedaz_grudzien,c,d
first,10,12,13,23
second,23,12,10,33


In [94]:
df = pd.DataFrame(data = [[10, 12, 13], [23, 12, 10]], index = ['first', 'second'], columns = ['col1', 'col2', 'col3'])
df

Unnamed: 0,col1,col2,col3
first,10,12,13
second,23,12,10


In [97]:
df.loc['first']

Unnamed: 0,first
col1,10
col2,12
col3,13


In [98]:
df.iloc[0]

Unnamed: 0,first
col1,10
col2,12
col3,13


In [100]:
df.loc['first', 'col2']

np.int64(12)

In [101]:
df.loc[:, 'col2']

Unnamed: 0,col2
first,12
second,12


In [103]:
df.iloc[0, 1]

np.int64(12)