In [1]:
import pandas as pd
import numpy as np

In [3]:
# Build a Series from a plain list; no index is passed, so pandas supplies a default integer one.
data = pd.Series([0.25, 0.5, 0.75, 1.0])

In [4]:
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [5]:
print(data.index)  # the index object that labels the values

RangeIndex(start=0, stop=4, step=1)


In [6]:
print(data.values)  # the underlying values as a NumPy array

[0.25 0.5  0.75 1.  ]


In [7]:
# Same values as before, but the index labels are now set explicitly to `str` objects.
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)

In [8]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
# Population figures keyed by state name; the dict keys become the Series index.
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])
population = pd.Series(data=population_dict)

In [10]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [11]:
population['California':'Illinois']  # slicing by label includes the end label ('Illinois')

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [15]:
# Area figures for the same five states, keyed by state name.
area_dict = dict(zip(
    ('California', 'Texas', 'New York', 'Florida', 'Illinois'),
    (423967, 695662, 141297, 170312, 149995),
))
area = pd.Series(data=area_dict)

In [16]:
# Build a `DataFrame` whose two columns are the `population` and `area` Series.
states = pd.DataFrame(dict(population=population, area=area))


In [18]:
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [19]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [20]:
# A Series whose explicit index consists of non-consecutive integers.
data = pd.Series(list('abc'), index=[1, 3, 5])
data


1    a
3    b
5    c
dtype: object

In [21]:
data[1]  # a single integer in [] is looked up in the Series' explicit index (label 1)

'a'

In [22]:
data[1:3]  # an integer slice in [] is positional, like NumPy indexing (positions 1 and 2)

3    b
5    c
dtype: object

In [23]:
data.loc[1]  # loc uses the Series' explicit index labels

'a'

In [25]:
data.iloc[1]  # iloc uses positional (NumPy-array style) indices

'b'

In [26]:
data.iloc[1:3]  # iloc slices by position (NumPy-array style), end position excluded

3    b
5    c
dtype: object

In [30]:
# Missing data in pandas
# With None among the values, NumPy stores the array with dtype=object.
vals1 = np.array([1, None, 3, 4])


In [31]:
vals1

array([1, None, 3, 4], dtype=object)

In [32]:
vals2 = np.array([1, np.nan, 3, 4])  # with np.nan as the missing marker the array is float64

In [33]:
vals2.dtype

dtype('float64')

In [34]:
vals2

array([ 1., nan,  3.,  4.])

In [35]:
vals2.sum()

nan

In [36]:
vals2.sum(), vals2.min(), vals2.max()  # ordinary aggregations propagate NaN

(nan, nan, nan)

In [37]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)  # the nan* variants skip NaN values

(8.0, 1.0, 4.0)

In [42]:
# Series holding both np.nan and None alongside real values.
data = pd.Series([1, np.nan, 'hello', None])

In [43]:
data.isnull()  # find missing values (True for both np.nan and None)

0    False
1     True
2    False
3     True
dtype: bool

In [44]:
data[data.notnull()]  # select the non-missing values

0        1
2    hello
dtype: object

In [45]:
data.dropna()  # drop the missing values

0        1
2    hello
dtype: object

In [47]:
data.fillna(0)  # fill the missing values with 0

0        1
1        0
2    hello
3        0
dtype: object

In [48]:
### Combining DataFrames in pandas

In [49]:
# Two Series with disjoint integer indexes.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])

In [50]:
pd.concat([ser1, ser2])  # pd.concat is the function used to join DataFrames (here applied to two Series); it works much like np.concatenate

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [53]:
# Two tables that share an 'employee' column; the rows are listed in different orders.
df1 = pd.DataFrame(dict(
    employee=['Bob', 'Jake', 'Lisa', 'Sue'],
    group=['Accounting', 'Engineering', 'Engineering', 'HR'],
))
df2 = pd.DataFrame(dict(
    employee=['Lisa', 'Bob', 'Jake', 'Sue'],
    hire_date=[2004, 2008, 2012, 2014],
))

In [54]:
# No key is given: merge recognises the shared 'employee' column by itself
# and joins the two tables on it.
df3 = df1.merge(df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [55]:
# One supervisor per group.
df4 = pd.DataFrame(dict(
    group=['Accounting', 'Engineering', 'HR'],
    supervisor=['Carly', 'Guido', 'Steve'],
))

In [56]:
# Many-to-one join on the shared 'group' column: a supervisor is repeated for every employee of that group.
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [57]:
# Several skills per group, so 'group' values repeat in this table.
df5 = pd.DataFrame(dict(
    group=['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
    skills=['math', 'spreadsheets', 'coding', 'linux', 'spreadsheets', 'organization'],
))

In [58]:
# 'group' values repeat in both tables, so every matching pair of rows appears in the result.
pd.merge(df1, df5)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


In [60]:
pd.merge(df1, df2, on='employee')  # the column to join on can be named explicitly through the `on` parameter

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [61]:
# Salary table; its key column is called 'name' rather than 'employee'.
df3 = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue'],
    salary=[70000, 80000, 120000, 90000],
))

In [62]:
# If the tables use different names for the column we want to join on, we can use left_on and right_on

In [63]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [64]:
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [65]:
# Match df1's 'employee' against df3's 'name'; both key columns are kept in the result.
pd.merge(df1, df3, left_on="employee", right_on="name")

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,120000
3,Sue,HR,Sue,90000


In [66]:
# The second key column ('name') is redundant after the join, so drop it.
(
    pd.merge(df1, df3, left_on="employee", right_on="name")
    .drop(columns="name")
)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


In [69]:
def make_df(cols, ind):
    """Build a DataFrame whose cells are named after their column and row.

    Each cell holds ``str(column) + str(row_label)``, e.g. ``'A1'``.

    Parameters
    ----------
    cols : iterable
        Column labels. Iterating a string such as ``'AB'`` yields one
        column per character.
    ind : iterable
        Row labels; also used as the index of the result. May be a
        one-shot iterator (generator, ``iter(...)``).

    Returns
    -------
    pandas.DataFrame
        One column per element of ``cols`` and one row per element of ``ind``.
    """
    # `ind` is traversed once per column and once more for the index. A
    # one-shot iterator would be exhausted after the first column, so
    # materialise it; re-iterable inputs are passed through untouched.
    if hasattr(ind, "__next__"):
        ind = list(ind)
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)



In [70]:
# Rebind df1/df2 to two small frames with partly overlapping columns and row labels.
df1 = make_df(cols='AB', ind=[1, 2, 4])
df2 = make_df(cols='ABCD', ind=[1, 2, 5, 6])

In [71]:
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
4,A4,B4


In [72]:
df2

Unnamed: 0,A,B,C,D
1,A1,B1,C1,D1
2,A2,B2,C2,D2
5,A5,B5,C5,D5
6,A6,B6,C6,D6


In [73]:
pd.merge(df1, df2, on='A', how='left')  # a new 0-based index is generated for the result

Unnamed: 0,A,B_x,B_y,C,D
0,A1,B1,B1,C1,D1
1,A2,B2,B2,C2,D2
2,A4,B4,,,


In [74]:
pd.merge(df1, df2, on='A', how='right')  # every df2 row is kept; df1 columns are empty where 'A' has no match

Unnamed: 0,A,B_x,B_y,C,D
0,A1,B1,B1,C1,D1
1,A2,B2,B2,C2,D2
2,A5,,B5,C5,D5
3,A6,,B6,C6,D6


In [75]:
pd.merge(df1, df2, on='A', how='outer')  # no data is lost from either side; the shared column B appears twice, as B_x and B_y

Unnamed: 0,A,B_x,B_y,C,D
0,A1,B1,B1,C1,D1
1,A2,B2,B2,C2,D2
2,A4,B4,,,
3,A5,,B5,C5,D5
4,A6,,B6,C6,D6


In [76]:
pd.merge(df1, df2, on='A', how='inner')  # only 'A' values present in both frames are kept

Unnamed: 0,A,B_x,B_y,C,D
0,A1,B1,B1,C1,D1
1,A2,B2,B2,C2,D2


In [77]:
# Aggregation and grouping

In [78]:
# Draw the demo data from a seeded local generator so the values are the same
# on every Restart & Run All; re-run the cells to refresh any output saved
# from an earlier, unseeded run.
rng = np.random.default_rng(0)
df = pd.DataFrame({'A': rng.random(5),
                   'B': rng.random(5)})

In [79]:
df

Unnamed: 0,A,B
0,0.32627,0.889953
1,0.869569,0.216077
2,0.813881,0.311809
3,0.188347,0.159953
4,0.001571,0.59892


In [80]:
df.mean()

A    0.439928
B    0.435343
dtype: float64

In [81]:
# Toy table for the groupby examples: three keys, each occurring twice.
df = pd.DataFrame(
    {'key': list('ABCABC'), 'data': range(6)},
    columns=['key', 'data'],
)


In [82]:
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [83]:
df.groupby('key')  # yields a DataFrameGroupBy object, not a result table

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000215CC25D720>

In [84]:
df.groupby('key').sum()  # sum of `data` within each key

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [85]:
df.groupby('key')['data'].median()  # median of `data` within each key, returned as a Series

key
A    1.5
B    2.5
C    3.5
Name: data, dtype: float64