# Pandas Module Functions

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [None]:
# 1. DataFrame() function: DataFrame creation
# 2. concat() function: concatenates pandas objects
# 3. merge() function: merges DataFrame or named Series objects
# 4. join() function: joins columns of another DataFrame
# 5. MultiIndex: multi-level (hierarchical) index object

## Dataframe Creation

### DataFrame() function

pd.DataFrame(`data`, `index`, `columns`, `dtype`, `copy`)

In [108]:
# randn(2, 2) supplies a 2x2 array of random samples; no seed is set in the
# visible cells, so the values displayed below change on every run.
pd.DataFrame(randn(2,2), index = [0,1], columns = ["A", "B"])

Unnamed: 0,A,B
0,-0.162914,-0.040813
1,1.014284,-0.753487


#### from array

In [118]:
pd.DataFrame(np.arange(1,7).reshape(3,2), columns = ["A", "B"])

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6


#### new dataframe from existing

In [119]:
# Build a 3x2 frame holding the integers 1..6, filled row by row.
df = pd.DataFrame(np.arange(1, 7).reshape(-1, 2), columns=list("AB"))
df

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6


In [121]:
# Derive a new frame that keeps only the rows whose "A" value exceeds 2.
newdf = df.loc[df["A"].gt(2)]
newdf

Unnamed: 0,A,B
1,3,4
2,5,6


#### from list

In [123]:
pd.DataFrame([1,2,39,67,90], columns = ["nums"])

Unnamed: 0,nums
0,1
1,2
2,39
3,67
4,90


### from dictionary

In [10]:
# Three entries ("var1".."var3"), each holding 5 random integers drawn from [0, 10).
my_dict = {f"var{i}": np.random.randint(10, size=5) for i in range(1, 4)}

In [11]:
pd.DataFrame(data=my_dict)

Unnamed: 0,var1,var2,var3
0,0,2,5
1,2,9,6
2,2,8,7
3,9,9,1
4,9,9,8


## Combining DataFrames (concat, join, merge)

In [1]:
import pandas as pd
import numpy as np

### concat() function

`Docstring`: Concatenate pandas objects along a particular axis with optional set logic along the other axes.

In [55]:
# Two-row frame of string labels, explicitly indexed 0 and 1 (given row by row).
df1 = pd.DataFrame(
    [["A0", "B0"], ["A1", "B1"]],
    index=[0, 1],
    columns=["A", "B"],
)

In [60]:
# Same columns as df1, but the rows are indexed 2 and 3 so the two frames do not overlap.
df2 = pd.DataFrame(
    [["A2", "B2"], ["A3", "B3"]],
    index=[2, 3],
    columns=["A", "B"],
)

In [61]:
df1

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [62]:
df2

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [63]:
pd.concat([df1,df2], axis = 0, join = "outer") # Defaults: axis=0 (stack along the index) and join="outer".
                                                   # join="outer" means union: every distinct label on the other axis is kept.

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [68]:
pd.concat([df1,df2], axis = 0, join = "inner")  # join="inner" means intersection; both frames have columns A and B, so the result equals the outer case

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [69]:
pd.concat([df1,df2], axis = 1, join = "outer") # Concatenation is done in columns

Unnamed: 0,A,B,A.1,B.1
0,A0,B0,,
1,A1,B1,,
2,,,A2,B2
3,,,A3,B3


In [70]:
pd.concat([df1,df2], axis=1)

Unnamed: 0,A,B,A.1,B.1
0,A0,B0,,
1,A1,B1,,
2,,,A2,B2
3,,,A3,B3


### Combine two ``Series``.

In [71]:
# Two short string Series, each carrying the default 0..1 index.
s1 = pd.Series(list("ab"))
s2 = pd.Series(list("cd"))

0    a
1    b
dtype: object

In [73]:
s1

0    a
1    b
dtype: object

In [72]:
s2

0    c
1    d
dtype: object

In [79]:
pd.concat([s1,s2], axis = 0, join = "outer")

0    a
1    b
0    c
1    d
dtype: object

In [80]:
pd.concat([s1,s2], axis = 0, join= "inner")

0    a
1    b
0    c
1    d
dtype: object

In [81]:
pd.concat([s1,s2], axis = 1, join= "outer")

Unnamed: 0,0,1
0,a,c
1,b,d


In [82]:
pd.concat([s1,s2], axis = 1, join= "inner")

Unnamed: 0,0,1
0,a,c
1,b,d


## merge() function

`Docstring`: Merge DataFrame or named Series objects with a database-style join.

In [90]:
df1

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [91]:
df2

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [92]:
pd.merge(df1, df2, how ="inner", on= None)

Unnamed: 0,A,B


In [93]:
# Two frames sharing a "key" column with identical values (K0, K1), given row by row.
left = pd.DataFrame(
    [["K0", "A0", "B0"], ["K1", "A1", "B1"]],
    columns=["key", "A", "B"],
)
right = pd.DataFrame(
    [["K0", "C0", "D0"], ["K1", "C1", "D1"]],
    columns=["key", "C", "D"],
)

In [94]:
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1


In [95]:
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1


In [96]:
pd.merge(left,right, how="inner", on="key")

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1


In [97]:
# Frames with a composite key (key1, key2); only some key pairs occur on both sides.
left = pd.DataFrame(
    [
        ["K0", "K0", "A0", "B0"],
        ["K0", "K1", "A1", "B1"],
        ["K1", "K0", "A2", "B2"],
    ],
    columns=["key1", "key2", "A", "B"],
)
right = pd.DataFrame(
    [
        ["K0", "K0", "C0", "D0"],
        ["K1", "K0", "C1", "D1"],
        ["K1", "K0", "C2", "D2"],
    ],
    columns=["key1", "key2", "C", "D"],
)

In [98]:
left

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2


In [99]:
right

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2


In [101]:
pd.merge(left, right, how = "inner", on = "key1")

Unnamed: 0,key1,key2_x,A,B,key2_y,C,D
0,K0,K0,A0,B0,K0,C0,D0
1,K0,K1,A1,B1,K0,C0,D0
2,K1,K0,A2,B2,K0,C1,D1
3,K1,K0,A2,B2,K0,C2,D2


In [102]:
pd.merge(left, right, how = "inner", on = ["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [103]:
pd.merge(left, right, how="outer", on = ["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2


In [29]:
# Left/right frames whose key columns have different names and no values in common.
df111 = pd.DataFrame(dict(lkey=list("xyzx"), lvalue=[2, 3, 5, 7]))
df121 = pd.DataFrame(dict(rkey=list("abcb"), rvalue=[7, 8, 9, 10]))

In [30]:
df111

Unnamed: 0,lkey,lvalue
0,x,2
1,y,3
2,z,5
3,x,7


In [31]:
df121

Unnamed: 0,rkey,rvalue
0,a,7
1,b,8
2,c,9
3,b,10


In [32]:
pd.merge(df111, df121, left_on = "lkey", right_on = "rkey")  # lkey and rkey share no values, so this merge has nothing to match and returns an empty frame

Unnamed: 0,lkey,lvalue,rkey,rvalue


In [33]:
pd.merge(df111, df121, how = "outer", left_on = "lkey", right_on = "rkey")

Unnamed: 0,lkey,lvalue,rkey,rvalue
0,x,2.0,,
1,x,7.0,,
2,y,3.0,,
3,z,5.0,,
4,,,a,7.0
5,,,b,8.0
6,,,b,10.0
7,,,c,9.0


In [34]:
pd.merge(df111, df121, how = "left", left_on = "lkey", right_on = "rkey")

Unnamed: 0,lkey,lvalue,rkey,rvalue
0,x,2,,
1,y,3,,
2,z,5,,
3,x,7,,


## join() function

`Docstring`: Join columns of another DataFrame.

In [None]:
		
# Ø Join function:
#     • Convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.
#     • By default, join works on the index.
#     • Joining on a shared index:
#         ○ left.join(right, how="outer")
#         ○ With the default how="left", the left table's index is the basis; right-table rows with the same index are placed alongside.
#     • Joining when a column name is shared:
#         ○ The overlapping columns are combined by giving them suffixes.
#         ○ df.join(other, lsuffix="_df", rsuffix="_other")
#
#     • A function that offers little control.
#     • It will combine all the columns from the two tables, with the common columns renamed with the defined lsuffix, rsuffix.
#     • The way of joining is defined by "how":
#         ○ Inner and outer follow the same logic: by default the indexes are used; inner is the intersection, outer the union.
#         ○ Left join combines based on the left side's index. (0,1) the shared column A gets a suffix.
#         ○ Right join is the opposite.

In [35]:
# Frames keyed by their index labels; K0 and K2 appear in both, K1 only on the left, K3 only on the right.
left = pd.DataFrame(
    [["A0", "B0"], ["A1", "B1"], ["A2", "B2"]],
    index=["K0", "K1", "K2"],
    columns=["A", "B"],
)

right = pd.DataFrame(
    [["C0", "D0"], ["C2", "D2"], ["C3", "D3"]],
    index=["K0", "K2", "K3"],
    columns=["C", "D"],
)

In [36]:
left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2


In [37]:
right

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In [39]:
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [40]:
left.join(right, how = "outer")

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,,,C3,D3


# self imposed exercises

# reminder

In [13]:
pd.Series([10,88,3,4,5])

0    10
1    88
2     3
3     4
4     5
dtype: int64

In [20]:
# Sample inputs for the Series constructor: labels, a plain list, an ndarray and a dict.
label = list("abc")
my_data = [10, 20, 30]
arr = np.asarray(my_data)
d = dict(zip(label, my_data))

In [21]:
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [22]:
pd.Series(data=my_data, index= label)

a    10
b    20
c    30
dtype: int64

In [23]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

# exercise go on

In [24]:
s1 = pd.Series(["a","b"])

In [25]:
s1

0    a
1    b
dtype: object

In [26]:
s2= pd.Series(["c","d"])

In [27]:
pd.concat([s1,s2])

0    a
1    b
0    c
1    d
dtype: object

In [28]:
pd.concat([s1,s2], ignore_index=True)

0    a
1    b
2    c
3    d
dtype: object

In [32]:
pd.concat([s1,s2], keys= ["s1","s2"])

s1  0    a
    1    b
s2  0    c
    1    d
dtype: object

In [33]:
pd.concat([s1,s2], keys= ["s1","s2"], names= ["Serie Name", "Row ID"])

Serie Name  Row ID
s1          0         a
            1         b
s2          0         c
            1         d
dtype: object

In [34]:
# Two rows (a, 1) and (b, 2), described column by column.
df1 = pd.DataFrame({"letter": ["a", "b"], "number": [1, 2]})

In [35]:
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [36]:
# Two rows (c, 3) and (d, 4) with the same columns as df1.
df2 = pd.DataFrame({"letter": ["c", "d"], "number": [3, 4]})

In [37]:
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [38]:
pd.concat([df1,df2])

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [39]:
# Like df1/df2 but with an extra "animal" column.
df3 = pd.DataFrame(
    {
        "letter": ["a", "d"],
        "number": [3, 4],
        "animal": ["cat", "dog"],
    }
)

In [44]:
df3

Unnamed: 0,letter,number,animal
0,a,3,cat
1,d,4,dog


In [47]:
pd.concat([df1, df3], sort=False)

Unnamed: 0,letter,number,animal
0,a,1,
1,b,2,
0,a,3,cat
1,d,4,dog


In [50]:
pd.concat([df1, df3], join="inner")

Unnamed: 0,letter,number
0,a,1
1,b,2
0,a,3
1,d,4


In [51]:
# Animal/name pairs used for the column-wise (axis=1) concat example.
df4 = pd.DataFrame({"animal": ["bird", "monkey"], "name": ["polly", "george"]})

In [52]:
df4

Unnamed: 0,animal,name
0,bird,polly
1,monkey,george


In [53]:
pd.concat([df1,df4], axis =1)

Unnamed: 0,letter,number,animal,name
0,a,1,bird,polly
1,b,2,monkey,george


# merge

In [124]:
# Left frame for the merge exercise: key column "lkey" (with "foo" repeated) plus a "value" column.
df1 = pd.DataFrame(dict(lkey=["foo", "bar", "baz", "foo"], value=[1, 2, 3, 4]))

In [125]:
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,4


In [126]:
# Right frame for the merge exercise: key column "rkey" plus a "value" column of its own.
df2 = pd.DataFrame(dict(rkey=["foo", "bar", "baz", "foo"], value=[5, 6, 7, 8]))

In [127]:
df2

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [128]:
df1.merge(df2, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,value_x,rkey,value_y
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,4,foo,5
3,foo,4,foo,8
4,bar,2,bar,6
5,baz,3,baz,7


In [132]:
pd.merge(df1,df2)

Unnamed: 0,lkey,value,rkey


# self devised example

In [91]:
# Letters a..c paired with the numbers 1..3.
ab = pd.DataFrame(dict(letters=list("abc"), numbers=[1, 2, 3]))

In [92]:
ab

Unnamed: 0,letters,numbers
0,a,1
1,b,2
2,c,3


In [93]:
# Same letters as `ab`, paired with the numbers 4..6.
cd = pd.DataFrame(dict(letters=list("abc"), numbers=[4, 5, 6]))

In [94]:
cd

Unnamed: 0,letters,numbers
0,a,4
1,b,5
2,c,6


In [97]:
pd.merge(ab, cd, on="letters")

Unnamed: 0,letters,numbers_x,numbers_y
0,a,1,4
1,b,2,5
2,c,3,6


In [154]:
# 4x3 array holding the integers 1..12 in row-major order.
arr1 = np.arange(1, 13).reshape(4, 3)

In [155]:
arr1

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [156]:
df1 = pd.DataFrame(data=arr1, columns= ["a", "b", "c"])

In [157]:
df1

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9
3,10,11,12


In [158]:
# Same as arr1 except for the first value of the last two rows (8 and 11 instead of 7 and 10).
arr2 = np.array([1, 2, 3, 4, 5, 6, 8, 8, 9, 11, 11, 12]).reshape(4, 3)

In [159]:
df2 = pd.DataFrame(data=arr2, columns= ["a", "b", "c"])

In [160]:
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,8,8,9
3,11,11,12


In [161]:
pd.merge(df1, df2, on="a")

Unnamed: 0,a,b_x,c_x,b_y,c_y
0,1,2,3,2,3
1,4,5,6,5,6


In [176]:
# Six rows: keys K0..K5 with matching values A0..A5.
df = pd.DataFrame(
    {
        "key": [f"K{i}" for i in range(6)],
        "A": [f"A{i}" for i in range(6)],
    }
)

In [177]:
df

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5


In [178]:
# Three rows: keys K0..K2 with matching values B0..B2.
other = pd.DataFrame(
    {
        "key": [f"K{i}" for i in range(3)],
        "B": [f"B{i}" for i in range(3)],
    }
)

In [180]:
other

Unnamed: 0,key,B
0,K0,B0
1,K1,B1
2,K2,B2


In [182]:
# Both frames carry a "key" column, so an index-based join without suffixes
# cannot name the overlapping column and pandas raises ValueError.
# The error is what this cell demonstrates, so it is caught and displayed
# instead of aborting a "Restart & Run All".
# Passing lsuffix=/rsuffix= to join() is the way to resolve the clash.
try:
    df.join(other, how="outer")
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: columns overlap but no suffix specified: Index(['key'], dtype='object')

## MultiIndex() Function

`Docstring`: A multi-level, or hierarchical, index object for pandas objects.

`Type`: type

In [10]:
# Outer level: two groups of three; inner level: 1..3 repeated for each group.
outside = ["G1"] * 3 + ["G2"] * 3
inside = [1, 2, 3] * 2

In [11]:
hier_index = list(zip(outside, inside)) # zip pairs each outside label with the inside label at the same position, giving a list of tuples
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

### Creation of MultiIndex object 

In [20]:
hier_index = pd.MultiIndex.from_tuples(hier_index) # from_tuples arranges the tuples into a hierarchical (multi-level) index; hier_index is rebound from a list to a MultiIndex here
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [21]:
type(hier_index)

pandas.core.indexes.multi.MultiIndex

### Introducing this MultiIndex object as index argument to the df 

In [22]:
# randn(6, 2) supplies a 6x2 block of random values, one row per entry of hier_index.
# No seed is set in the visible cells, so the numbers differ between runs.
df = pd.DataFrame(randn(6,2), index = hier_index, columns = ["A", "B"])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.349432,-1.470956
G1,2,-1.899912,-0.082381
G1,3,0.242107,-0.277123
G2,1,-0.728832,-0.471035
G2,2,-2.210344,-0.199412
G2,3,1.579513,1.111629


### Level parameter

In [23]:
# Two parallel label lists: species (outer level) and status (inner level).
mylist = [
    ["Falcon"] * 2 + ["Parrot"] * 2,
    ["Captive", "Wild"] * 2,
]

In [24]:
mylist

[['Falcon', 'Falcon', 'Parrot', 'Parrot'],
 ['Captive', 'Wild', 'Captive', 'Wild']]

In [25]:
index = pd.MultiIndex.from_arrays(mylist)

In [26]:
index

MultiIndex([('Falcon', 'Captive'),
            ('Falcon',    'Wild'),
            ('Parrot', 'Captive'),
            ('Parrot',    'Wild')],
           )

In [30]:
# Single-column data: one float speed per (species, status) row.
maxSpeed = {"Max Speed": [float(v) for v in (390, 350, 30, 20)]}
maxSpeed

{'Max Speed': [390.0, 350.0, 30.0, 20.0]}

In [31]:
df = pd.DataFrame(maxSpeed, index = index)
df

Unnamed: 0,Unnamed: 1,Max Speed
Falcon,Captive,390.0
Falcon,Wild,350.0
Parrot,Captive,30.0
Parrot,Wild,20.0


In [35]:
df.groupby(level = 0).mean()

Unnamed: 0,Max Speed
Falcon,370.0
Parrot,25.0
