# Pandas Module Functions

In [40]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [None]:
1. DataFrame() function: DataFrame creation
2. concat() function: concatenates pandas objects
3. merge() function: Merge DataFrame or named Series objects
4. join() function:Join columns of another DataFrame
5. multi-index() function
6. Index() function
7. date_range() function
8. read_csv() function
9. to_excel() function
10. get_dummies() function

## Dataframe Creation

### DataFrame() function

pd.DataFrame(`data`, `index`, `columns`, `dtype`, `copy`)

In [7]:
pd.DataFrame(randn(2,2), index = [0,1], columns = ["A", "B"])

Unnamed: 0,A,B
0,-1.200457,-0.708383
1,-0.85854,0.702687


#### from array

In [118]:
pd.DataFrame(np.arange(1,7).reshape(3,2), columns = ["A", "B"])

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6


#### new dataframe from existing

In [119]:
df = pd.DataFrame(np.arange(1,7).reshape(3,2), columns = ["A", "B"])
df

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6


In [121]:
newdf = df[df["A"]>2]
newdf

Unnamed: 0,A,B
1,3,4
2,5,6


#### from list

In [123]:
pd.DataFrame([1,2,39,67,90], columns = ["nums"])

Unnamed: 0,nums
0,1
1,2
2,39
3,67
4,90


### from dictionary

In [10]:
my_dict = {"var1": np.random.randint(10, size=5), "var2":np.random.randint(10, size=5), "var3":np.random.randint(10, size=5)}

In [11]:
pd.DataFrame(data=my_dict)

Unnamed: 0,var1,var2,var3
0,0,2,5
1,2,9,6
2,2,8,7
3,9,9,1
4,9,9,8


## DataFrame Birleştirme (concat, join, merge)

In [1]:
import pandas as pd
import numpy as np

### concat() function

`Docstring`: Concatenate pandas objects along a particular axis with optional set logic along the other axes.

In [55]:
df1 = pd.DataFrame({'A': ['A0', 'A1'],
                    'B': ['B0', 'B1']}, index=[0, 1])

In [60]:
df2 = pd.DataFrame({'A': ['A2', 'A3'],
                    'B': ['B2', 'B3']}, index=[2, 3]) 

In [61]:
df1

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [62]:
df2

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [63]:
pd.concat([df1,df2], axis = 0, join = "outer") # Default: axis: index, 
                                                   # join:outer, means birleşim yani tüm özgün satırları ekler

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [68]:
pd.concat([df1,df2], axis = 0, join = "inner")  # join:inner, kesişim

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [69]:
pd.concat([df1,df2], axis = 1, join = "outer") # Concatenation is done in columns

Unnamed: 0,A,B,A.1,B.1
0,A0,B0,,
1,A1,B1,,
2,,,A2,B2
3,,,A3,B3


In [70]:
pd.concat([df1,df2], axis=1)

Unnamed: 0,A,B,A.1,B.1
0,A0,B0,,
1,A1,B1,,
2,,,A2,B2
3,,,A3,B3


### Combine two ``Series``.

In [71]:
s1 = pd.Series(["a", "b"])
s2 = pd.Series(["c", "d"])

0    a
1    b
dtype: object

In [73]:
s1

0    a
1    b
dtype: object

In [72]:
s2

0    c
1    d
dtype: object

In [79]:
pd.concat([s1,s2], axis = 0, join = "outer")

0    a
1    b
0    c
1    d
dtype: object

In [80]:
pd.concat([s1,s2], axis = 0, join= "inner")

0    a
1    b
0    c
1    d
dtype: object

In [81]:
pd.concat([s1,s2], axis = 1, join= "outer")

Unnamed: 0,0,1
0,a,c
1,b,d


In [82]:
pd.concat([s1,s2], axis = 1, join= "inner")

Unnamed: 0,0,1
0,a,c
1,b,d


## merge() function

`Docstring`: Merge DataFrame or named Series objects with a database-style join.

In [90]:
df1

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [91]:
df2

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [92]:
pd.merge(df1, df2, how ="inner", on= None)

Unnamed: 0,A,B


In [93]:
left = pd.DataFrame({'key': ['K0', 'K1'],
                     'A': ['A0', 'A1'],
                     'B': ['B0', 'B1']})
   
right = pd.DataFrame({'key': ['K0', 'K1'],
                          'C': ['C0', 'C1'],
                          'D': ['D0', 'D1']})   

In [94]:
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1


In [95]:
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1


In [96]:
pd.merge(left,right, how="inner", on="key")

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1


In [97]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1'],
                     'key2': ['K0', 'K1', 'K0'],
                        'A': ['A0', 'A1', 'A2'],
                        'B': ['B0', 'B1', 'B2']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1'],
                               'key2': ['K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2'],
                                  'D': ['D0', 'D1', 'D2']})

In [98]:
left

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2


In [99]:
right

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2


In [101]:
pd.merge(left, right, how = "inner", on = "key1")

Unnamed: 0,key1,key2_x,A,B,key2_y,C,D
0,K0,K0,A0,B0,K0,C0,D0
1,K0,K1,A1,B1,K0,C0,D0
2,K1,K0,A2,B2,K0,C1,D1
3,K1,K0,A2,B2,K0,C2,D2


In [102]:
pd.merge(left, right, how = "inner", on = ["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [103]:
pd.merge(left, right, how="outer", on = ["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2


In [29]:
df111= pd.DataFrame({"lkey": ["x", "y", "z", "x"],
                    "lvalue": [2,3,5,7]})
df121= pd.DataFrame({"rkey": ["a", "b", "c", "b"],
                   "rvalue": [7,8,9,10]})

In [30]:
df111

Unnamed: 0,lkey,lvalue
0,x,2
1,y,3
2,z,5
3,x,7


In [31]:
df121

Unnamed: 0,rkey,rvalue
0,a,7
1,b,8
2,c,9
3,b,10


In [32]:
pd.merge(df111, df121, left_on = "lkey", right_on = "rkey")#ortak değer yok. merge fonk anlamsız. 

Unnamed: 0,lkey,lvalue,rkey,rvalue


In [33]:
pd.merge(df111, df121, how = "outer", left_on = "lkey", right_on = "rkey")

Unnamed: 0,lkey,lvalue,rkey,rvalue
0,x,2.0,,
1,x,7.0,,
2,y,3.0,,
3,z,5.0,,
4,,,a,7.0
5,,,b,8.0
6,,,b,10.0
7,,,c,9.0


In [34]:
pd.merge(df111, df121, how = "left", left_on = "lkey", right_on = "rkey")

Unnamed: 0,lkey,lvalue,rkey,rvalue
0,x,2,,
1,y,3,,
2,z,5,,
3,x,7,,


## join() function

`Docstring`: Join columns of another DataFrame.

In [None]:
		
		Ø Join function: 
			• convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.
			• Join index üzerine çalışır. Default olarak. 
			• Ortak index üzerinden birleştir: 
				○ left.join(right, how="outer")
				○ Left tablosunun indexlerini esas alır, aynı indexteki sağ tablo satırlarını yanına koyar. 
			• Ortak sütun ismi üzerinde join:
				○  suffix üzerinden birleştirir. 
				○ df.join(other, lsuffix="_df", rsuffix="_other")
			
			• Kontrol imkanının az olduğu fonksiyon
			• İt will combine all the columns from the two tables, with the common columns renamed with the defined lsuffix, rsuffix. 
			• The way is defined by "how":
				○ İnner, outer aynı mantık, default indeksler baz alınır, innerda intersection, outer birleşim kümesi
				○ Left join, sol taraftaki indeksler baz alarak birleştirir. (0,1)ortak olan A sütununu suffix yapıyor. Sonek. 
Right join tersi

In [35]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

In [36]:
left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2


In [37]:
right

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In [39]:
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [40]:
left.join(right, how = "outer")

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,,,C3,D3


# self imposed exercises

# reminder

In [13]:
pd.Series([10,88,3,4,5])

0    10
1    88
2     3
3     4
4     5
dtype: int64

In [20]:
label = ["a","b","c"]
my_data = [10,20,30]
arr = np.array(my_data)
d={"a":10, "b":20, "c":30}

In [21]:
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [22]:
pd.Series(data=my_data, index= label)

a    10
b    20
c    30
dtype: int64

In [23]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

# exercise go on

In [24]:
s1 = pd.Series(["a","b"])

In [25]:
s1

0    a
1    b
dtype: object

In [26]:
s2= pd.Series(["c","d"])

In [27]:
pd.concat([s1,s2])

0    a
1    b
0    c
1    d
dtype: object

In [28]:
pd.concat([s1,s2], ignore_index=True)

0    a
1    b
2    c
3    d
dtype: object

In [32]:
pd.concat([s1,s2], keys= ["s1","s2"])

s1  0    a
    1    b
s2  0    c
    1    d
dtype: object

In [33]:
pd.concat([s1,s2], keys= ["s1","s2"], names= ["Serie Name", "Row ID"])

Serie Name  Row ID
s1          0         a
            1         b
s2          0         c
            1         d
dtype: object

In [34]:
df1= pd.DataFrame([["a", 1], ["b", 2]], columns = ["letter", "number"])

In [35]:
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [36]:
df2= pd.DataFrame([["c", 3], ["d", 4]], columns = ["letter", "number"])

In [37]:
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [38]:
pd.concat([df1,df2])

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [39]:
df3 = pd.DataFrame([["a", 3, "cat"], ["d", 4, "dog"]], columns = ["letter", "number", "animal"])

In [44]:
df3

Unnamed: 0,letter,number,animal
0,a,3,cat
1,d,4,dog


In [47]:
pd.concat([df1, df3], sort=False)

Unnamed: 0,letter,number,animal
0,a,1,
1,b,2,
0,a,3,cat
1,d,4,dog


In [50]:
pd.concat([df1, df3], join="inner")

Unnamed: 0,letter,number
0,a,1
1,b,2
0,a,3
1,d,4


In [51]:
df4= pd.DataFrame([["bird", "polly"], ["monkey", "george"]], columns=["animal", "name"])

In [52]:
df4

Unnamed: 0,animal,name
0,bird,polly
1,monkey,george


In [53]:
pd.concat([df1,df4], axis =1)

Unnamed: 0,letter,number,animal,name
0,a,1,bird,polly
1,b,2,monkey,george


# merge

In [124]:
df1 = pd.DataFrame(data = {"lkey": ["foo", "bar", "baz", "foo"], "value": [1,2,3,4]})                  

In [125]:
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,4


In [126]:
df2= pd.DataFrame(data= {"rkey": ["foo", "bar", "baz", "foo"], "value": [5,6,7,8]})

In [127]:
df2

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [128]:
df1.merge(df2, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,value_x,rkey,value_y
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,4,foo,5
3,foo,4,foo,8
4,bar,2,bar,6
5,baz,3,baz,7


In [132]:
pd.merge(df1,df2)

Unnamed: 0,lkey,value,rkey


# self devised example

In [91]:
ab = pd.DataFrame({"letters": ["a", "b", "c"],
                 "numbers": [1,2,3]})

In [92]:
ab

Unnamed: 0,letters,numbers
0,a,1
1,b,2
2,c,3


In [93]:
cd = pd.DataFrame({"letters": ["a", "b", "c"],
                 "numbers": [4,5,6]})

In [94]:
cd

Unnamed: 0,letters,numbers
0,a,4
1,b,5
2,c,6


In [97]:
pd.merge(ab, cd, on="letters")

Unnamed: 0,letters,numbers_x,numbers_y
0,a,1,4
1,b,2,5
2,c,3,6


In [154]:
arr1 = np.array([[1,2,3], [4,5,6], [7,8,9], [10,11,12]])

In [155]:
arr1

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [156]:
df1 = pd.DataFrame(data=arr1, columns= ["a", "b", "c"])

In [157]:
df1

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9
3,10,11,12


In [158]:
arr2 = np.array([[1,2,3], [4,5,6], [8,8,9], [11,11,12]])

In [159]:
df2 = pd.DataFrame(data=arr2, columns= ["a", "b", "c"])

In [160]:
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,8,8,9
3,11,11,12


In [161]:
pd.merge(df1, df2, on="a")

Unnamed: 0,a,b_x,c_x,b_y,c_y
0,1,2,3,2,3
1,4,5,6,5,6


In [176]:
df = pd.DataFrame({"key": ["K0", "K1", "K2", "K3", "K4", "K5"], 
                  "A": ["A0", "A1", "A2", "A3", "A4", "A5"]})

In [177]:
df

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5


In [178]:
other=pd.DataFrame({"key": ["K0", "K1", "K2"],
                   "B": ["B0", "B1", "B2"]})

In [180]:
other

Unnamed: 0,key,B
0,K0,B0
1,K1,B1
2,K2,B2


In [182]:
df.join(other, how="outer")

ValueError: columns overlap but no suffix specified: Index(['key'], dtype='object')

## MultiIndex() Function

In [25]:
outside = ["G1","G1","G1","G2","G2","G2"]
inside = [1,2,3,1,2,3]

In [26]:
hier_index = list(zip(outside, inside)) # zip outside ve inside tuple yapar
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

### Creation of MultiIndex object 

In [27]:
hier_index = pd.MultiIndex.from_tuples(hier_index) # MultiIndex bu tuple'ları index hiyerarşisine sokar. 
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [28]:
type(hier_index)

pandas.core.indexes.multi.MultiIndex

### Introducing this MultiIndex object as index argument to the df 

In [29]:
df = pd.DataFrame(randn(6,2), index = hier_index, columns = ["A", "B"])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.986914,0.194408
G1,2,0.158756,0.057347
G1,3,-0.160251,-0.574811
G2,1,-0.201444,-0.298387
G2,2,2.019542,-0.807562
G2,3,0.643277,0.751793


### groupby level parameter

In [30]:
df.groupby(level = 0).mean()

Unnamed: 0,A,B
G1,0.328473,-0.107686
G2,0.820458,-0.118052


In [31]:
df.groupby(level = 1).mean()

Unnamed: 0,A,B
1,0.392735,-0.05199
2,1.089149,-0.375107
3,0.241513,0.088491


In [32]:
df.loc[("G1", 1), ("A")]

0.9869139913009349

In [33]:
df.loc[("G2",2),("B")]

-0.8075615361708935

In [37]:
df.loc["G1"].loc[1]["A"]

0.9869139913009349

In [None]:
index.names method.

In [35]:
df.index.names = ["Groups", "Nums"]

In [36]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Nums,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.986914,0.194408
G1,2,0.158756,0.057347
G1,3,-0.160251,-0.574811
G2,1,-0.201444,-0.298387
G2,2,2.019542,-0.807562
G2,3,0.643277,0.751793


### Cross section function

In [None]:
Docstring: Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular level of a MultiIndex.

In [38]:
df.xs("G1")

Unnamed: 0_level_0,A,B
Nums,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.986914,0.194408
2,0.158756,0.057347
3,-0.160251,-0.574811


In [39]:
df.xs(1, level="Nums")

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.986914,0.194408
G2,-0.201444,-0.298387


In [23]:
mylist = [["Falcon", "Falcon", "Parrot", "Parrot"], ["Captive", "Wild", "Captive", "Wild"]]

In [24]:
mylist

[['Falcon', 'Falcon', 'Parrot', 'Parrot'],
 ['Captive', 'Wild', 'Captive', 'Wild']]

In [25]:
index = pd.MultiIndex.from_arrays(mylist)

In [26]:
index

MultiIndex([('Falcon', 'Captive'),
            ('Falcon',    'Wild'),
            ('Parrot', 'Captive'),
            ('Parrot',    'Wild')],
           )

In [30]:
maxSpeed = {'Max Speed': [390., 350., 30., 20.]}
maxSpeed

{'Max Speed': [390.0, 350.0, 30.0, 20.0]}

In [31]:
df = pd.DataFrame(maxSpeed, index = index)
df

Unnamed: 0,Unnamed: 1,Max Speed
Falcon,Captive,390.0
Falcon,Wild,350.0
Parrot,Captive,30.0
Parrot,Wild,20.0


In [35]:
df.groupby(level = 0).mean()

Unnamed: 0,Max Speed
Falcon,370.0
Parrot,25.0


In [23]:
nums = 1,1,2,2,3,3
topic= "Math", "Science", "Math", "Science", "Math", "Science" 
index = pd.MultiIndex.from_tuples(zip(nums, topic))
df = pd.DataFrame({1: [8,9,7,6,5,2], 2:[6,5,8,4,8,7]}, index=index)
df

Unnamed: 0,Unnamed: 1,1,2
1,Math,8,6
1,Science,9,5
2,Math,7,8
2,Science,6,4
3,Math,5,8
3,Science,2,7



## pd.Index()

In [2]:
pd.Index([1,2,3])

Int64Index([1, 2, 3], dtype='int64')

In [3]:
type(pd.Index([1,2,3]))

pandas.core.indexes.numeric.Int64Index

In [7]:
pd.Index([1,2,3], name="nums")

Int64Index([1, 2, 3], dtype='int64', name='nums')

In [8]:
pd.Index(list('abc'))

Index(['a', 'b', 'c'], dtype='object')

## date_range() function

In [11]:
pd.date_range(start='1/1/2018', end='1/08/2018')

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
              dtype='datetime64[ns]', freq='D')

In [16]:
pd.date_range(start = "1/11/2021", end = "14/11/2021", periods = 10)# period: number of periods to generate.

DatetimeIndex(['2021-01-11 00:00:00', '2021-02-14 02:40:00',
               '2021-03-20 05:20:00', '2021-04-23 08:00:00',
               '2021-05-27 10:40:00', '2021-06-30 13:20:00',
               '2021-08-03 16:00:00', '2021-09-06 18:40:00',
               '2021-10-10 21:20:00', '2021-11-14 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [None]:
Changed the `freq` (frequency) to ``'M'`` (month end frequency).

In [12]:
pd.date_range(start='1/1/2018', periods=5, freq='M')

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
               '2018-05-31'],
              dtype='datetime64[ns]', freq='M')

## pivot_table

In [2]:
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                          "bar", "bar", "bar", "bar"],
                    "B": ["one", "one", "one", "two", "two",
                          "one", "one", "two", "two"],
                    "C": ["small", "large", "large", "small",
                          "small", "large", "small", "small",
                          "large"],
                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
df

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [4]:
table = pd.pivot_table(df, values='D', index=['A', 'B'],
                     columns=['C'], aggfunc=np.sum)
table

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


In [3]:
pd.pivot_table(df, values = "D", index = ["A", "B"], columns = ["C"], aggfunc=np.sum, fill_value=0)

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4,5
bar,two,7,6
foo,one,4,1
foo,two,0,6


In [5]:
table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
                     aggfunc={'D': np.mean,
                              'E': np.mean})
table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,C,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,large,5.5,7.5
bar,small,5.5,8.5
foo,large,2.0,4.5
foo,small,2.333333,4.333333


In [1]:
import seaborn as sns

In [2]:
titanic = sns.load_dataset("titanic")

In [4]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
titanic.groupby("sex")[["survived"]].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [5]:
titanic.groupby(["sex", "class"])[["survived"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,class,Unnamed: 2_level_1
female,First,0.968085
female,Second,0.921053
female,Third,0.5
male,First,0.368852
male,Second,0.157407
male,Third,0.135447


In [6]:
titanic.groupby(["sex", "class"])[["survived"]].aggregate("mean").unstack()

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [7]:
titanic.pivot_table("survived", index = "sex", columns = "class")#survive sütunun mean değerlerini dağıtır. 

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [8]:
titanic.pivot_table("age", index = "sex", columns = "class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,34.611765,28.722973,21.75
male,41.281386,30.740707,26.507589


In [9]:
titanic.pivot_table("age", index = "class", columns = "sex")

sex,female,male
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,34.611765,41.281386
Second,28.722973,30.740707
Third,21.75,26.507589


In [10]:
flights = sns.load_dataset('flights')
flights.head()

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132
3,1949,April,129
4,1949,May,121


In [11]:
flights.pivot_table("passengers", index = "month", columns = "year" )

year,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
January,112,115,145,171,196,204,242,284,315,340,360,417
February,118,126,150,180,196,188,233,277,301,318,342,391
March,132,141,178,193,236,235,267,317,356,362,406,419
April,129,135,163,181,235,227,269,313,348,348,396,461
May,121,125,172,183,229,234,270,318,355,363,420,472
June,135,149,178,218,243,264,315,374,422,435,472,535
July,148,170,199,230,264,302,364,413,465,491,548,622
August,148,170,199,242,272,293,347,405,467,505,559,606
September,136,158,184,209,237,259,312,355,404,404,463,508
October,119,133,162,191,211,229,274,306,347,359,407,461


## read_csv()

In [None]:
Docstring: 
Read a comma-separated values (csv) file into DataFrame. 
Also supports optionally iterating or breaking of the file into chunks.

In [6]:
pd.read_csv("example.csv")

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


## to_excel() function

In [5]:
df = pd.DataFrame([['a', 'b'], ['c', 'd']],
                    index=['row 1', 'row 2'],
                    columns=['col 1', 'col 2'])
df

Unnamed: 0,col 1,col 2
row 1,a,b
row 2,c,d


In [7]:
df.to_excel("C:/Users/Owner/Desktop/output.xlsx")  # doctest: +SKIP

In [8]:
df.to_excel("C:/Users/Owner/Desktop/output.xlsx", sheet_name='o yeah')  # doctest: +SKIP, To specify the sheet name:

In [None]:
If you wish to write to more than one sheet in the workbook, it is necessary to specify an ExcelWriter object:

In [11]:
df1 = df.copy()
with pd.ExcelWriter('C:/Users/Owner/Desktop/output.xlsx') as writer:  # doctest: +SKIP
    df.to_excel(writer, sheet_name='come on')
    df1.to_excel(writer, sheet_name='man!!')

In [13]:
with pd.ExcelWriter('C:/Users/Owner/Desktop/output.xlsx', mode='A') as writer:  # doctest: +SKIP
    df.to_excel(writer, sheet_name='hold on!!')

In [None]:
To set the library that is used to write the Excel file, you can pass the `engine` keyword (the default engine is
automatically chosen depending on the file extension):

In [14]:
df.to_excel('C:/Users/Owner/Desktop/output1.xlsx', engine='xlsxwriter')  # doctest: +SKIP

In [2]:
pd.Series([1,2,3], index=["a", "b", "c"])

a    1
b    2
c    3
dtype: int64

In [3]:
type(pd.Series([1,2,3]))

pandas.core.series.Series

In [4]:
pd.Series([1,2,3]).dtype

dtype('int64')

In [5]:
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [6]:
df.dtypes

col1    int64
col2    int64
dtype: object

In [7]:
df.astype("float")

Unnamed: 0,col1,col2
0,1.0,3.0
1,2.0,4.0


In [8]:
s = pd.Series(['apple', '1.0', '2', -3])

In [9]:
s

0    apple
1      1.0
2        2
3       -3
dtype: object

In [10]:
pd.to_numeric(s, errors='ignore')

0    apple
1      1.0
2        2
3       -3
dtype: object

## get_dummies() function

In [46]:
s = pd.Series(list("abca"))
s

0    a
1    b
2    c
3    a
dtype: object

In [49]:
pd.get_dummies(s) # data : array-like, Series, or DataFrame. Data of which to get dummy indicators.

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


### dummy_na argument

In [47]:
pd.get_dummies(["a", "b", np.nan])

Unnamed: 0,a,b
0,1,0
1,0,1
2,0,0


In [48]:
pd.get_dummies(["a", "b", np.nan], dummy_na=True)

Unnamed: 0,a,b,NaN
0,1,0,0
1,0,1,0
2,0,0,1


In [50]:
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]})
df

Unnamed: 0,A,B,C
0,a,b,1
1,b,a,2
2,a,c,3


In [51]:
pd.get_dummies(df)

Unnamed: 0,C,A_a,A_b,B_a,B_b,B_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


In [52]:
pd.get_dummies(df, prefix=["col1", "col2"])

Unnamed: 0,C,col1_a,col1_b,col2_a,col2_b,col2_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


In [53]:
pd.get_dummies(pd.Series(list('abcaa')))

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,1,0,0


In [54]:
pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)

Unnamed: 0,b,c
0,0,0
1,1,0
2,0,1
3,0,0
4,0,0


In [55]:
pd.get_dummies(pd.Series(list('abc')), dtype=float)

Unnamed: 0,a,b,c
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0


In [56]:
df = pd.DataFrame({"gender": ["male", "female", "female", "male"],
                   "age": [17,34,43,23]})
df

Unnamed: 0,gender,age
0,male,17
1,female,34
2,female,43
3,male,23


In [57]:
pd.get_dummies(df)

Unnamed: 0,age,gender_female,gender_male
0,17,0,1
1,34,1,0
2,43,1,0
3,23,0,1


In [58]:
pd.get_dummies(df, drop_first=True)

Unnamed: 0,age,gender_male
0,17,1
1,34,0
2,43,0
3,23,1


In [59]:
map_dict = {"male" : 1, "female" : 0}
df["original"] = df.gender.map(map_dict)
df

Unnamed: 0,gender,age,original
0,male,17,1
1,female,34,0
2,female,43,0
3,male,23,1


In [60]:
df.drop("gender", axis=1,inplace=True)

In [61]:
df

Unnamed: 0,age,original
0,17,1
1,34,0
2,43,0
3,23,1
