___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

<h1><p style="text-align: center;">Pandas Lesson, Session - 4</p><h1>
    

# Data Frames

 - ### ``DataFrames`` are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [1]:
import pandas as pd
import numpy as np

 - ### Creating a DataFrame using the ``list``s of data and columns

In [2]:
datas= [1, 3, 5, 7, 9]
datas

[1, 3, 5, 7, 9]

In [3]:
pd.DataFrame(datas)

Unnamed: 0,0
0,1
1,3
2,5
3,7
4,9


In [None]:
pd.DataFrame(datas, columns = [])

 - ### Creating a DataFrame using a ``NumPy Arrays``

In [4]:
m =np.arange(1, 24, 2).reshape(3, 4)
m

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [5]:
pd.DataFrame(m, columns= ['var1', 'var2', 'var3', 'var4'])

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [6]:
df =pd.DataFrame(m, columns= ['var1', 'var2', 'var3', 'var4'])
df

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [7]:
df.head(1)

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7


In [8]:
df.tail(2)

Unnamed: 0,var1,var2,var3,var4
1,9,11,13,15
2,17,19,21,23


In [9]:
df.columns

Index(['var1', 'var2', 'var3', 'var4'], dtype='object')

In [10]:
for i in df.columns:
    print(i)

var1
var2
var3
var4


In [11]:
df.columns=['new1', 'new2', 'new3', 'new4']
df

Unnamed: 0,new1,new2,new3,new4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [12]:
type(df)

pandas.core.frame.DataFrame

In [13]:
df.shape

(3, 4)

In [14]:
df.shape[0]

3

In [15]:
df.size

12

In [16]:
df.values

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [17]:
type(df.values)

numpy.ndarray

 - ### Creating a DataFrame using a ``dict``

In [18]:
s1 = np.random.randint(2, 10, size = 4)
s2 = np.random.randint(3, 10, size = 4)
s3 = np.random.randint(4, 15, size = 4)

In [19]:
s1

array([9, 8, 7, 9])

In [20]:
s2

array([3, 9, 9, 8])

In [21]:
s3

array([ 7,  5,  4, 13])

In [22]:
mydict = {"var1": s1, "var2": s2, "var3": s3}
mydict

{'var1': array([9, 8, 7, 9]),
 'var2': array([3, 9, 9, 8]),
 'var3': array([ 7,  5,  4, 13])}

In [24]:
df1= pd.DataFrame(mydict)
df1

Unnamed: 0,var1,var2,var3
0,9,3,7
1,8,9,5
2,7,9,4
3,9,8,13


In [27]:
df1.index

RangeIndex(start=0, stop=4, step=1)

In [28]:
[i for i in df1.index]

[0, 1, 2, 3]

In [29]:
df1.index= ['a', 'b', 'c', 'd']

In [30]:
df1

Unnamed: 0,var1,var2,var3
a,9,3,7
b,8,9,5
c,7,9,4
d,9,8,13


In [31]:
'var1' in df1

True

In [32]:
'var5' in df1

False

### Now, let's examine again the ***indexing, selection*** and ***slicing*** methods and several ***attributes*** using a different DataFrame

In [33]:
from numpy.random import randn
np.random.seed(101)

In [None]:
# df3 = pd.DataFrame(randn(5, 4), index = ['A', 'B', 'C', 'D','E'] ,columns=

In [34]:
df3 = pd.DataFrame(randn(5, 4), index = 'A B C D E'.split(), columns = 'W X Y Z'.split())
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [37]:
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

In [35]:
# creating a DataFrame by "positional arguments"
pd.DataFrame(randn(5, 4), 'a b c d e'.split(), 'w x y z'.split())

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [36]:
# creating a DataFrame by "keyword arguments"
pd.DataFrame(data=randn(5, 4), columns='w x y z'.split(), index='a b c d e'.split())

Unnamed: 0,w,x,y,z
a,-0.993263,0.1968,-1.136645,0.000366
b,1.025984,-0.156598,-0.031579,0.649826
c,2.154846,-0.610259,-0.755325,-0.346419
d,0.147027,-0.479448,0.558769,1.02481
e,-0.925874,1.862864,-1.133817,0.610478


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [38]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [39]:
df3['Y']

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [41]:
df3.Y         # SQL syntax   # cok önerilmiyor, iki kelimeli column isimlerinde boslukdan kaynakli hatalar olusabiliyor

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [40]:
df3[['Y']]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


#### DataFrame Columns are just Series

In [42]:
type(df3['Y'])

pandas.core.series.Series

In [43]:
type(df3[['Y']])

pandas.core.frame.DataFrame

In [45]:
df3[['Y', 'Z']]         # burda iki taneköseli parantez kullanmak zorundayz. bu artik seri degil dataframe o yüzden hata veriyor tek parantez

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001
D,-0.933237,0.955057
E,2.605967,0.683509


In [46]:
sample_list = ['Y', 'Z']
df3[sample_list]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001
D,-0.933237,0.955057
E,2.605967,0.683509


In [47]:
df3['W': 'Y']

Unnamed: 0,W,X,Y,Z


In [48]:
df3['A': 'D']         # slicingi biz satirlarda kullanabiliyoruz

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [50]:
df3[['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813
D,0.188695,-0.933237
E,0.190794,2.605967


In [51]:
df3[['A', 'D']]

KeyError: "None of [Index(['A', 'D'], dtype='object')] are in the [columns]"

In [52]:
df3['A': 'C'][['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813


**Creating a new column:**

In [53]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [71]:
df3['Multiplied']= df3['X'] * df3['Y']
df3

Unnamed: 0,W,X,Y,Z,sample,empty2,Multiplied
A,2.70685,0.628133,0.907969,0.503826,1,,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,2,,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,3,,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,4,,0.708208
E,0.190794,1.978757,2.605967,0.683509,5,,5.156577


In [57]:
df3['sample'] = [1, 2, 3, 4, 5]

In [58]:
df3

Unnamed: 0,W,X,Y,Z,Multiplied,sample
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [59]:
df3['empty']= ""
df3

Unnamed: 0,W,X,Y,Z,Multiplied,sample,empty
A,2.70685,0.628133,0.907969,0.503826,0.570325,1,
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2,
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3,
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4,
E,0.190794,1.978757,2.605967,0.683509,5.156577,5,


In [70]:
df3['empty2']= np.nan
df3

Unnamed: 0,W,X,Y,Z,sample,empty2
A,2.70685,0.628133,0.907969,0.503826,1,
B,0.651118,-0.319318,-0.848077,0.605965,2,
C,-2.018168,0.740122,0.528813,-0.589001,3,
D,0.188695,-0.758872,-0.933237,0.955057,4,
E,0.190794,1.978757,2.605967,0.683509,5,


### [Removing Columns & Rows](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-drop.ipynb)

 - ### Removing Columns

In [63]:
df3.drop('empty', axis= 1)


Unnamed: 0,W,X,Y,Z,Multiplied,sample,empty2
A,2.70685,0.628133,0.907969,0.503826,0.570325,1,
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2,
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3,
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4,
E,0.190794,1.978757,2.605967,0.683509,5.156577,5,


In [64]:
df3     # atamadigimiz icin df3e silinmedi orjinal listede

Unnamed: 0,W,X,Y,Z,Multiplied,sample,empty,empty2
A,2.70685,0.628133,0.907969,0.503826,0.570325,1,,
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2,,
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3,,
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4,,
E,0.190794,1.978757,2.605967,0.683509,5.156577,5,,


In [65]:
df3= df3.drop('empty', axis= 1)
df3

Unnamed: 0,W,X,Y,Z,Multiplied,sample,empty2
A,2.70685,0.628133,0.907969,0.503826,0.570325,1,
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2,
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3,
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4,
E,0.190794,1.978757,2.605967,0.683509,5.156577,5,


In [72]:
df3.drop('empty2', axis= 1, inplace= True)
df3

Unnamed: 0,W,X,Y,Z,sample,Multiplied
A,2.70685,0.628133,0.907969,0.503826,1,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,2,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,3,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,4,0.708208
E,0.190794,1.978757,2.605967,0.683509,5,5.156577


In [73]:
df3.drop('Multiplied', axis= 1, inplace= True)
df3


Unnamed: 0,W,X,Y,Z,sample
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [74]:
df3['Multiplied']= df3['X'] * df3['Y']
df3['empty2']= np.nan
df3


Unnamed: 0,W,X,Y,Z,sample,Multiplied,empty2
A,2.70685,0.628133,0.907969,0.503826,1,0.570325,
B,0.651118,-0.319318,-0.848077,0.605965,2,0.270806,
C,-2.018168,0.740122,0.528813,-0.589001,3,0.391387,
D,0.188695,-0.758872,-0.933237,0.955057,4,0.708208,
E,0.190794,1.978757,2.605967,0.683509,5,5.156577,


In [75]:
df3.drop(['Multiplied', 'empty2'], axis= 1, inplace= True)
df3

Unnamed: 0,W,X,Y,Z,sample
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [78]:
df3.drop('sample', axis= 1, inplace= True)
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


 - ### Removing rows

In [79]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [80]:
df4= df3.drop('C', axis= 0)
df4

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [81]:
df5 = df3.drop('C')
df5

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Selecting Rows

- ### First, let's take a quick look at [`.loc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-loc.ipynb) | [`.iloc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-iloc.ipynb)

#### `.loc[]` → allows us to select data using **labels** (names) of rows (index) & columns

#### `.iloc[]` → allows us to select data using **index numbers** of rows (index) & columns. it's like classical indexing logic

In [92]:
m = np.random.randint(1, 40, size=(8, 4))
df4 = pd.DataFrame(m, columns = ["var1", "var2", "var3", 'var4'])
df4

Unnamed: 0,var1,var2,var3,var4
0,37,39,10,4
1,2,16,5,5
2,37,19,15,29
3,18,24,17,7
4,8,21,11,33
5,24,23,22,27
6,13,3,18,2
7,26,3,17,3


In [93]:
df4.loc[4]

var1     8
var2    21
var3    11
var4    33
Name: 4, dtype: int32

In [94]:
df4.loc[[4]]

Unnamed: 0,var1,var2,var3,var4
4,8,21,11,33


In [95]:
df4.iloc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,37,19,15,29
3,18,24,17,7
4,8,21,11,33


In [96]:
df4.loc[2:5]           # burda labellama yaptigimiz icin 5. satiri dahil ediyor 

Unnamed: 0,var1,var2,var3,var4
2,37,19,15,29
3,18,24,17,7
4,8,21,11,33
5,24,23,22,27


In [97]:
df4.index= 'a b c d e f g h'.split()
df4

Unnamed: 0,var1,var2,var3,var4
a,37,39,10,4
b,2,16,5,5
c,37,19,15,29
d,18,24,17,7
e,8,21,11,33
f,24,23,22,27
g,13,3,18,2
h,26,3,17,3


In [98]:
df4.iloc[[1]]

Unnamed: 0,var1,var2,var3,var4
b,2,16,5,5


In [99]:
df4.loc[['b']]

Unnamed: 0,var1,var2,var3,var4
b,2,16,5,5


In [100]:
df4

Unnamed: 0,var1,var2,var3,var4
a,37,39,10,4
b,2,16,5,5
c,37,19,15,29
d,18,24,17,7
e,8,21,11,33
f,24,23,22,27
g,13,3,18,2
h,26,3,17,3


In [101]:
df4.iloc[4,1]

21

In [102]:
df4.loc["d": "g"]

Unnamed: 0,var1,var2,var3,var4
d,18,24,17,7
e,8,21,11,33
f,24,23,22,27
g,13,3,18,2


In [103]:
df4.loc["d": "g"]["var3"]

d    17
e    11
f    22
g    18
Name: var3, dtype: int32

In [106]:
df4.loc["d": "g"][["var3"]]

Unnamed: 0,var3
d,17
e,11
f,22
g,18


In [104]:
df4.iloc[2:5, 2]

c    15
d    17
e    11
Name: var3, dtype: int32

In [105]:
df4.iloc[2:5, [2]]

Unnamed: 0,var3
c,15
d,17
e,11


#### Let's continue to examine `.loc[]` and `.iloc[]` using ``df3`` again

In [107]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [108]:
df3.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [109]:
df3.loc[['C']]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [110]:
df3.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [111]:
df3.iloc[[2]]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [112]:
df3.iloc[:, [2]]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


### Selecting subset of rows and columns

 - ### `.loc[[row labels|names], [column labels|names]]`

 - ### `.iloc[[row index numbers], [column index numbers]]`

In [114]:
df3.loc['C', 'Z']       # tek bir degere ulasmaya calisiyorum bu yuzden tek parantez kullaniyorum

-0.5890005332865824

In [115]:
df3.loc[['C'], ['Z']]      # burda dataframe olarak verdi 

Unnamed: 0,Z
C,-0.589001


In [116]:
df3.loc[['C', 'A'], ['Z', 'W']]

Unnamed: 0,Z,W
C,-0.589001,-2.018168
A,0.503826,2.70685


In [117]:
df3.iloc[[2, 0],[0, 3]]

Unnamed: 0,W,Z
C,-2.018168,-0.589001
A,2.70685,0.503826


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [118]:
df3 > 0.5

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,False,False,False,True
E,False,True,True,True


In [120]:
df3[df3> 0.5]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,,,,0.955057
E,,1.978757,2.605967,0.683509


In [121]:
df3[df3['Z']> 0.5]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [124]:
df3[df3['Y']> 0][['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
C,-2.018168,0.740122
E,0.190794,1.978757


#### For two conditions you can use **|** → `or`,  **&** →  `and` with parenthesis:

In [127]:
df3[(df3['W'] > 0) & (df3['Y'] < 1)] = 0

In [128]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


In [193]:
df3.index = pd.RangeIndex(start= 5, stop = 5+len(df3), step=1)

In [194]:
df3

Unnamed: 0,W,X,Y,Z,newindx
5,0.0,0.0,0.0,0.0,CA
6,0.0,0.0,0.0,0.0,NY
7,-2.018168,0.740122,0.528813,-0.589001,WY
8,0.0,0.0,0.0,0.0,OR
9,0.190794,1.978757,2.605967,0.683509,CO


In [197]:
df3.index = pd.RangeIndex(start= 5, stop = 9+len(df3), step=2)
df3

Unnamed: 0,W,X,Y,Z,newindx
5,0.0,0.0,0.0,0.0,CA
7,0.0,0.0,0.0,0.0,NY
9,-2.018168,0.740122,0.528813,-0.589001,WY
11,0.0,0.0,0.0,0.0,OR
13,0.190794,1.978757,2.605967,0.683509,CO


In [198]:
df3.index=[np.arange(1,6)]
df3

Unnamed: 0,W,X,Y,Z,newindx
1,0.0,0.0,0.0,0.0,CA
2,0.0,0.0,0.0,0.0,NY
3,-2.018168,0.740122,0.528813,-0.589001,WY
4,0.0,0.0,0.0,0.0,OR
5,0.190794,1.978757,2.605967,0.683509,CO


#### Conditional selection using ``.loc[]`` and ``.iloc[]``

In [129]:
df3.loc[(df3.X>0), ['X','Z']]       # satır : df3'ün X columnundaki 0'dan büyük değerlere sahip satırları,
                                    # sütun :  X ve Z sütunlarını getir.

Unnamed: 0,X,Z
C,0.740122,-0.589001
E,1.978757,0.683509


In [130]:
df3.loc[((df3['W'] > 1) | (df3.Y < 1)), ['Y','Z']]

Unnamed: 0,Y,Z
A,0.0,0.0
B,0.0,0.0
C,0.528813,-0.589001
D,0.0,0.0


## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!

In [131]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


In [134]:
df3.reset_index()           

Unnamed: 0,index,W,X,Y,Z
0,A,0.0,0.0,0.0,0.0
1,B,0.0,0.0,0.0,0.0
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.0,0.0,0.0,0.0
4,E,0.190794,1.978757,2.605967,0.683509


In [135]:
df3.reset_index(drop= True)    #Reset to default 0,1,2,3.. n index

Unnamed: 0,W,X,Y,Z
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,-2.018168,0.740122,0.528813,-0.589001
3,0.0,0.0,0.0,0.0
4,0.190794,1.978757,2.605967,0.683509


In [136]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


In [137]:
newindx = 'CA NY WY OR CO '.split()
newindx

['CA', 'NY', 'WY', 'OR', 'CO']

In [138]:
df3['newindx'] = newindx

In [139]:
df3

Unnamed: 0,W,X,Y,Z,newindx
A,0.0,0.0,0.0,0.0,CA
B,0.0,0.0,0.0,0.0,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.0,0.0,0.0,0.0,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [140]:
df3.set_index('newindx')

Unnamed: 0_level_0,W,X,Y,Z
newindx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,0.0,0.0
NY,0.0,0.0,0.0,0.0
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.0,0.0,0.0,0.0
CO,0.190794,1.978757,2.605967,0.683509


In [141]:
df3                                  # inspace yapmadigimz icin degismedi 

Unnamed: 0,W,X,Y,Z,newindx
A,0.0,0.0,0.0,0.0,CA
B,0.0,0.0,0.0,0.0,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.0,0.0,0.0,0.0,OR
E,0.190794,1.978757,2.605967,0.683509,CO


## Multi-Index and Index Hierarchy

Let us go over how to work with Multi-Index, first we'll create a quick example of what a Multi-Indexed DataFrame would look like:

In [142]:
outside = ['M1', 'M1', 'M1', 'M1', 'M1', 'M1', 'M2', 'M2', 'M2', 'M2', 'M2', 'M2']
inside = [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3]
third = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']
multi_index = list(zip(outside, inside, third))
multi_index

[('M1', 1, 'a'),
 ('M1', 1, 'b'),
 ('M1', 2, 'c'),
 ('M1', 2, 'd'),
 ('M1', 3, 'e'),
 ('M1', 3, 'f'),
 ('M2', 1, 'g'),
 ('M2', 1, 'h'),
 ('M2', 2, 'i'),
 ('M2', 2, 'j'),
 ('M2', 3, 'k'),
 ('M2', 3, 'l')]

In [143]:
zip(outside, inside, third)

<zip at 0x1b608a5ce40>

In [144]:
list(zip(outside, inside, third))

[('M1', 1, 'a'),
 ('M1', 1, 'b'),
 ('M1', 2, 'c'),
 ('M1', 2, 'd'),
 ('M1', 3, 'e'),
 ('M1', 3, 'f'),
 ('M2', 1, 'g'),
 ('M2', 1, 'h'),
 ('M2', 2, 'i'),
 ('M2', 2, 'j'),
 ('M2', 3, 'k'),
 ('M2', 3, 'l')]

In [145]:
hier_index = pd.MultiIndex.from_tuples(multi_index)

In [146]:
hier_index

MultiIndex([('M1', 1, 'a'),
            ('M1', 1, 'b'),
            ('M1', 2, 'c'),
            ('M1', 2, 'd'),
            ('M1', 3, 'e'),
            ('M1', 3, 'f'),
            ('M2', 1, 'g'),
            ('M2', 1, 'h'),
            ('M2', 2, 'i'),
            ('M2', 2, 'j'),
            ('M2', 3, 'k'),
            ('M2', 3, 'l')],
           )

In [147]:
df5 = pd.DataFrame(np.random.randn(12, 4), index = hier_index, columns=['A', 'B', 'C', 'D'])
df5

Unnamed: 0,Unnamed: 1,Unnamed: 2,A,B,C,D
M1,1,a,-1.088266,-0.609936,-0.635511,-0.728053
M1,1,b,-0.594951,-0.593371,0.911416,-1.768073
M1,2,c,0.205247,0.211945,-1.240039,1.0493
M1,2,d,-0.410414,0.340754,0.805027,0.528188
M1,3,e,0.424096,-1.306117,-1.217277,1.167027
M1,3,f,-1.513392,0.06998,0.617859,-0.38239
M2,1,g,-0.129951,-0.196226,0.093173,-0.558562
M2,1,h,0.121411,2.120267,-1.448662,0.574885
M2,2,i,1.112591,-0.860594,-0.81475,-0.312264
M2,2,j,-1.518414,0.992552,-0.776338,0.274041


In [148]:
df5.loc['M1']

Unnamed: 0,Unnamed: 1,A,B,C,D
1,a,-1.088266,-0.609936,-0.635511,-0.728053
1,b,-0.594951,-0.593371,0.911416,-1.768073
2,c,0.205247,0.211945,-1.240039,1.0493
2,d,-0.410414,0.340754,0.805027,0.528188
3,e,0.424096,-1.306117,-1.217277,1.167027
3,f,-1.513392,0.06998,0.617859,-0.38239


In [150]:
df5.iloc[8]

A    1.112591
B   -0.860594
C   -0.814750
D   -0.312264
Name: (M2, 2, i), dtype: float64

In [151]:
df5

Unnamed: 0,Unnamed: 1,Unnamed: 2,A,B,C,D
M1,1,a,-1.088266,-0.609936,-0.635511,-0.728053
M1,1,b,-0.594951,-0.593371,0.911416,-1.768073
M1,2,c,0.205247,0.211945,-1.240039,1.0493
M1,2,d,-0.410414,0.340754,0.805027,0.528188
M1,3,e,0.424096,-1.306117,-1.217277,1.167027
M1,3,f,-1.513392,0.06998,0.617859,-0.38239
M2,1,g,-0.129951,-0.196226,0.093173,-0.558562
M2,1,h,0.121411,2.120267,-1.448662,0.574885
M2,2,i,1.112591,-0.860594,-0.81475,-0.312264
M2,2,j,-1.518414,0.992552,-0.776338,0.274041


In [152]:
df5.loc['M1'].loc[2]

Unnamed: 0,A,B,C,D
c,0.205247,0.211945,-1.240039,1.0493
d,-0.410414,0.340754,0.805027,0.528188


In [153]:
df5.loc['M1'].loc[2].iloc[[0]]

Unnamed: 0,A,B,C,D
c,0.205247,0.211945,-1.240039,1.0493


In [154]:
df5.index.names

FrozenList([None, None, None])

In [155]:
df5.index.names= ['Group', 'Num', 'Class']

In [156]:
df5.index.names

FrozenList(['Group', 'Num', 'Class'])

In [157]:
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,Class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1,1,a,-1.088266,-0.609936,-0.635511,-0.728053
M1,1,b,-0.594951,-0.593371,0.911416,-1.768073
M1,2,c,0.205247,0.211945,-1.240039,1.0493
M1,2,d,-0.410414,0.340754,0.805027,0.528188
M1,3,e,0.424096,-1.306117,-1.217277,1.167027
M1,3,f,-1.513392,0.06998,0.617859,-0.38239
M2,1,g,-0.129951,-0.196226,0.093173,-0.558562
M2,1,h,0.121411,2.120267,-1.448662,0.574885
M2,2,i,1.112591,-0.860594,-0.81475,-0.312264
M2,2,j,-1.518414,0.992552,-0.776338,0.274041


### let's take a quick look at the [``.xs()``](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-xs.ipynb)

In [158]:
df5.xs('M1')             # xs() bir build in method olduğu için parametreleri normal parantez içinde kullanılır.

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Num,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,a,-1.088266,-0.609936,-0.635511,-0.728053
1,b,-0.594951,-0.593371,0.911416,-1.768073
2,c,0.205247,0.211945,-1.240039,1.0493
2,d,-0.410414,0.340754,0.805027,0.528188
3,e,0.424096,-1.306117,-1.217277,1.167027
3,f,-1.513392,0.06998,0.617859,-0.38239


In [159]:
df5.loc['M1']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Num,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,a,-1.088266,-0.609936,-0.635511,-0.728053
1,b,-0.594951,-0.593371,0.911416,-1.768073
2,c,0.205247,0.211945,-1.240039,1.0493
2,d,-0.410414,0.340754,0.805027,0.528188
3,e,0.424096,-1.306117,-1.217277,1.167027
3,f,-1.513392,0.06998,0.617859,-0.38239


In [160]:
df5.xs(['M1', 2])

  df5.xs(['M1', 2])


Unnamed: 0_level_0,A,B,C,D
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.205247,0.211945,-1.240039,1.0493
d,-0.410414,0.340754,0.805027,0.528188


In [162]:
df5.xs(('M2', 1, 'g'))

A   -0.129951
B   -0.196226
C    0.093173
D   -0.558562
Name: (M2, 1, g), dtype: float64

In [161]:
df5.xs(('M2', 1, 'g'), level=[0, 1, 2])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,Class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M2,1,g,-0.129951,-0.196226,0.093173,-0.558562


### Let's learn new functions/attributes/methods on "iris dataset" 

In [163]:
import seaborn as sns

In [165]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [166]:
df = sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [167]:
df.shape

(150, 5)

In [168]:
df.ndim

2

In [169]:
df.size

750

In [170]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [171]:
df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [172]:
df.sample()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
38,4.4,3.0,1.3,0.2,setosa


In [173]:
df.sample(7)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
102,7.1,3.0,5.9,2.1,virginica
107,7.3,2.9,6.3,1.8,virginica
134,6.1,2.6,5.6,1.4,virginica
120,6.9,3.2,5.7,2.3,virginica
141,6.9,3.1,5.1,2.3,virginica
48,5.3,3.7,1.5,0.2,setosa
109,7.2,3.6,6.1,2.5,virginica


In [176]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [178]:
df['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [179]:
df.mean()

  df.mean()


sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [180]:
df.sum(axis=0)

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [181]:
df.sum(axis=1)

  df.sum(axis=1)


0      10.2
1       9.5
2       9.4
3       9.4
4      10.2
       ... 
145    17.2
146    15.7
147    16.7
148    17.3
149    15.8
Length: 150, dtype: float64

In [182]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [183]:
df.isnull()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
145,False,False,False,False,False
146,False,False,False,False,False
147,False,False,False,False,False
148,False,False,False,False,False


In [184]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [186]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [187]:
len(df)

150

In [190]:
df.iloc[0:6]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa


In [191]:
df.loc[0:6]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa


In [192]:
df.loc[((df['species']=='setosa')&(df['sepal_length']>5)),['sepal_length','sepal_width', 'species']]

Unnamed: 0,sepal_length,sepal_width,species
0,5.1,3.5,setosa
5,5.4,3.9,setosa
10,5.4,3.7,setosa
14,5.8,4.0,setosa
15,5.7,4.4,setosa
16,5.4,3.9,setosa
17,5.1,3.5,setosa
18,5.7,3.8,setosa
19,5.1,3.8,setosa
20,5.4,3.4,setosa


# End of the Session