# **Data Frames**

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. A DataFrame is composed of features (columns) and observations (rows).

pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

Two-dimensional, size-mutable, potentially heterogeneous tabular data.



In [252]:
import numpy as np
import pandas as pd

In [None]:
data = [1, 3, 5, 7, 9, 18]   
data

[1, 3, 5, 7, 9, 18]

In [None]:
pd.DataFrame(data)

Unnamed: 0,0
0,1
1,3
2,5
3,7
4,9
5,18


In [None]:
pd.Series(data)

0     1
1     3
2     5
3     7
4     9
5    18
dtype: int64

In [None]:
pd.DataFrame(data,columns=["column1"])

Unnamed: 0,column1
0,1
1,3
2,5
3,7
4,9
5,18


**Creating a DataFrame using a NumPy array**

In [None]:
# The twelve odd numbers 1, 3, ..., 23 laid out as 3 rows by 4 columns.
m = np.reshape(np.arange(1, 24, 2), (3, 4))
m

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [None]:
df=pd.DataFrame(data=m, columns=['var1','var2','var3','var4'])
df

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [None]:
df.head(2)

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15


In [None]:
df.tail(1)

Unnamed: 0,var1,var2,var3,var4
2,17,19,21,23


In [None]:
df.sample(2)

Unnamed: 0,var1,var2,var3,var4
2,17,19,21,23
0,1,3,5,7


In [None]:
df.columns

Index(['var1', 'var2', 'var3', 'var4'], dtype='object')

In [None]:
# Apply the same operation to every column: iterating over df.columns yields
# each column name, which is used to select the column and print its sum.
for i in df.columns:
  print(df[i].sum()) 


27
33
39
45


In [None]:
df.columns=['new1','new2','new3','new4']
df

Unnamed: 0,new1,new2,new3,new4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [None]:
df.index = ["a","b","c"]   #change index names
df

Unnamed: 0,new1,new2,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [None]:
# Returns a renamed copy; df itself is unchanged. To keep the new names,
# assign the result back or pass inplace=True.
df.rename(columns={"new1":"a", "new2":"b"})

Unnamed: 0,a,b,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [None]:
# Relabels rows 'a' and 'b' in the returned copy only; to make the change
# permanent, assign the result back or pass inplace=True.
df.rename(index = {'a': 1,'b':2})

Unnamed: 0,new1,new2,new3,new4
1,1,3,5,7
2,9,11,13,15
c,17,19,21,23


In [None]:
df.shape

(3, 4)

In [None]:
df.shape[0]

3

In [None]:
df.shape[1]

4

In [None]:
df.ndim  #always two

2

In [None]:
df.size

12

In [None]:
len(df)  # number of rows

3

In [None]:
df.values   

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [None]:
type(df.values)

numpy.ndarray

**Creating a DataFrame using a dict**

In [None]:
# Three length-4 vectors of random integers; `high` is exclusive.
s1 = np.random.randint(low=2, high=10, size=4)
s2 = np.random.randint(low=3, high=10, size=4)
s3 = np.random.randint(low=4, high=15, size=4)

In [None]:
s1

array([7, 6, 6, 6])

In [None]:
s2

array([5, 8, 5, 7])

In [None]:
s3

array([6, 4, 9, 8])

In [None]:
mydict = {'var1':s1,'var2':s2,'var3':s3}

In [None]:
df1 = pd.DataFrame(mydict)
df1

Unnamed: 0,var1,var2,var3
0,7,5,6
1,6,8,4
2,6,5,9
3,6,7,8


In [None]:
df1.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
# Materialise the index labels as a plain Python list.
list(df1.index)

[0, 1, 2, 3]

In [None]:
"var2" in df1

True

In [None]:
"var5" in df1

False

In [None]:
df1

Unnamed: 0,var1,var2,var3
0,7,5,6
1,6,8,4
2,6,5,9
3,6,7,8


**Indexing, selection and slicing methods and several attributes using a different DataFrame**

In [None]:
from numpy.random import randn   # import randn directly so it can be called as randn(...) instead of np.random.randn(...)

In [209]:
# Fix the global NumPy seed so the random frame below is identical on every run.
np.random.seed(101)
df3 = pd.DataFrame(
    data=randn(5, 4),
    index=list("ABCDE"),
    columns=list("WXYZ"),
)
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# creating a DataFrame by "positional arguments"
pd.DataFrame(randn(5,4), 'a b c d e'.split(), 'w x y z'.split())

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [None]:
# creating a DataFrame by "keyword arguments"
pd.DataFrame(randn(5,4),columns='w x y z'.split(), index='a b c d e'.split())

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212
e,0.992573,1.192241,-1.04678,1.292765


# **Selection and Indexing**

In [64]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [65]:
df3['W']    #Series

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [66]:
type(df3['W'])

pandas.core.series.Series

In [67]:
df3[['W']]   # double brackets (a list of column names) return a DataFrame rather than a Series

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695
E,0.190794


In [71]:
type(df3[['W']])  #Dataframe hint: pair brackets(in list)

pandas.core.frame.DataFrame

In [72]:
# Attribute access returns the same Series as df3['W'].
# NOTE: prefer df3['W'] when assigning; attribute-style assignment cannot
# create a new column, which is why it "sometimes does not work".
df3.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [75]:
df3[["W","Z"]]   #Recall multiple columns

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [76]:
# A slice inside [] selects ROWS by index label. "W" and "Y" are column
# names, not row labels, so the result is an empty frame.
df3["W":"Y"]

Unnamed: 0,W,X,Y,Z


In [77]:
df3["A":"C"]  # slicing inside [] works on row index labels; both endpoints are included

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001


In [78]:
df3["A":"C"]["W"]   #Intersection of related index,column

A    2.706850
B    0.651118
C   -2.018168
Name: W, dtype: float64

In [None]:
df3["A":"C"][["W","Y"]]  # pair bracket(in list) if columns is more than one.

**Creating a new column**

In [79]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [105]:
df3["T"]= [1,2,3,4,5]
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [106]:
df3["X*Y"] = df3["X"]*df3["Y"]
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [107]:
df3["new"] = df3["Z"]*df3["W"]
df3

Unnamed: 0,W,X,Y,Z,X*Y,T,new
A,2.70685,0.628133,0.907969,0.503826,0.570325,1,1.363781
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2,0.394555
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3,1.188702
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4,0.180215
E,0.190794,1.978757,2.605967,0.683509,5.156577,5,0.13041


In [108]:
df3["new2"] = df3["Z"]*df3["X"]
df3

Unnamed: 0,W,X,Y,Z,X*Y,T,new,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,1,1.363781,0.316469
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2,0.394555,-0.193496
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3,1.188702,-0.435932
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4,0.180215,-0.724766
E,0.190794,1.978757,2.605967,0.683509,5.156577,5,0.13041,1.352498


**Removing Columns & Rows**

In [110]:
df3.drop(columns=["new","new2"],inplace=True)
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [111]:
df3.columns.sort_values(ascending=False)

Index(['Z', 'Y', 'X*Y', 'X', 'W', 'T'], dtype='object')

In [112]:
# drop() looks labels up on the rows (axis=0) by default, so axis=1 is
# required to drop columns.
df3.drop(["X*Y", "T"], axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [113]:
df3                # unchanged: drop() returned a new frame; pass inplace=True (or reassign) to keep the change

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [114]:
df3.drop(["X*Y", "T"], axis=1, inplace=True)

In [115]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [116]:
df3.drop("C",axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [118]:
df3.drop(index=["C"])

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [120]:
df3.drop("C")    #axis=0 is default

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


**Selecting Rows**

**.loc[]** → allows us to select data using the labels (names) of rows (index) and columns. It USES LABELS, which may be numeric or strings.

**.iloc[]** → allows us to select data using the integer positions of rows and columns. It works like classical zero-based indexing (USES POSITIONS).

In [122]:
# An 8x4 grid of random integers in [1, 40), wrapped in a frame with named columns.
m = np.random.randint(low=1, high=40, size=(8, 4))
df4 = pd.DataFrame(data=m, columns="var1 var2 var3 var4".split())
df4

Unnamed: 0,var1,var2,var3,var4
0,26,3,17,3
1,25,16,30,7
2,29,23,21,26
3,34,8,9,17
4,30,1,17,39
5,36,26,32,12
6,9,12,13,16
7,23,38,3,39


In [123]:
df4.loc[4]

var1    30
var2     1
var3    17
var4    39
Name: 4, dtype: int64

In [124]:
df4.loc[[4]]       

Unnamed: 0,var1,var2,var3,var4
4,30,1,17,39


In [133]:
# loc slices by LABEL and includes both endpoints: on the default integer
# index this selects rows 2 through 5.
# NOTE: once the index has been replaced with string labels, this integer
# slice raises TypeError, which is what the recorded output shows.
df4.loc[2:5]

TypeError: ignored

In [126]:
df4.iloc[2:5]  # iloc slices by integer position; the stop position (5) is excluded

Unnamed: 0,var1,var2,var3,var4
2,29,23,21,26
3,34,8,9,17
4,30,1,17,39


In [127]:
df4

Unnamed: 0,var1,var2,var3,var4
0,26,3,17,3
1,25,16,30,7
2,29,23,21,26
3,34,8,9,17
4,30,1,17,39
5,36,26,32,12
6,9,12,13,16
7,23,38,3,39


In [129]:
df4.index = "a b c d e f g h".split()
df4

Unnamed: 0,var1,var2,var3,var4
a,26,3,17,3
b,25,16,30,7
c,29,23,21,26
d,34,8,9,17
e,30,1,17,39
f,36,26,32,12
g,9,12,13,16
h,23,38,3,39


In [145]:
df4.loc["c":"g"]      # loc slices by label; both endpoints ("c" and "g") are included

Unnamed: 0,var1,var2,var3,var4
c,29,23,21,26
d,34,8,9,17
e,30,1,17,39
f,36,26,32,12
g,9,12,13,16


In [147]:
# iloc is strictly position-based: slicing it with string labels raises
# TypeError. The error is caught and displayed so that it serves as the
# demonstration without stopping a top-to-bottom run of the notebook.
try:
    df4.iloc['c':'g']
except TypeError as err:
    print("TypeError:", err)

TypeError: ignored

In [141]:
df4

Unnamed: 0,var1,var2,var3,var4
a,26,3,17,3
b,25,16,30,7
c,29,23,21,26
d,34,8,9,17
e,30,1,17,39
f,36,26,32,12
g,9,12,13,16
h,23,38,3,39


In [142]:
df4.iloc[4,1]

1

In [148]:
df4.loc['c':'g',"var3"]

c    21
d     9
e    17
f    32
g    13
Name: var3, dtype: int64

In [150]:
df4.loc['c':'g']['var3']

c    21
d     9
e    17
f    32
g    13
Name: var3, dtype: int64

In [154]:
#select these data as a DataFrame not a series

df4.loc['c':'g'][['var3']]

Unnamed: 0,var3
c,21
d,9
e,17
f,32
g,13


In [155]:
df4.loc['c':'g', ["var3"]]

Unnamed: 0,var3
c,21
d,9
e,17
f,32
g,13


In [156]:
df4.iloc[2:5,2]

c    21
d     9
e    17
Name: var3, dtype: int64

In [157]:
df4.iloc[2:5][['var2']]

Unnamed: 0,var2
c,23
d,8
e,1


# **Selecting subset of rows and columns**

 .loc[[row labels|names], [column labels|names]]

 .iloc[[row index numbers], [column index numbers]]

In [158]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [159]:
df3.loc["C","Z"]

-0.5890005332865824

In [160]:
# let's select the same data as a DataFrame
df3.loc[['C'],['Z']]

Unnamed: 0,Z
C,-0.589001


In [163]:
df3.loc[["A","C"],["W","Z"]] # by label: the intersection of rows A and C with columns W and Z

Unnamed: 0,W,Z
A,2.70685,0.503826
C,-2.018168,-0.589001


In [164]:
df3.iloc[[0,2],[0,3]]        # by integer position: rows 0 and 2, columns 0 and 3

Unnamed: 0,W,Z
A,2.70685,0.503826
C,-2.018168,-0.589001


# **Conditional Selection**

In [165]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [166]:
# returns a DataFrame consists of bool type
df3>0.5

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,False,False,False,True
E,False,True,True,True


In [167]:
df3[df3>0.5]       # a boolean DataFrame mask keeps the shape and leaves NaN where the condition is False

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,,,,0.955057
E,,1.978757,2.605967,0.683509


In [169]:
df3[df3["Z"]<0.5]    # a boolean Series mask keeps only the rows where the condition is True

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [170]:
df3[['Z']]

Unnamed: 0,Z
A,0.503826
B,0.605965
C,-0.589001
D,0.955057
E,0.683509


In [172]:
df3[df3["X"]<1][["W"]]   #[ROW,COL],[ROW][COL]

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695


In [175]:
df3[df3["Y"]>0][["Z","W","Y"]]

Unnamed: 0,Z,W,Y
A,0.503826,2.70685,0.907969
C,-0.589001,-2.018168,0.528813
E,0.683509,0.190794,2.605967


**To combine two conditions, use `|` (or) and `&` (and), wrapping each condition in parentheses:**

In [176]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [210]:
# Set every column to 0 in the rows where W > 0 and Y < 1.
# Each comparison is parenthesised because & binds tighter than > and <.
# .loc with an explicit column slice makes the row-wise assignment
# unambiguous, instead of relying on how df[mask] = value is interpreted.
df3.loc[(df3["W"] > 0) & (df3["Y"] < 1), :] = 0
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


**Conditional selection using .loc[] and .iloc[]**

In [181]:
df3.loc[(df3.X>0),["W","Z"]]

Unnamed: 0,W,Z
C,-2.018168,-0.589001
E,0.190794,0.683509


In [182]:
df3.loc[((df3.W>1) | (df3.Y<1)), ['Y','Z']]

Unnamed: 0,Y,Z
A,0.0,0.0
B,0.0,0.0
C,0.528813,-0.589001
D,0.0,0.0


In [212]:
df3.loc[((df3.W>1) | (df3.Y<1)), ['Y','Z']] = 1

In [213]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,1.0,1.0
B,0.0,0.0,1.0,1.0
C,-2.018168,0.740122,1.0,1.0
D,0.0,0.0,1.0,1.0
E,0.190794,1.978757,2.605967,0.683509


# **More Index Details**

In [214]:
# Reset to default 0,1...n index
df3.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.0,0.0,1.0,1.0
1,B,0.0,0.0,1.0,1.0
2,C,-2.018168,0.740122,1.0,1.0
3,D,0.0,0.0,1.0,1.0
4,E,0.190794,1.978757,2.605967,0.683509


In [215]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,1.0,1.0
B,0.0,0.0,1.0,1.0
C,-2.018168,0.740122,1.0,1.0
D,0.0,0.0,1.0,1.0
E,0.190794,1.978757,2.605967,0.683509


In [216]:
# One label per row of df3, in row order; stored as a regular column first
# so that it can be promoted to the index with set_index().
# The list is defined in this cell so the notebook runs on a fresh kernel.
newindx = 'CA NY WY OR CO'.split()
df3['newidx'] = newindx

In [217]:
df3

Unnamed: 0,W,X,Y,Z,newidx
A,0.0,0.0,1.0,1.0,CA
B,0.0,0.0,1.0,1.0,NY
C,-2.018168,0.740122,1.0,1.0,WY
D,0.0,0.0,1.0,1.0,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [218]:
df3.set_index('newidx',inplace=True)

In [219]:
df3

Unnamed: 0_level_0,W,X,Y,Z
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,1.0,1.0
NY,0.0,0.0,1.0,1.0
WY,-2.018168,0.740122,1.0,1.0
OR,0.0,0.0,1.0,1.0
CO,0.190794,1.978757,2.605967,0.683509


# **Multi-Index and Index Hierarchy**

In [253]:
# Index Levels
# Outer level: each group label repeated three times.
outside = ['M1'] * 3 + ['M2'] * 3 + ['M3'] * 3
# Inner level: one number per row, aligned position by position with `outside`.
inside = [1, 2, 3, 1, 2, 3, 5, 6, 7]
# Pair the two levels into (group, number) tuples.
multi_index = [*zip(outside, inside)]
multi_index

[('M1', 1),
 ('M1', 2),
 ('M1', 3),
 ('M2', 1),
 ('M2', 2),
 ('M2', 3),
 ('M3', 5),
 ('M3', 6),
 ('M3', 7)]

In [222]:
new_index=pd.MultiIndex.from_tuples(multi_index)
new_index

MultiIndex([('M1', 1),
            ('M1', 2),
            ('M1', 3),
            ('M2', 1),
            ('M2', 2),
            ('M2', 3),
            ('M3', 5),
            ('M3', 6),
            ('M3', 7)],
           )

In [225]:
df=pd.DataFrame(np.random.randn(9,4), index = new_index, columns=['A','B','C','D'])
df

Unnamed: 0,Unnamed: 1,A,B,C,D
M1,1,0.641806,-0.9051,-0.391157,1.028293
M1,2,-1.972605,-0.866885,0.720788,-1.223082
M1,3,1.60678,-1.11571,-1.385379,-1.32966
M2,1,0.04146,-0.411055,-0.771329,0.110477
M2,2,-0.804652,0.253548,0.649148,0.358941
M2,3,-1.080471,0.902398,0.161781,0.833029
M3,5,0.97572,-0.388239,0.783316,-0.708954
M3,6,0.586847,-1.621348,0.677535,0.026105
M3,7,-1.678284,0.333973,-0.532471,2.117727


In [226]:
df.loc["M1"]

Unnamed: 0,A,B,C,D
1,0.641806,-0.9051,-0.391157,1.028293
2,-1.972605,-0.866885,0.720788,-1.223082
3,1.60678,-1.11571,-1.385379,-1.32966


In [229]:
df.loc["M1"].loc[2]  # 2 is looked up as a label of the inner level, not as an integer position

A   -1.972605
B   -0.866885
C    0.720788
D   -1.223082
Name: 2, dtype: float64

In [230]:
df.loc['M1'].loc[[2]]

Unnamed: 0,A,B,C,D
2,-1.972605,-0.866885,0.720788,-1.223082


In [231]:
df.index.names   # the index levels have no names yet

FrozenList([None, None])

In [232]:
df.index.names = ['Group','Num']   # assign index names

In [233]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,0.641806,-0.9051,-0.391157,1.028293
M1,2,-1.972605,-0.866885,0.720788,-1.223082
M1,3,1.60678,-1.11571,-1.385379,-1.32966
M2,1,0.04146,-0.411055,-0.771329,0.110477
M2,2,-0.804652,0.253548,0.649148,0.358941
M2,3,-1.080471,0.902398,0.161781,0.833029
M3,5,0.97572,-0.388239,0.783316,-0.708954
M3,6,0.586847,-1.621348,0.677535,0.026105
M3,7,-1.678284,0.333973,-0.532471,2.117727


**A quick look at the .xs()**

`.xs()` (cross-section) makes it easy to select data from a DataFrame with a multi-level index.

In [234]:
df.xs('M1')

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.641806,-0.9051,-0.391157,1.028293
2,-1.972605,-0.866885,0.720788,-1.223082
3,1.60678,-1.11571,-1.385379,-1.32966


In [235]:
df.loc['M1']

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.641806,-0.9051,-0.391157,1.028293
2,-1.972605,-0.866885,0.720788,-1.223082
3,1.60678,-1.11571,-1.385379,-1.32966


In [237]:
df.xs(("M1",2))    #df.loc["M1"].loc[2]

A   -1.972605
B   -0.866885
C    0.720788
D   -1.223082
Name: (M1, 2), dtype: float64

In [239]:
df.xs(('M1',2), level=[0,1])   #level 0 = M Groups, level 1 = Numbers

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,-1.972605,-0.866885,0.720788,-1.223082


In [240]:
df.xs(('M1',2), level=["Group","Num"])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,-1.972605,-0.866885,0.720788,-1.223082


In [241]:
# df.loc[2] and df.xs(2) both give an error here, because 2 is not a label
# of the outer level.
# The label 2 occurs more than once on the "Num" level, so name that level:

df.xs(2, level = 'Num')     

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,-1.972605,-0.866885,0.720788,-1.223082
M2,-0.804652,0.253548,0.649148,0.358941


In [242]:
df.xs(5, level = 'Num')

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3,0.97572,-0.388239,0.783316,-0.708954


In [247]:
df.xs(5, level = 1)

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3,0.97572,-0.388239,0.783316,-0.708954


In [254]:
df["C"]

Group  Num
M1     1     -0.391157
       2      0.720788
       3     -1.385379
M2     1     -0.771329
       2      0.649148
       3      0.161781
M3     5      0.783316
       6      0.677535
       7     -0.532471
Name: C, dtype: float64

In [248]:
df.xs('C', axis=1)   # C in columns

Group  Num
M1     1     -0.391157
       2      0.720788
       3     -1.385379
M2     1     -0.771329
       2      0.649148
       3      0.161781
M3     5      0.783316
       6      0.677535
       7     -0.532471
Name: C, dtype: float64

**New functions/attributes/methods on "iris dataset"**

In [255]:
import seaborn as sns

In [257]:
df=sns.load_dataset("iris")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [258]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [259]:
df.shape

(150, 5)

In [260]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [261]:
df.sample(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
72,6.3,2.5,4.9,1.5,versicolor
91,6.1,3.0,4.6,1.4,versicolor
2,4.7,3.2,1.3,0.2,setosa
84,5.4,3.0,4.5,1.5,versicolor


In [262]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [263]:
# df.describe().T
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [264]:
df.describe(include = "all")   #"number" and "object" can be used as include/exclude parameter

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [265]:
# Pairwise correlation of the numeric columns. The string column `species`
# is excluded explicitly: older pandas dropped it silently, while
# pandas >= 2.0 raises an error when a non-numeric column is present.
df.select_dtypes(include="number").corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [266]:
# Correlation of every numeric column with sepal_length, kept as a
# one-column DataFrame by the double brackets. Non-numeric columns are
# excluded first so the call does not depend on pandas silently dropping them.
df.select_dtypes(include="number").corr()[["sepal_length"]]

Unnamed: 0,sepal_length
sepal_length,1.0
sepal_width,-0.11757
petal_length,0.871754
petal_width,0.817941


In [267]:
df["petal_length"].corr(df["petal_width"])

0.9628654314027961

In [268]:
df.species.value_counts(dropna=False) 

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [269]:
df.species.value_counts(dropna=False,normalize=True)

setosa        0.333333
versicolor    0.333333
virginica     0.333333
Name: species, dtype: float64

In [279]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [275]:
# Mean of each numeric column. numeric_only=True states explicitly that the
# string column `species` is skipped; without it, older pandas emits a
# deprecation warning and pandas >= 2.0 raises TypeError.
df.mean(numeric_only=True)

  """Entry point for launching an IPython kernel.


sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [276]:
# Column-wise totals: numeric columns are added, while the string column
# `species` is concatenated into one long string (hence dtype object).
df.sum(axis=0)

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [277]:
# Row-wise total of the four numeric measurements. numeric_only=True skips
# the string column `species` explicitly, since a float cannot be added to
# a string and newer pandas no longer drops such columns silently.
df.sum(axis=1, numeric_only=True)

  """Entry point for launching an IPython kernel.


0      10.2
1       9.5
2       9.4
3       9.4
4      10.2
       ... 
145    17.2
146    15.7
147    16.7
148    17.3
149    15.8
Length: 150, dtype: float64

In [278]:
df.sepal_length.sum()

876.5

In [270]:
df.species.nunique()

3

In [271]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [272]:
df.loc[df["species"]=="setosa","sepal_length"]

0     5.1
1     4.9
2     4.7
3     4.6
4     5.0
5     5.4
6     4.6
7     5.0
8     4.4
9     4.9
10    5.4
11    4.8
12    4.8
13    4.3
14    5.8
15    5.7
16    5.4
17    5.1
18    5.7
19    5.1
20    5.4
21    5.1
22    4.6
23    5.1
24    4.8
25    5.0
26    5.0
27    5.2
28    5.2
29    4.7
30    4.8
31    5.4
32    5.2
33    5.5
34    4.9
35    5.0
36    5.5
37    4.9
38    4.4
39    5.1
40    5.0
41    4.5
42    4.4
43    5.0
44    5.1
45    4.8
46    5.1
47    4.6
48    5.3
49    5.0
Name: sepal_length, dtype: float64

In [273]:
df[(df.sepal_length>4) & (df.sepal_length<5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa
11,4.8,3.4,1.6,0.2,setosa
12,4.8,3.0,1.4,0.1,setosa
13,4.3,3.0,1.1,0.1,setosa
22,4.6,3.6,1.0,0.2,setosa


In [274]:
df[(df.species == "virginica") & (df.sepal_length>4)  & (df.sepal_length<5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
106,4.9,2.5,4.5,1.7,virginica
