<h1 style="text-align: center;">Pandas Lesson, Session - 4</h1>

# Data Frames

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Odd numbers from 1 to 9, used as sample values for the first DataFrame/Series.
data = list(range(1, 10, 2))
data

[1, 3, 5, 7, 9]

In [10]:
pd.DataFrame(data,columns=["col1"])

Unnamed: 0,col1
0,1
1,3
2,5
3,7
4,9


In [11]:
pd.Series(data)

0    1
1    3
2    5
3    7
4    9
dtype: int64

### Creating a DataFrame using a ``NumPy Array``

In [13]:
# The 12 odd numbers 1, 3, ..., 23 arranged as a 3x4 matrix.
data2 = np.arange(start=1, stop=24, step=2).reshape((3, 4))
data2

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [14]:
pd.DataFrame(data2, columns=["col1","col2", "col3", "col4"])

Unnamed: 0,col1,col2,col3,col4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [17]:
df = pd.DataFrame(data2, columns=["col1","col2", "col3", "col4"], index=["a","b","c"])
df

Unnamed: 0,col1,col2,col3,col4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [18]:
type(df)

pandas.core.frame.DataFrame

In [19]:
df.size

12

In [20]:
df.shape

(3, 4)

In [21]:
df.ndim

2

In [22]:
df.index

Index(['a', 'b', 'c'], dtype='object')

In [23]:
df.columns

Index(['col1', 'col2', 'col3', 'col4'], dtype='object')

In [24]:
df.values

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [25]:
len(df)

3

In [27]:
df.keys()

Index(['col1', 'col2', 'col3', 'col4'], dtype='object')

In [28]:
df

Unnamed: 0,col1,col2,col3,col4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [29]:
df.col1

a     1
b     9
c    17
Name: col1, dtype: int32

In [30]:
df["col1"]

a     1
b     9
c    17
Name: col1, dtype: int32

### Creating a DataFrame using a ``dict``

In [32]:
# Seed the global NumPy RNG so the three sample arrays (and every table built
# from them) are identical on each Restart & Run All; re-run the cell to
# refresh the stored output.
np.random.seed(101)

# Three integer arrays of 40 draws each; randint's upper bound is exclusive.
s1 = np.random.randint(2, 10, size=40)      # values in [2, 10)
s2 = np.random.randint(30, 100, size=40)    # values in [30, 100)
s3 = np.random.randint(100, 150, size=40)   # values in [100, 150)
s1

array([3, 9, 2, 9, 2, 2, 6, 3, 5, 7, 8, 7, 5, 2, 2, 4, 4, 6, 6, 4, 6, 2,
       3, 6, 6, 9, 7, 9, 6, 8, 3, 2, 4, 7, 7, 8, 3, 8, 6, 3])

In [34]:
myDict = {'var1':s1, 'var2':s2, 'var3':s3}
myDict

{'var1': array([3, 9, 2, 9, 2, 2, 6, 3, 5, 7, 8, 7, 5, 2, 2, 4, 4, 6, 6, 4, 6, 2,
        3, 6, 6, 9, 7, 9, 6, 8, 3, 2, 4, 7, 7, 8, 3, 8, 6, 3]),
 'var2': array([84, 86, 84, 45, 68, 89, 79, 73, 55, 86, 64, 97, 52, 78, 79, 93, 87,
        76, 50, 57, 52, 30, 71, 39, 71, 94, 66, 62, 99, 83, 53, 95, 53, 72,
        74, 77, 72, 85, 68, 63]),
 'var3': array([144, 103, 113, 148, 103, 111, 142, 116, 114, 123, 118, 109, 125,
        129, 127, 139, 123, 105, 123, 112, 149, 123, 147, 129, 104, 137,
        113, 145, 135, 100, 125, 116, 126, 129, 119, 149, 134, 127, 102,
        117])}

In [36]:
df_dict = pd.DataFrame(myDict)
df_dict

Unnamed: 0,var1,var2,var3
0,3,84,144
1,9,86,103
2,2,84,113
3,9,45,148
4,2,68,103
5,2,89,111
6,6,79,142
7,3,73,116
8,5,55,114
9,7,86,123


In [37]:
df_dict[3:5]

Unnamed: 0,var1,var2,var3
3,9,45,148
4,2,68,103


In [38]:
# Take rows at positions 3 and 4 explicitly with .iloc, and copy them so that
# relabelling df_dict2's index/columns in the following cells works on an
# independent frame rather than on a slice of df_dict.
df_dict2 = df_dict.iloc[3:5].copy()
df_dict2

Unnamed: 0,var1,var2,var3
3,9,45,148
4,2,68,103


In [39]:
df_dict2.index = ["a","b"]
df_dict2

Unnamed: 0,var1,var2,var3
a,9,45,148
b,2,68,103


In [42]:
df_dict2.columns = ["X","Y","Z"]
df_dict2

Unnamed: 0,X,Y,Z
a,9,45,148
b,2,68,103


In [44]:
pd.DataFrame(df_dict2.values, index=["c","d"],columns=["t","r","s"])

Unnamed: 0,t,r,s
c,9,45,148
d,2,68,103


### ***Indexing, selection*** and ***slicing*** methods and several ***attributes*** using a different DataFrame

In [57]:
np.random.seed(101)
np.random.randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [58]:
'A B C D E'.split(" ")

['A', 'B', 'C', 'D', 'E']

In [61]:
# Re-seed so the frame holds the same 5x4 standard-normal draws as the array shown above.
np.random.seed(101)
df3 = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=list("ABCDE"),     # row labels A..E
    columns=list("WXYZ"),    # column labels W..Z
)
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [62]:
df3["W"]

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [63]:
df3.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [64]:
df3[["W"]]

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695
E,0.190794


In [66]:
df3[["W","Y"]]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813
D,0.188695,-0.933237
E,0.190794,2.605967


In [72]:
df3.loc[ : ,"W":"Y"]

Unnamed: 0,W,X,Y
A,2.70685,0.628133,0.907969
B,0.651118,-0.319318,-0.848077
C,-2.018168,0.740122,0.528813
D,0.188695,-0.758872,-0.933237
E,0.190794,1.978757,2.605967


In [73]:
df3.loc[ "B":"D" ,"W":"Y"]

Unnamed: 0,W,X,Y
B,0.651118,-0.319318,-0.848077
C,-2.018168,0.740122,0.528813
D,0.188695,-0.758872,-0.933237


In [74]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [75]:
df3["X"]

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [76]:
df3["Y"]

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [79]:
df3["X"] +  df3["Y"]**2

A    1.452541
B    0.399917
C    1.019766
D    0.112060
E    8.769823
dtype: float64

In [80]:
df3["Result"] = df3["W"]*df3["X"] + df3["Y"]**2 - df3["Z"]
df3

Unnamed: 0,W,X,Y,Z,Result
A,2.70685,0.628133,0.907969,0.503826,2.020844
B,0.651118,-0.319318,-0.848077,0.605965,-0.094644
C,-2.018168,0.740122,0.528813,-0.589001,-0.625047
D,0.188695,-0.758872,-0.933237,0.955057,-0.22732
E,0.190794,1.978757,2.605967,0.683509,6.485092


In [91]:
df3["Result"] = df3["W"]*df3["X"] + df3["Y"].std() - df3["Z"].mean() + 100
df3

Unnamed: 0,W,X,Y,Z,Result
A,2.70685,0.628133,0.907969,0.503826,102.722906
B,0.651118,-0.319318,-0.848077,0.605965,100.814731
C,-2.018168,0.740122,0.528813,-0.589001,99.528954
D,0.188695,-0.758872,-0.933237,0.955057,100.879449
E,0.190794,1.978757,2.605967,0.683509,101.40018


In [85]:
df3["Z"].mean()

0.43187119296667903

In [86]:
df3["Y"].std() 

1.4545158461817913

In [92]:
df3

Unnamed: 0,W,X,Y,Z,Result
A,2.70685,0.628133,0.907969,0.503826,102.722906
B,0.651118,-0.319318,-0.848077,0.605965,100.814731
C,-2.018168,0.740122,0.528813,-0.589001,99.528954
D,0.188695,-0.758872,-0.933237,0.955057,100.879449
E,0.190794,1.978757,2.605967,0.683509,101.40018


In [94]:
df3.drop(["Y","Result"],axis=1)

Unnamed: 0,W,X,Z
A,2.70685,0.628133,0.503826
B,0.651118,-0.319318,0.605965
C,-2.018168,0.740122,-0.589001
D,0.188695,-0.758872,0.955057
E,0.190794,1.978757,0.683509


In [95]:
# inplace=True mutates df3 itself: the next cell shows Y and Result are gone.
df3.drop(["Y","Result"],axis=1, inplace=True)

In [96]:
df3

Unnamed: 0,W,X,Z
A,2.70685,0.628133,0.503826
B,0.651118,-0.319318,0.605965
C,-2.018168,0.740122,-0.589001
D,0.188695,-0.758872,0.955057
E,0.190794,1.978757,0.683509


In [90]:
# Reassignment form of the drop; the in-place equivalent is
#   df3.drop("Result", axis=1, inplace=True)
# "Result" may already be missing (the in-place drop above removes it), so
# errors="ignore" keeps this cell re-runnable instead of raising KeyError.
df3 = df3.drop(columns="Result", errors="ignore")
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [98]:
# axis=0 drops by row label; inplace=True mutates df3 directly (rows C and E are gone in the next cell).
df3.drop(["C","E"],axis=0, inplace=True)

In [99]:
df3

Unnamed: 0,W,X,Z
A,2.70685,0.628133,0.503826
B,0.651118,-0.319318,0.605965
D,0.188695,-0.758872,0.955057


In [100]:
# Rebuild df3 from the same seed, since the in-place drops above removed rows and columns.
np.random.seed(101)
df3 = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=list("ABCDE"),
    columns=list("WXYZ"),
)
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


#### `.loc[]` → selects data using the **labels** (names) of rows (index) & columns; label slices include the end label

#### `.iloc[]` → selects data using the **integer positions** of rows & columns, like classic Python/NumPy indexing; slices exclude the end position

In [101]:
# 8x4 matrix of random integers in [1, 40), wrapped in a frame with the default 0..7 row index.
m = np.random.randint(low=1, high=40, size=(8, 4))
df4 = pd.DataFrame(data=m, columns=["var1", "var2", "var3", "var4"])
df4

Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [102]:
df4.loc[2]

var1    13
var2    18
var3    12
var4    16
Name: 2, dtype: int32

In [103]:
df4.loc[2:6]

Unnamed: 0,var1,var2,var3,var4
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23


In [104]:
# Replace the default 0..7 row index with the letter labels a..h.
df4.index = list("abcdefgh")
df4

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [106]:
df4.loc["c"]

var1    13
var2    18
var3    12
var4    16
Name: c, dtype: int32

In [108]:
df4.loc["c":"f"]

Unnamed: 0,var1,var2,var3,var4
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23


In [111]:
df4.loc["c":"f" , "var1":"var3"]

Unnamed: 0,var1,var2,var3
c,13,18,12
d,34,30,25
e,20,36,31
f,21,28,9


In [119]:
# Select the rows and an explicit list of columns in a single .loc call
# instead of chaining [] after .loc; the result is the same.
df4.loc["c":"f", ["var1", "var2", "var3"]]

Unnamed: 0,var1,var2,var3
c,13,18,12
d,34,30,25
e,20,36,31
f,21,28,9


In [122]:
df4

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [123]:
df4.iloc[1:4,2:4]

Unnamed: 0,var3,var4
b,16,1
c,12,16
d,25,37


In [129]:
df4.iloc[1:4,2:3]

Unnamed: 0,var3
b,16
c,12
d,25


In [130]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [136]:
df3.iloc[ : ,2:]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001
D,-0.933237,0.955057
E,2.605967,0.683509


In [137]:
df3.iloc[ 1 ,2:]

Y   -0.848077
Z    0.605965
Name: B, dtype: float64

### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [138]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [139]:
df3["X"] > 0

A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [140]:
df3[ df3["X"] > 0 ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [142]:
df3[ (df3["X"] > 0) & (df3["Z"]>0) ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
E,0.190794,1.978757,2.605967,0.683509


In [143]:
df3[ (df3.X > 0) & (df3.Z>0) ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
E,0.190794,1.978757,2.605967,0.683509


In [144]:
df3["X"] > 0

A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [145]:
df3[ [True,False,True,False,True] ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [148]:
# df3[ [True,False,True,False] ] # pass exactly as many booleans as there are index entries, i.e. len(df3)

In [154]:
df3[ df3.Z > 0   ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [153]:
# Filter rows (Z > 0) and pick column W in one .loc call instead of chained
# indexing; the list ["W"] keeps the result a one-column DataFrame.
df3.loc[df3.Z > 0, ["W"]]

Unnamed: 0,W
A,2.70685
B,0.651118
D,0.188695
E,0.190794


In [155]:
# Keep only the rows of df3 whose Z value is positive.
df_3 = df3[df3["Z"] > 0]
df_3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [156]:
df_3["W"]

A    2.706850
B    0.651118
D    0.188695
E    0.190794
Name: W, dtype: float64

In [158]:
(  df3[ df3.Z > 0  ]  ) .iloc[2:5,:]

Unnamed: 0,W,X,Y,Z
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
