In [None]:
# This notebook covers pandas fundamentals, in particular indexing, selecting and assigning data.

In [2]:
#Import package
import pandas as pd

# Native accessors

In [4]:
# Load the wine-quality CSV file into a new DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
wine = pd.read_csv("winequality_merged.csv")
# Bare last expression: the notebook renders the frame (long frames are shown truncated).
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [None]:
# There are two ways of selecting a specific Series out of a DataFrame:
# 1) accessing the column directly as an attribute,
# 2) accessing it with the indexing operator [].

In [5]:
# Attribute access: a column whose name is a valid Python identifier can be read
# as an attribute of the DataFrame; the result is a Series.
wine.chlorides

0       0.076
1       0.098
2       0.092
3       0.075
4       0.076
        ...  
6492    0.039
6493    0.047
6494    0.041
6495    0.022
6496    0.020
Name: chlorides, Length: 6497, dtype: float64

In [8]:
# Dictionary-style access: use the [] operator with the column label.
# This form is required when the label contains spaces, as it does here.
wine['free sulfur dioxide']

0       11.0
1       25.0
2       15.0
3       17.0
4       11.0
        ... 
6492    24.0
6493    57.0
6494    30.0
6495    20.0
6496    22.0
Name: free sulfur dioxide, Length: 6497, dtype: float64

In [9]:
# Reading a single value: select the column with [], then index the resulting Series.
# The frame uses the default 0..N-1 integer index, so 2 here is the row labelled 2.
wine['pH'][2]

3.26

# Indexing in Pandas

In [None]:
# pandas has its own accessor operators: loc and iloc.

### Index-based selection

In [10]:
# iloc selects by integer position: rows first, then columns.
# In the example below, ":" means all the rows and 2 selects the column at position 2 (in this case, "citric acid").
wine.iloc[:, 2]


0       0.00
1       0.00
2       0.04
3       0.56
4       0.00
        ... 
6492    0.29
6493    0.36
6494    0.19
6495    0.30
6496    0.38
Name: citric acid, Length: 6497, dtype: float64

In [11]:
# Select particular entries by position: rows 1, 2 and 3 (the stop value 4 is excluded)
# of the column at position 4.
wine.iloc[1:4, 4]

1    0.098
2    0.092
3    0.075
Name: chlorides, dtype: float64

In [12]:
# A list of positions also works: rows 0, 1, 3 and 7 of the column at position 6.
wine.iloc[[0,1,3,7], 6]

0    34.0
1    67.0
3    60.0
7    21.0
Name: total sulfur dioxide, dtype: float64

In [14]:
# Negative positions count from the end of the DataFrame: -10: selects the last 10 rows.
wine.iloc[-10:]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
6487,6.8,0.22,0.36,1.2,0.052,38.0,127.0,0.9933,3.04,0.54,9.2,5,0
6488,4.9,0.235,0.27,11.75,0.03,34.0,118.0,0.9954,3.07,0.5,9.4,6,0
6489,6.1,0.34,0.29,2.2,0.036,25.0,100.0,0.98938,3.06,0.44,11.8,6,0
6490,5.7,0.21,0.32,0.9,0.038,38.0,121.0,0.99074,3.24,0.46,10.6,6,0
6491,6.5,0.23,0.38,1.3,0.032,29.0,112.0,0.99298,3.29,0.54,9.7,5,0
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0
6496,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,0


### Label-based selection

In [15]:
# loc selects by label: the index value is what matters, not its position (as in iloc).
# iloc is conceptually simpler than loc because it ignores the dataset's index labels
# and treats the frame as a grid of positions.
# loc, by contrast, uses the information in the index (and the column names) to do its work.
wine.loc[3, 'density']

0.998

In [16]:
# loc with ":" for the rows and a list of column labels: all rows, only the listed
# columns, returned in the order given in the list.
wine.loc[:, ['alcohol', 'volatile acidity', 'quality']]

Unnamed: 0,alcohol,volatile acidity,quality
0,9.4,0.70,5
1,9.8,0.88,5
2,9.8,0.76,5
3,9.8,0.28,6
4,9.4,0.70,5
...,...,...,...
6492,11.2,0.21,6
6493,9.6,0.32,5
6494,9.4,0.24,6
6495,12.8,0.29,7


### Choosing between loc and iloc

In [18]:
# iloc follows Python's usual slicing scheme: the first element of the range is included
# and the last one is excluded, so 0:3 selects positions 0, 1 and 2.
wine.iloc[0:3]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1


In [19]:
# loc slices inclusively: 0:2 selects the labels 0, 1 and 2,
# i.e. the same three rows that iloc[0:3] returned.
wine.loc[0:2]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1


# Manipulating the index

In [21]:
# To change the labels in the index: use an existing column as the row index.
# set_index returns a new DataFrame; the result is only displayed here and not assigned,
# so `wine` itself keeps its original index.
wine.set_index("quality")

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red_wine
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1
5,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,1
5,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,1
6,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1
5,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0
5,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0
6,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0
7,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,0


# Conditional selection

In [24]:
# Conditional selection starts from a comparison, which produces a boolean Series
# with one True/False per row (this DataFrame has 6497 rows).
wine.pH <=3.5

0       False
1        True
2        True
3        True
4       False
        ...  
6492     True
6493     True
6494     True
6495     True
6496     True
Name: pH, Length: 6497, dtype: bool

In [33]:
# To bring two conditions together, combine them with & (logical AND);
# each condition needs its own parentheses.
# NOTE(review): both conditions test pH, and pH <= 3.34 already implies pH <= 3.50,
# so this returns the same rows as wine.loc[wine.pH <= 3.34].
wine.loc[(wine.pH<=3.34) & (wine.pH <=3.50)]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
6,7.9,0.60,0.06,1.6,0.069,15.0,59.0,0.99640,3.30,0.46,9.4,5,1
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.99590,3.28,0.54,9.2,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [36]:
# To accept either of two conditions, combine them with | (logical OR):
# a row is kept when at least one of the conditions is true.
wine.loc[(wine.quality>=5) | (wine.alcohol>9.8)]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [None]:
# To bring two conditions on different columns together, combine them with & (logical AND):
# rows with pH <= 3.34 that also have alcohol > 9.8.
wine.loc[(wine.pH<=3.34) & (wine.alcohol>9.8)]

In [42]:
# isin selector: keep the rows whose value "is in" a list of values
# (here the list holds a single value, so this keeps rows with quality equal to 7).
wine.loc[wine.quality.isin([7])]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
7,7.3,0.65,0.00,1.2,0.065,15.0,21.0,0.99460,3.39,0.47,10.00,7,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.99680,3.36,0.57,9.50,7,1
16,8.5,0.28,0.56,1.8,0.092,35.0,103.0,0.99690,3.30,0.75,10.50,7,1
37,8.1,0.38,0.28,2.1,0.066,13.0,30.0,0.99680,3.23,0.73,9.70,7,1
62,7.5,0.52,0.16,1.9,0.085,12.0,35.0,0.99680,3.38,0.62,9.50,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6469,6.1,0.32,0.28,6.6,0.021,29.0,132.0,0.99188,3.15,0.36,11.45,7,0
6475,6.2,0.38,0.42,2.5,0.038,34.0,117.0,0.99132,3.36,0.59,11.60,7,0
6485,6.2,0.21,0.28,5.7,0.028,45.0,121.0,0.99168,3.21,1.08,12.15,7,0
6486,6.2,0.41,0.22,1.9,0.023,5.0,56.0,0.98928,3.04,0.79,13.00,7,0


In [44]:
# notnull selector: keep the rows whose red_wine value is not missing (NaN).
# isnull() is the companion method that selects the missing values instead.
wine.loc[wine.red_wine.notnull()]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


# Assigning data

In [48]:
# Assigning a constant value: the single value is broadcast to every row of the column.
# NOTE(review): 'red_wine' already exists, so this overwrites the original 0/1 flag in place;
# re-run the read_csv cell to restore it.
wine['red_wine'] = 'Good_one'
wine['red_wine']

0       Good_one
1       Good_one
2       Good_one
3       Good_one
4       Good_one
          ...   
6492    Good_one
6493    Good_one
6494    Good_one
6495    Good_one
6496    Good_one
Name: red_wine, Length: 6497, dtype: object

In [49]:
# Assigning an iterable of values: it must supply one value per row.
# range(len(wine), 0, -1) counts down from the number of rows to 1 and is stored
# in a new column named 'index_backwards'.
wine['index_backwards'] = range(len(wine), 0, -1)
wine['index_backwards']

0       6497
1       6496
2       6495
3       6494
4       6493
        ... 
6492       5
6493       4
6494       3
6495       2
6496       1
Name: index_backwards, Length: 6497, dtype: int32