### 0. Introduction

In [None]:
# This notebook is a basic pandas practice. It covers: creating data (building DataFrames from different types of values, assigning row labels with the index parameter, and creating Series from a DataFrame column or from a list); reading data files (loading a CSV file into a DataFrame, checking the size of the DataFrame, and examining its contents); and native accessors (accessing the columns of a DataFrame as attributes or with the indexing operator).

### 1. Getting started

In [2]:
#Import package
import pandas as pd

### 2. Creating data

In [15]:
# Build a DataFrame (a table with an individual entry for every row/column
# pair) from integers: each dict key becomes a column label and each list
# supplies that column's values, one per row.
df = pd.DataFrame(
    data={
        'Yes': [50, 21, 10, 15, 10, 80],
        'No': [131, 21, 14, 26, 10, 35],
    }
)

In [30]:
# Display the DataFrame: the value of a cell's last expression is shown as the cell's output
df

Unnamed: 0,Yes,No
0,50,131
1,21,21
2,10,14
3,15,26
4,10,10
5,80,35


In [14]:
# DataFrame values are not limited to numbers: here every column is a list of
# strings. No index is given, so the rows get the default labels 0, 1, 2.
water_quality = pd.DataFrame(
    data={
        'pH': ['Alkaline', 'Acid', 'Neutral'],
        'Oxygen Saturation': ['Poor', 'Moderate', 'Well Oxygenated'],
        'Conductivity': ['Brackish', 'Fresh', 'Saline'],
    }
)
water_quality

Unnamed: 0,pH,Oxygen Saturation,Conductivity
0,Alkaline,Poor,Brackish
1,Acid,Moderate,Fresh
2,Neutral,Well Oxygenated,Saline


In [39]:
# Same data as before, but with explicit row labels: the `index` parameter
# replaces the default 0, 1, 2 labels with one label per row.
water_quality = pd.DataFrame(
    data={
        'pH': ['Alkaline', 'Acid', 'Neutral'],
        'Oxygen Saturation': ['Poor', 'Moderate', 'Well Oxygenated'],
        'Conductivity': ['Brackish', 'Fresh', 'Saline'],
    },
    index=['Site 1', 'Site 2', 'Site 3'],
)
water_quality

Unnamed: 0,pH,Oxygen Saturation,Conductivity
Site 1,Alkaline,Poor,Brackish
Site 2,Acid,Moderate,Fresh
Site 3,Neutral,Well Oxygenated,Saline


In [43]:
# Create a Series by selecting one column of the DataFrame with the indexing operator [].
# A Series is a sequence of data values (a single column of a DataFrame); it keeps the
# DataFrame's row labels as its index and the column label as its name.
series = df['Yes']
series

0    50
1    21
2    10
3    15
4    10
5    80
Name: Yes, dtype: int64

In [44]:
# Select the 'pH' column of water_quality as a Series; its index is made of the
# DataFrame's row labels. Note that this rebinds the name `series`.
series = water_quality['pH']
series

Site 1    Alkaline
Site 2        Acid
Site 3     Neutral
Name: pH, dtype: object

In [46]:
# A Series can also be built directly from a list, without a DataFrame:
# `index` supplies the row labels and `name` labels the Series as a whole.
pd.Series(
    data=[45, 85, 120],
    index=['DO 2018', 'DO 2019', 'DO 2020'],
    name='Site 1',
)

DO 2018     45
DO 2019     85
DO 2020    120
Name: Site 1, dtype: int64

### 3. Reading data files

In [66]:
# Read a CSV file into a DataFrame. The path is relative, so the file is looked
# up from the notebook's working directory.
# NOTE: this rebinds `df`; the small Yes/No DataFrame created in section 2 is
# no longer reachable under that name after this cell runs.
df = pd.read_csv(filepath_or_buffer="winequality_merged.csv")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [68]:
# Check how large the DataFrame is: shape is a (number of rows, number of columns) tuple
df.shape

(6497, 13)

In [69]:
# Examine the contents of the DataFrame: head() grabs its first rows
# (five is the default, spelled out here for clarity).
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


### 4. Native accessors 

In [72]:
# Access a column as an attribute of the DataFrame (dot notation); the result is a Series
df.chlorides

0       0.076
1       0.098
2       0.092
3       0.075
4       0.076
        ...  
6492    0.039
6493    0.047
6494    0.041
6495    0.022
6496    0.020
Name: chlorides, Length: 6497, dtype: float64

In [1]:
# Attribute-style access (df.<name>) only works when the column name is a valid
# Python identifier. "total sulfur dioxide" contains spaces, so Python cannot even
# parse `df.total sulfur dioxide`: it is a SyntaxError, raised before pandas is involved.
# The expression is compiled from a string here so the error can be caught and shown
# without stopping a "Restart & Run All" at this cell.
invalid_access = "df.total sulfur dioxide"
try:
    compile(invalid_access, "<attribute-access-demo>", "eval")
except SyntaxError as err:
    print(f"SyntaxError: {err.msg}")

SyntaxError: invalid syntax (<ipython-input-1-8db7d8c0de40>, line 2)

In [75]:
# Access a column with the indexing operator []. The column name is passed as a string,
# so this form also works for names that are not valid Python identifiers
# (for example names containing spaces, such as "free sulfur dioxide").
df["free sulfur dioxide"]

0       11.0
1       25.0
2       15.0
3       17.0
4       11.0
        ... 
6492    24.0
6493    57.0
6494    30.0
6495    20.0
6496    22.0
Name: free sulfur dioxide, Length: 6497, dtype: float64