In [4]:
import pandas as pd

### Setting the Index Column Initially When Creating a New DataFrame & CSV to DataFrame

In [5]:
# Build a DataFrame from a dict: every key becomes a column label and
# every list holds the values of that column.
d = dict(
    country=['Brazil', 'Russia', 'India', 'China', 'South Africa'],
    capital=['Brasilia', 'Moscow', 'New Delhi', 'Beijing', 'Pretoria'],
)
brics = pd.DataFrame(data=d)
print(brics)  # rows carry the default integer index 0..4

# Swap the default integer index for custom row labels
# (pd.DataFrame(d, index=[...]) would set them at creation time instead).
brics.index = [
    'BR',
    'RU',
    'IN',
    'CH',
    'SA',
]
print(brics)

# pd.read_csv() returns a DataFrame; index_col=0 uses the first CSV column as the row labels
# (OR: index_col=["col0_name"]).
# NOTE: the file is fetched through raw.githubusercontent.com. A github.com/.../blob/... URL
# addresses GitHub's HTML viewer page rather than the CSV content, so read_csv cannot load it.
brics = pd.read_csv(
    "https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/main/Data/brics.csv",
    index_col=0,
)
brics

        country    capital
0        Brazil   Brasilia
1        Russia     Moscow
2         India  New Delhi
3         China    Beijing
4  South Africa   Pretoria
         country    capital
BR        Brazil   Brasilia
RU        Russia     Moscow
IN         India  New Delhi
CH         China    Beijing
SA  South Africa   Pretoria


HTTPError: HTTP Error 404: Not Found

# Copying DataFrames
Sometimes we want to copy a DataFrame into a separate object that we can prepare and clean (for example, before using it to train a machine learning model) without altering the original.

In [3]:
# Work on an independent copy of `brics` for transforming, cleaning, and similar tasks.
# A plain assignment (`brics_copy = brics`) would only bind a second name to the same
# DataFrame in memory, so changes made through `brics_copy` would also appear in `brics`,
# which is usually not what we want.
brics_copy = brics.copy(deep=True)  # deep=True is the default; spelled out for clarity

# Accessing in Pandas

- Use `[]`, `.loc[]`, and `.iloc[]` for general access.

- Use `.at[]` and `.iat[]` when performance is critical, and you need to access only one element.

- Use `.get()` when you need a safe access method that won't raise an error for missing labels.

# Subsetting and Slicing (PART 1)

In [4]:
# Column access with []: a single label returns a Series, i.e. a labeled
# one-dimensional array (a DataFrame is a collection of Series).
print(type(brics["country"]))
brics["country"]

<class 'pandas.core.series.Series'>


BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object

In [5]:
# Passing a *list* of labels to [] keeps the result a DataFrame, even for a single column.
print(type(brics[["country"]]))  # DataFrame, not Series
print(brics[["country"]])  # one-column sub-DataFrame
brics[["country", "capital"]]  # two-column sub-DataFrame

<class 'pandas.core.frame.DataFrame'>
         country
BR        Brazil
RU        Russia
IN         India
CH         China
SA  South Africa


Unnamed: 0,country,capital
BR,Brazil,Brasilia
RU,Russia,Moscow
IN,India,New Delhi
CH,China,Beijing
SA,South Africa,Pretoria


In [6]:
# Row access with []: a slice selects rows by position (here the rows at positions 1, 2 and 3;
# the stop position 4 is excluded).
brics[1:4]

Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0


### Discussion []
Square brackets `[]` offer only limited functionality.<br>
For more flexible selection, pandas provides:
- loc (label-based)
- iloc (integer position-based) <br>
**Advantages over `[]` access:** 
	- You can select rows by label (`loc`) or by integer position (`iloc`); with `[]` access, you cannot select a single row by its name (label)
	- You can return the rows in any order you choose (look at the example below)
	- The selected rows don't have to be consecutive (look at the example below)
	- You can select a specific cell 

In [7]:
# Row access with .loc (label-based)
print(brics.loc["RU"], "\n")  # single label -> Series
print(brics.loc[["RU"]], "\n")  # list holding one label -> DataFrame
brics.loc[["RU", "IN", "CH"]]  # sub-DataFrame; same as brics.loc[["RU", "IN", "CH"], :]
# - Rows come back in the order requested, e.g. brics.loc[["IN", "CH", "RU"]]
# - The requested labels don't have to be consecutive, e.g. brics.loc[["BR", "IN", "SA"]]
# - Rows and columns can be selected together (explained below):
#   brics.loc[["RU", "IN", "CH"], ["country", "capital"]]

country       Russia
capital       Moscow
area            17.1
population     143.5
Name: RU, dtype: object 

   country capital  area  population
RU  Russia  Moscow  17.1       143.5 



Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0


In [8]:
# Column access with .loc: ":" selects every row.
# A list of labels yields a DataFrame; a single label (brics.loc[:, "country"]) yields a Series.
print(brics.loc[:, ["country", "capital"]])

# Rows and columns together, much like indexing a NumPy array:
# DataFrame.loc[[row_labels], [col_labels]]
brics.loc[["RU", "CH", "IN"], ["country", "capital"]]

         country    capital
BR        Brazil   Brasilia
RU        Russia     Moscow
IN         India  New Delhi
CH         China    Beijing
SA  South Africa   Pretoria


Unnamed: 0,country,capital
RU,Russia,Moscow
CH,China,Beijing
IN,India,New Delhi


In [9]:
# Row & column access with .iloc: same pattern as .loc, but with integer positions
print(brics.iloc[:, [0, 1]], "\n")  # every row, columns at positions 0 and 1
# Use a slice for consecutive items and a list for specific ones
print(brics.iloc[1:4, [0, 2]], "\n")
brics.iloc[:3, :2]  # first 3 rows and first 2 columns

         country    capital
BR        Brazil   Brasilia
RU        Russia     Moscow
IN         India  New Delhi
CH         China    Beijing
SA  South Africa   Pretoria 

   country    area
RU  Russia  17.100
IN   India   3.286
CH   China   9.597 



Unnamed: 0,country,capital
BR,Brazil,Brasilia
RU,Russia,Moscow
IN,India,New Delhi


## Accessing DataFrames with MultiIndex using `loc`, `iloc`, and `xs`

In [10]:
# Two-level row index: a letter on the outer level and a number on the inner level
index = pd.MultiIndex.from_tuples(
    [
        ("A", 1),
        ("A", 2),
        ("B", 1),
        ("B", 2),
    ]
)
# One 'value' column, one entry per (letter, number) pair
df = pd.DataFrame(data={"value": [10, 20, 30, 40]}, index=index)
df

Unnamed: 0,Unnamed: 1,value
A,1,10
A,2,20
B,1,30
B,2,40


In [11]:
# Rows whose first (outer) index level equals "A" -- two equivalent spellings
print(df.loc["A"])
print(df.xs("A", level=0), "\n")

# Rows whose second (inner) index level equals 2
print(df.xs(2, level=1), "\n")

# The single row addressed by the full key ("A", 1); it prints as a Series
print(df.loc[("A", 1)])
print(df.xs(("A", 1)), "\n")

# The first-level label of every row
print(df.index.get_level_values(0))

   value
1     10
2     20
   value
1     10
2     20 

   value
A     20
B     40 

value    10
Name: (A, 1), dtype: int64
value    10
Name: (A, 1), dtype: int64 

Index(['A', 'A', 'B', 'B'], dtype='object')
