# Sorting and Ranking 
- organize data during EDA 
- helps find highest/lowest entry
- order data for visualization
- prepare for percentile analysis

In [1]:
import pandas as pd

# Sample employee table used by every sorting / ranking example below.
# Written row by row so each person's record reads as one line.
columns = ['Name', 'Age', 'City', 'Salary']
records = [
    ('Alice', 25, 'Pune', 50000),
    ('Bob', 30, 'Delhi', 60000),
    ('Charlie', 35, 'Mumbai', 70000),
    ('David', 40, 'Delhi', 80000),
    ('Eva', 29, 'Pune', 55000),
    ('Frank', 32, 'Delhi', 62000),
]

# Pivot the rows into a column-oriented mapping: column name -> list of values.
data = {name: list(values) for name, values in zip(columns, zip(*records))}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
1,Bob,30,Delhi,60000
2,Charlie,35,Mumbai,70000
3,David,40,Delhi,80000
4,Eva,29,Pune,55000
5,Frank,32,Delhi,62000


## Sorting

In [None]:
# Sort rows by a single column, largest Age first.
# sort_values returns a new DataFrame; df itself is left unchanged.
# The default algorithm is quicksort (kind='quicksort'); pass kind='mergesort'
# or kind='stable' when rows with equal keys must keep their original order.
df.sort_values('Age', ascending=False)


Unnamed: 0,Name,Age,City,Salary
3,David,40,Delhi,80000
2,Charlie,35,Mumbai,70000
5,Frank,32,Delhi,62000
1,Bob,30,Delhi,60000
4,Eva,29,Pune,55000
0,Alice,25,Pune,50000


In [6]:
# Reorder df itself, highest Salary first.
# With inplace=True the call returns None instead of a new frame,
# so df is displayed on its own line afterwards.
df.sort_values('Salary', ascending=False, inplace=True)
df

Unnamed: 0,Name,Age,City,Salary
3,David,40,Delhi,80000
2,Charlie,35,Mumbai,70000
5,Frank,32,Delhi,62000
1,Bob,30,Delhi,60000
4,Eva,29,Pune,55000
0,Alice,25,Pune,50000


In [None]:
# with missing val - NaN is placed last by default (na_position='last'),
# for both ascending and descending sorts

import numpy as np

# Cast to float first: NaN cannot be stored in an int64 column, and recent
# pandas versions warn about (and are slated to reject) the silent
# int -> float upcast that a direct assignment would trigger.
df['Salary'] = df['Salary'].astype('float64')
df.loc[2, 'Salary'] = np.nan  # make one value NaN for practice


# na_position='first' moves the NaN row to the top; the rest stay ascending
df_sorted = df.sort_values(by='Salary', na_position='first')
df_sorted

Unnamed: 0,Name,Age,City,Salary
2,Charlie,35,Mumbai,
0,Alice,25,Pune,50000.0
4,Eva,29,Pune,55000.0
1,Bob,30,Delhi,60000.0
5,Frank,32,Delhi,62000.0
3,David,40,Delhi,80000.0


In [10]:
# Sort rows by their index labels (axis=0 is the default axis).
# Returns a new DataFrame; df keeps its current row order.
df.sort_index(axis=0)

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000.0
1,Bob,30,Delhi,60000.0
2,Charlie,35,Mumbai,
3,David,40,Delhi,80000.0
4,Eva,29,Pune,55000.0
5,Frank,32,Delhi,62000.0


In [13]:
# Sort the columns alphabetically by their labels; row order is untouched.
df.sort_index(axis='columns')

Unnamed: 0,Age,City,Name,Salary
3,40,Delhi,David,80000.0
2,35,Mumbai,Charlie,
5,32,Delhi,Frank,62000.0
1,30,Delhi,Bob,60000.0
4,29,Pune,Eva,55000.0
0,25,Pune,Alice,50000.0


## Ranking
- assigns a rank to each value in a Series
- used for generating percentile-based features and determining relative position
- ranks are returned as floats, because tied values can share a fractional (average) rank and missing values get a NaN rank

In [None]:
# Default ranking, spelled out: the smallest salary gets rank 1, tied values
# share the average of their ranks, and a missing salary keeps a NaN rank.
df['Salary_rank'] = df['Salary'].rank(method='average', ascending=True)
df

Unnamed: 0,Name,Age,City,Salary,Salary_rank
3,David,40,Delhi,80000.0,5.0
2,Charlie,35,Mumbai,,
5,Frank,32,Delhi,62000.0,4.0
1,Bob,30,Delhi,60000.0,3.0
4,Eva,29,Pune,55000.0,2.0
0,Alice,25,Pune,50000.0,1.0


In [16]:
# Descending rank: the highest salary gets rank 1 (ties still averaged).
df['Salary_rank_desc'] = df['Salary'].rank(method='average', ascending=False)
df

Unnamed: 0,Name,Age,City,Salary,Salary_rank,Salary_rank_desc
3,David,40,Delhi,80000.0,5.0,1.0
2,Charlie,35,Mumbai,,,
5,Frank,32,Delhi,62000.0,4.0,2.0
1,Bob,30,Delhi,60000.0,3.0,3.0
4,Eva,29,Pune,55000.0,2.0,4.0
0,Alice,25,Pune,50000.0,1.0,5.0


### Handling ties
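- `average` - default; tied values share the mean of their ranks
- `min` - tied values all get the lowest rank in the group
- `max` - tied values all get the highest rank in the group
- `first` - ties are ranked by order of appearance in the Series
- `dense` - like `min`, but ranks are consecutive (no gaps after a tie)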

In [17]:
# method='min': tied values all receive the lowest rank of their group;
# ascending=False makes the highest salary rank 1.
# Column name fixed from the misspelled 'Salart_rank_min' so it matches
# the other Salary_rank* columns.
df['Salary_rank_min'] = df['Salary'].rank(method='min', ascending=False)
df

Unnamed: 0,Name,Age,City,Salary,Salary_rank,Salary_rank_desc,Salart_rank_min
3,David,40,Delhi,80000.0,5.0,1.0,1.0
2,Charlie,35,Mumbai,,,,
5,Frank,32,Delhi,62000.0,4.0,2.0,2.0
1,Bob,30,Delhi,60000.0,3.0,3.0,3.0
4,Eva,29,Pune,55000.0,2.0,4.0,4.0
0,Alice,25,Pune,50000.0,1.0,5.0,5.0
