# Preprocessing and Aggregating data
- mainly Time series data
- sorting data

### Sorting the dataframe

In [1]:
import pandas as pd

In [2]:
# Student roster used by the sorting examples, written one row per student.
df = pd.DataFrame(
    [
        (102, 'Aravind', 'A', 22),
        (101, 'Rahul', 'B', 15),
        (104, 'Praveen', 'C', 10),
        (103, 'Priya', 'A', 23),
        (105, 'Rahul', 'C', 9),
    ],
    columns=['roll_no', 'name', 'grade', 'marks'],
)

In [3]:
# Display the frame as created: rows are in insertion order, not sorted by any column.
df

Unnamed: 0,roll_no,name,grade,marks
0,102,Aravind,A,22
1,101,Rahul,B,15
2,104,Praveen,C,10
3,103,Priya,A,23
4,105,Rahul,C,9


### sort the df by grades

In [4]:
# Order the rows alphabetically by grade; the result is a sorted copy, df is not modified.
df.sort_values(by='grade')

Unnamed: 0,roll_no,name,grade,marks
0,102,Aravind,A,22
3,103,Priya,A,23
1,101,Rahul,B,15
2,104,Praveen,C,10
4,105,Rahul,C,9


In [5]:
# Same idea with a numeric key: rows ordered by ascending roll number.
df.sort_values(by='roll_no')

Unnamed: 0,roll_no,name,grade,marks
1,101,Rahul,B,15
0,102,Aravind,A,22
3,103,Priya,A,23
2,104,Praveen,C,10
4,105,Rahul,C,9


##### Now I need grades from 'A' to 'C' and, within each grade, marks from highest to lowest

In [6]:
# Primary key: grade ascending (A first). Tie-breaker within a grade: marks descending.
# The n-th entry of `ascending` applies to the n-th entry of `by`.
df.sort_values(
    by=['grade', 'marks'],
    ascending=[True, False],
)

Unnamed: 0,roll_no,name,grade,marks
3,103,Priya,A,23
0,102,Aravind,A,22
1,101,Rahul,B,15
2,104,Praveen,C,10
4,105,Rahul,C,9


In [7]:
# Re-display df to check whether the sort_values calls so far changed it.
df

Unnamed: 0,roll_no,name,grade,marks
0,102,Aravind,A,22
1,101,Rahul,B,15
2,104,Praveen,C,10
3,103,Priya,A,23
4,105,Rahul,C,9


#### Note
- Even though we sorted the df by our criteria, the original df is unchanged, because `sort_values` returns a new sorted DataFrame. To modify `df` itself, pass `inplace=True`.

In [8]:
# inplace=True reorders df itself (the call returns None) instead of handing back a sorted copy.
df.sort_values(
    by=['grade', 'marks'],
    ascending=[True, False],
    inplace=True,
)

In [9]:
# df is now ordered by grade, then by marks descending; the index labels still show the old positions.
df

Unnamed: 0,roll_no,name,grade,marks
3,103,Priya,A,23
0,102,Aravind,A,22
1,101,Rahul,B,15
2,104,Praveen,C,10
4,105,Rahul,C,9


In [10]:
# With drop=False (the default) the old index labels are kept as a new 'index' column.
df.reset_index(drop=False)

Unnamed: 0,index,roll_no,name,grade,marks
0,3,103,Priya,A,23
1,0,102,Aravind,A,22
2,1,101,Rahul,B,15
3,2,104,Praveen,C,10
4,4,105,Rahul,C,9


In [11]:
# A plain reset_index() leaves two index-like columns (the new 0..n-1 index plus the
# old labels in an 'index' column). drop=True discards the old labels instead.

df.reset_index(drop=True, inplace=True)

In [12]:
# The index now runs 0..4 in the sorted order, with no leftover 'index' column.
df

Unnamed: 0,roll_no,name,grade,marks
0,103,Priya,A,23
1,102,Aravind,A,22
2,101,Rahul,B,15
3,104,Praveen,C,10
4,105,Rahul,C,9


## Working with multiple dataframes

In [18]:
# Second batch of students (roll numbers 106-110), same four columns as df.
df1 = pd.DataFrame(
    [
        (106, 'Aravindhan', 'A', 22),
        (108, 'Rahul Shing', 'B', 15),
        (110, 'Prakash', 'C', 10),
        (107, 'Pragya', 'A', 23),
        (109, 'Rajesh', 'C', 9),
    ],
    columns=['roll_no', 'name', 'grade', 'marks'],
)

In [19]:
# Third batch of students (roll numbers 111-115), same four columns as df.
df2 = pd.DataFrame(
    [
        (111, 'Ashwin', 'A', 22),
        (114, 'Girish', 'B', 15),
        (112, 'Jenish', 'C', 10),
        (115, 'Arun', 'A', 23),
        (113, 'Naren', 'C', 9),
    ],
    columns=['roll_no', 'name', 'grade', 'marks'],
)

In [20]:
# (rows, columns) of each frame; the column counts must agree for a clean row-wise concat.
tuple(frame.shape for frame in (df, df1, df2))

((5, 4), (5, 4), (5, 4))

In [21]:
# Peek at the first five rows (n=5 is head()'s default).
df2.head(n=5)

Unnamed: 0,roll_no,name,grade,marks
0,111,Ashwin,A,22
1,114,Girish,B,15
2,112,Jenish,C,10
3,115,Arun,A,23
4,113,Naren,C,9


In [26]:
# Gather the three frames, then stack them on top of each other (row-wise).
# Each frame keeps its own index labels, so the combined index contains repeats.
all_df = [df, df1, df2]

all_dataframe = pd.concat(objs=all_df, axis='index')

In [27]:
# Inspect the stacked frame; note that the index labels 0-4 repeat for each source frame.
all_dataframe

Unnamed: 0,roll_no,name,grade,marks
0,103,Priya,A,23
1,102,Aravind,A,22
2,101,Rahul,B,15
3,104,Praveen,C,10
4,105,Rahul,C,9
0,106,Aravindhan,A,22
1,108,Rahul Shing,B,15
2,110,Prakash,C,10
3,107,Pragya,A,23
4,109,Rajesh,C,9


In [28]:
# Replace the repeated labels with one continuous 0..n-1 index, discarding the old labels.
all_dataframe.reset_index(drop=True, inplace=True)

In [29]:
# The index is now continuous across all the stacked rows.
all_dataframe

Unnamed: 0,roll_no,name,grade,marks
0,103,Priya,A,23
1,102,Aravind,A,22
2,101,Rahul,B,15
3,104,Praveen,C,10
4,105,Rahul,C,9
5,106,Aravindhan,A,22
6,108,Rahul Shing,B,15
7,110,Prakash,C,10
8,107,Pragya,A,23
9,109,Rajesh,C,9


In [30]:
# Sanity check: 3 frames x 5 rows each = 15 rows, still 4 columns.
all_dataframe.shape

(15, 4)

## Adding another column to the dataframe

In [31]:
# One placeholder phone number per student. The length (15) must equal the number of
# rows in all_dataframe so that the column-wise concat lines up without introducing NaN.
# List repetition replaces 15 hand-typed copies of the same literal, which are easy to miscount.
phone_no = pd.DataFrame({'phone_no': [123456789] * 15})

In [32]:
# Preview the single-column frame before attaching it to all_dataframe.
phone_no

Unnamed: 0,phone_no
0,123456789
1,123456789
2,123456789
3,123456789
4,123456789
5,123456789
6,123456789
7,123456789
8,123456789
9,123456789


In [33]:
# Attach phone_no as a new column: axis=1 places the frames side by side, matching rows
# by index label. Any existing 'phone_no' column is dropped first so this cell is
# idempotent; re-running it replaces the column instead of appending a duplicate.
all_dataframe = pd.concat(
    [all_dataframe.drop(columns='phone_no', errors='ignore'), phone_no],
    axis=1,
)

In [34]:
# The frame now carries a fifth column, phone_no, alongside the original four.
all_dataframe

Unnamed: 0,roll_no,name,grade,marks,phone_no
0,103,Priya,A,23,123456789
1,102,Aravind,A,22,123456789
2,101,Rahul,B,15,123456789
3,104,Praveen,C,10,123456789
4,105,Rahul,C,9,123456789
5,106,Aravindhan,A,22,123456789
6,108,Rahul Shing,B,15,123456789
7,110,Prakash,C,10,123456789
8,107,Pragya,A,23,123456789
9,109,Rajesh,C,9,123456789


In [35]:
# Count missing values per column; all zeros means the column-wise concat matched every row.
all_dataframe.isna().sum()

roll_no     0
name        0
grade       0
marks       0
phone_no    0
dtype: int64