# Pandas

## Operations

In [132]:
# Import pandas
import pandas as pd

In [133]:
# Column-oriented dictionary (column name -> list of values) that will be
# converted into a dataframe
data = dict(
    Department=['Finance', 'Admin', 'HR', 'Development', 'HR', 'Finance'],
    Person=['Jack', 'John', 'Amy', 'Jane', 'Sam', 'Harry'],
    Salary=[20000, 40000, 20000, 50000, 34000, 70000],
)

In [134]:
# Build the dataframe from the dictionary: each key becomes a column
df = pd.DataFrame(data=data)

In [135]:
# Display the whole dataframe (it only has six rows)
df

Unnamed: 0,Department,Person,Salary
0,Finance,Jack,20000
1,Admin,John,40000
2,HR,Amy,20000
3,Development,Jane,50000
4,HR,Sam,34000
5,Finance,Harry,70000


In [136]:
# Display the first five rows of the dataframe (head() defaults to n=5)
df.head()

Unnamed: 0,Department,Person,Salary
0,Finance,Jack,20000
1,Admin,John,40000
2,HR,Amy,20000
3,Development,Jane,50000
4,HR,Sam,34000


In [137]:
# Display the last five rows of the dataframe (tail() defaults to n=5)
df.tail()

Unnamed: 0,Department,Person,Salary
1,Admin,John,40000
2,HR,Amy,20000
3,Development,Jane,50000
4,HR,Sam,34000
5,Finance,Harry,70000


In [140]:
# Display summary statistics of the dataframe; by default only the
# numeric column (Salary) is summarised
df.describe()

Unnamed: 0,Salary
count,6.0
mean,39000.0
std,19131.12647
min,20000.0
25%,23500.0
50%,37000.0
75%,47500.0
max,70000.0


In [142]:
# Print a concise summary of the dataframe: index range, column dtypes,
# non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Department  6 non-null      object
 1   Person      6 non-null      object
 2   Salary      6 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes


Info on Unique Values

In [143]:
# Display the distinct values of the Department column (returned as an array)
df['Department'].unique()

array(['Finance', 'Admin', 'HR', 'Development'], dtype=object)

In [144]:
# Display how many distinct values the Department column contains
df['Department'].nunique()

4

In [146]:
# Display the number of rows for each department, most frequent first
df['Department'].value_counts()

Finance        2
HR             2
Admin          1
Development    1
Name: Department, dtype: int64

Selecting data from multiple columns

In [147]:
# Select employees from HR or Finance whose salary is greater than 20000;
# isin() tests membership in the department list in a single step
df[df['Department'].isin(['HR', 'Finance']) & (df['Salary'] > 20000)]

Unnamed: 0,Department,Person,Salary
4,HR,Sam,34000
5,Finance,Harry,70000


### Applying Function

If we want to apply a function to certain columns, then we can do so using the apply() function. We should define the function to be applied beforehand.

In [162]:
# Let's create a function to give 10 percent of the salary as bonus to every employee
def bonus(x):
    """Return the bonus for a salary ``x``, i.e. ten percent of it."""
    rate = 0.1
    return rate * x

In [163]:
# Apply the bonus function to every value of the Salary column and store the
# result in a new df column called Bonus (the column is added to df itself)
df['Bonus'] = df['Salary'].apply(bonus)

In [164]:
# Display the df, which now includes the Bonus column
df

Unnamed: 0,Department,Person,Salary,Bonus
0,Finance,Jack,20000,2000.0
1,Admin,John,40000,4000.0
2,HR,Amy,20000,2000.0
3,Development,Jane,50000,5000.0
4,HR,Sam,34000,3400.0
5,Finance,Harry,70000,7000.0


In [165]:
# Calculate the total amount being spent on bonus across all employees
df['Bonus'].sum()

23400.0

In [167]:
# Total bonus spent per department: group the rows by Department first,
# then sum only the Bonus column (double brackets keep the result a dataframe)
df.groupby("Department")[['Bonus']].sum()

Unnamed: 0_level_0,Bonus
Department,Unnamed: 1_level_1
Admin,4000.0
Development,5000.0
Finance,9000.0
HR,5400.0


Get column and index names

In [168]:
# Display the names of all columns in the dataframe (returned as an Index object)
df.columns

Index(['Department', 'Person', 'Salary', 'Bonus'], dtype='object')

In [169]:
# Display the row index of the dataframe (here a RangeIndex from 0 to 5)
df.index

RangeIndex(start=0, stop=6, step=1)

Sorting and Ordering a DataFrame

In [170]:
# Display the df again to see the row order before sorting
df

Unnamed: 0,Department,Person,Salary,Bonus
0,Finance,Jack,20000,2000.0
1,Admin,John,40000,4000.0
2,HR,Amy,20000,2000.0
3,Development,Jane,50000,5000.0
4,HR,Sam,34000,3400.0
5,Finance,Harry,70000,7000.0


In [171]:
# Sort the rows by the values in the Person column.
# inplace=True modifies df itself and returns None, so this cell shows no
# output; with inplace=False a sorted copy is returned and df is unchanged.
df.sort_values(by = "Person", inplace = True) # Default inplace = False

In [172]:
# To sort the data by index
# ascending: bool=True by default
# Use sort_index()
# Rows keep their index labels when sorted by value, so sorting by index
# here puts df back into index order 0..5
df.sort_index(inplace = True)

In [173]:
# Display the df again: after sort_index() the rows are in index order 0..5
df

Unnamed: 0,Department,Person,Salary,Bonus
0,Finance,Jack,20000,2000.0
1,Admin,John,40000,4000.0
2,HR,Amy,20000,2000.0
3,Development,Jane,50000,5000.0
4,HR,Sam,34000,3400.0
5,Finance,Harry,70000,7000.0


## Merging, Joining and Concatenating

In [117]:
# Create left and right df for understanding merging, joining and concatenating.
# Both frames carry the same 'key' column (K0..K3) so they can be matched on it.
left_df = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))

right_df = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))

In [118]:
# Display the left df
left_df

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [119]:
# Display the right df (same key values as the left df, with columns C and D)
right_df

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K3,C3,D3


### Concatenation
Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use pd.concat and pass in a list of DataFrames to concatenate together:

In [120]:
# Use concat() to concat two dfs


In [121]:
# use concat() with axis=1 to concat in another dimension


### Merging
The merge function allows us to merge DataFrames together using logic similar to that of merging SQL tables.

In [122]:
# Display the left df again


In [123]:
# Display the right df again


In [124]:
# Inner Merge left and right df using merge() on key


In [125]:
# Inner Merge left and right df using merge() on key


In [126]:
# Inner Merge left and right df using merge() on key


### Joining

Joining is a convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.

In [127]:
# Create left and right df whose row labels (the index) only partly overlap:
# K0 and K2 appear in both, K1 only on the left, K3 only on the right
left_df = pd.DataFrame(
    data=dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']),
    index=['K0', 'K1', 'K2'],
)

right_df = pd.DataFrame(
    data=dict(C=['C0', 'C2', 'C3'], D=['D0', 'D2', 'D3']),
    index=['K0', 'K2', 'K3'],
)


In [128]:
# Display the left df


In [129]:
# Display the right df


In [130]:
# Inner Join right with left 


In [131]:
# Outer join right with left
