# Operations

pandas offers many operations that are really useful but don't fall into any distinct category. This lecture walks through them:

In [33]:
import pandas as pd
# Read the sales data into a DataFrame and preview its first five rows
# (five is the default row count for head()).
df = pd.read_csv('1_Sales.csv')
df.head()

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
4,Amy,North,May,24971.0,


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Sales,Units Sold
count,47.0,47.0
mean,24209.425532,206.617021
std,2671.207948,204.332269
min,19625.0,68.0
25%,22348.5,86.5
50%,24218.0,114.0
75%,25742.0,227.0
max,29953.0,852.0


### Info on Unique Values

In [3]:
# Array of the distinct values found in the SalesRep column.
df['SalesRep'].unique()

array(['Amy', 'Bob', 'Chuck', 'Doug'], dtype=object)

In [4]:
# Number of distinct values in the SalesRep column.
df['SalesRep'].nunique()

4

In [5]:
# Number of rows in which each distinct SalesRep value occurs.
df['SalesRep'].value_counts()

Bob      12
Doug     12
Amy      12
Chuck    12
Name: SalesRep, dtype: int64

### Selecting Data

In [6]:
# Filter on criteria from two columns at once: each comparison produces a
# boolean mask, and & keeps only the rows where both masks are True.
seldf = df.loc[(df['Month'] == 'Jan') & (df['Units Sold'] > 100)]

In [7]:
# Display the rows that satisfied both conditions.
seldf

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
12,Bob,North,Jan,20024.0,103.0


### Applying Functions

In [8]:
def times2(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [9]:
# apply() calls times2 on every element of the column; NaN entries stay NaN.
df['Units Sold'].apply(times2).head()

0    478.0
1    158.0
2    142.0
3    142.0
4      NaN
Name: Units Sold, dtype: float64

In [10]:
# Built-in functions work with apply() too: here, the length of each name.
df['SalesRep'].apply(len).head()

0    3
1    3
2    3
3    3
4    3
Name: SalesRep, dtype: int64

In [11]:
def compute_AUP(cols):
    """Return the first element of ``cols`` divided by the second.

    In this notebook it is applied row-wise to ``df[['Sales', 'Units Sold']]``,
    so the result is Sales / Units Sold for that row.

    Parameters
    ----------
    cols : pandas.Series, list or tuple
        Iterable whose first two elements are the numerator and the
        denominator, in that order. Any further elements are ignored.

    Returns
    -------
    The quotient of the first two elements. A NaN in either position
    yields NaN.
    """
    # Unpack by position rather than indexing with cols[0] / cols[1]: the rows
    # handed over by DataFrame.apply(axis=1) are Series labelled by column name,
    # and integer keys on such a Series are deprecated as positional lookups in
    # recent pandas releases.
    numerator, denominator, *_ = cols
    return numerator / denominator

In [12]:
# axis = 1 passes each row (a Series holding that row's Sales and Units Sold)
# to compute_AUP; the default axis = 0 would pass whole columns instead.
# The result is stored as a new 'AUP' column on df.
df['AUP'] = df[['Sales','Units Sold']].apply(compute_AUP, axis = 1)
df.head()

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold,AUP
0,Amy,North,Jan,23040.0,239.0,96.401674
1,Amy,North,Feb,24131.0,79.0,305.455696
2,Amy,North,Mar,24646.0,71.0,347.126761
3,Amy,North,Apr,22047.0,71.0,310.521127
4,Amy,North,May,24971.0,,


In [13]:
# Total of the Sales column; missing (NaN) entries are skipped by default.
df['Sales'].sum()

1137843.0

**Permanently Removing a Column**

In [14]:
# del removes the column from df itself (in place); no new DataFrame is created.
del df['AUP']

In [15]:
# Confirm that the AUP column is gone.
df.head()

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
4,Amy,North,May,24971.0,


**Get column and index names:**

In [16]:
# Index object holding the column labels.
df.columns

Index(['SalesRep', 'Region', 'Month', 'Sales', 'Units Sold'], dtype='object')

In [17]:
# Row labels; for this frame, a RangeIndex.
df.index

RangeIndex(start=0, stop=48, step=1)

**Sorting and Ordering a DataFrame:**

In [18]:
# Preview the unsorted frame for comparison with the sorted result below;
# head(10) keeps the output short instead of rendering the entire DataFrame.
df.head(10)

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
4,Amy,North,May,24971.0,
5,Amy,North,Jun,24218.0,92.0
6,Amy,North,Jul,25735.0,175.0
7,Amy,North,Aug,,87.0
8,Amy,North,Sep,25749.0,557.0
9,Amy,North,Oct,24437.0,95.0


In [19]:
# Sort the rows by the Sales column (ascending). With inplace=False, the
# default, this returns a new sorted DataFrame and leaves df unchanged.
df.sort_values(by='Sales')

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
33,Chuck,South,Oct,19625.0,83.0
34,Chuck,South,Nov,19832.0,70.0
24,Chuck,South,Jan,19886.0,95.0
12,Bob,North,Jan,20024.0,103.0
28,Chuck,South,May,20280.0,453.0
35,Chuck,South,Dec,20583.0,178.0
18,Bob,North,Jul,21184.0,68.0
31,Chuck,South,Aug,21273.0,769.0
32,Chuck,South,Sep,21584.0,114.0
26,Chuck,South,Mar,21824.0,83.0


**Find Null Values or Check for Null Values**

In [20]:
# Boolean DataFrame of the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,True
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,True,False
8,False,False,False,False,False
9,False,False,False,False,False


In [21]:
# Drop every row that contains at least one NaN value. The result is a new
# DataFrame; it is not assigned here, so df keeps its NaN rows.
df.dropna()

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
5,Amy,North,Jun,24218.0,92.0
6,Amy,North,Jul,25735.0,175.0
8,Amy,North,Sep,25749.0,557.0
9,Amy,North,Oct,24437.0,95.0
10,Amy,North,Nov,25355.0,706.0
11,Amy,North,Dec,25899.0,180.0


**Filling in NaN values with something else:**

In [22]:
import numpy as np

In [28]:
# Re-read the CSV so df starts again from the contents of the file,
# then preview the first ten rows (rows 4 and 7 contain NaN).
df = pd.read_csv('1_Sales.csv')
df.head(10)

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
4,Amy,North,May,24971.0,
5,Amy,North,Jun,24218.0,92.0
6,Amy,North,Jul,25735.0,175.0
7,Amy,North,Aug,,87.0
8,Amy,North,Sep,25749.0,557.0
9,Amy,North,Oct,24437.0,95.0


In [29]:
# Replace missing Sales values with the column mean (mean() ignores NaN)
# and write the filled column back onto df.
df['Sales'] = df['Sales'].fillna(df['Sales'].mean())

In [30]:
# Row 7 now shows the mean in Sales; Units Sold in row 4 is still NaN.
df.head(10)

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
4,Amy,North,May,24971.0,
5,Amy,North,Jun,24218.0,92.0
6,Amy,North,Jul,25735.0,175.0
7,Amy,North,Aug,24209.425532,87.0
8,Amy,North,Sep,25749.0,557.0
9,Amy,North,Oct,24437.0,95.0


## Pivot

In [34]:
# Preview the long-format frame before pivoting it.
df.head()

Unnamed: 0,SalesRep,Region,Month,Sales,Units Sold
0,Amy,North,Jan,23040.0,239.0
1,Amy,North,Feb,24131.0,79.0
2,Amy,North,Mar,24646.0,71.0
3,Amy,North,Apr,22047.0,71.0
4,Amy,North,May,24971.0,


In [35]:
# Reshape to one row per Month and one column per SalesRep, with Sales as the
# cell values. pivot_table aggregates with the mean by default when several
# rows share a cell; the Month index is sorted alphabetically, not by calendar.
# NOTE(review): the saved output shows NaN for Amy/Aug, so df was re-read from
# the CSV (execution counts 33-35) after the fillna cell. A fresh top-to-bottom
# run would show the mean-filled value instead -- confirm the intended state.
df.pivot_table(values='Sales',index=['Month'],columns=['SalesRep'])

SalesRep,Amy,Bob,Chuck,Doug
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Apr,22047.0,22838.0,22058.0,29338.0
Aug,,23174.0,21273.0,29506.0
Dec,25899.0,23179.0,20583.0,28670.0
Feb,24131.0,23822.0,23494.0,29953.0
Jan,23040.0,20024.0,19886.0,26264.0
Jul,25735.0,21184.0,23032.0,25044.0
Jun,24218.0,24733.0,23965.0,27371.0
Mar,24646.0,24854.0,21824.0,25041.0
May,24971.0,25320.0,20280.0,25150.0
Nov,25355.0,23949.0,19832.0,25953.0
