In [2]:
# Standard data science libraries
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
plt.style.use('bmh')
# Options for pandas
pd.options.display.max_columns = 20
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


## Exercise 1 ##
- import the NaN constant from `numpy` and create a nickname for it such as 'NA'
- create a `Series` object using a list, using the nickname to include some missing values
- create a new boolean series indicating which values are missing
- use the boolean series to count the number of missing values
- calculate the number of values that are not missing 

In [9]:
from numpy import nan as Missing
s1=pd.Series([2,4,Missing,8,Missing,Missing,Missing,16,18])
s1
s2=s1.isnull()
s2
s2.sum()
s1.notnull().sum()

0     2.0
1     4.0
2     NaN
3     8.0
4     NaN
5     NaN
6     NaN
7    16.0
8    18.0
dtype: float64

0    False
1    False
2     True
3    False
4     True
5     True
6     True
7    False
8    False
dtype: bool

4

5

## Exercise 2 ##
- create a `Series` object using a list
- use the index label to set some elements to missing using `None`
- select only elements that are not missing values
- create a new series that excludes these missing values

In [19]:
s3=pd.Series([1,3,5,7,9,11,13,15,17,19,21])
s3
s3[[2,4,7,8]]=None
s3
s3.dropna()
s3
s4=s3.dropna()
s4

0      1
1      3
2      5
3      7
4      9
5     11
6     13
7     15
8     17
9     19
10    21
dtype: int64

0      1.0
1      3.0
2      NaN
3      7.0
4      NaN
5     11.0
6     13.0
7      NaN
8      NaN
9     19.0
10    21.0
dtype: float64

0      1.0
1      3.0
3      7.0
5     11.0
6     13.0
9     19.0
10    21.0
dtype: float64

0      1.0
1      3.0
2      NaN
3      7.0
4      NaN
5     11.0
6     13.0
7      NaN
8      NaN
9     19.0
10    21.0
dtype: float64

0      1.0
1      3.0
3      7.0
5     11.0
6     13.0
9     19.0
10    21.0
dtype: float64

## Exercise 3 ##
- create a `DataFrame` object, then set some subsets of values in different rows and columns to missing using `.loc[]` and `.iloc[]`
- replace all the missing values in one row with the same value in place
- replace all the missing values in one column with a different value in place
- remove one of the columns with missing values in place using `.loc[]`
- remove one of the columns with missing values in place using a dictionary
- create a new dataframe by replacing() all remaining missing values with the mean value for their column (not in place)

In [62]:
df1=pd.DataFrame({\
                  'x1': range(10),
                  'x2': range(10,0,-1),
                  'x3': range(10,110,10),
                  'x4': range(75,25,-5),
                  'x5': range(2,22,2),
                  'x6': range(60,30,-3),
                  'x7': range(4,44,4),
                  'x8': range(90,80,-1),
                  'x9': range(50,60,1),
                  'x10': range(0,-10,-1)
                 })
df1
df1.loc[2,'x5':]=np.nan
df1.loc[4,:'x4']=np.nan
df1.loc[6:,'x3']=np.nan
df1.iloc[1,1]=np.nan
df1.iloc[8,1]=np.nan
df1
df1.loc[2].fillna(-10,inplace=True)
df1
df1['x4'].fillna(0,inplace=True)
df1
df1.fillna({'x2':99},inplace=True)
df1
df2=df1.fillna(df1.mean())
df2

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0,10,10,75,2,60,4,90,50,0
1,1,9,20,70,4,57,8,89,51,-1
2,2,8,30,65,6,54,12,88,52,-2
3,3,7,40,60,8,51,16,87,53,-3
4,4,6,50,55,10,48,20,86,54,-4
5,5,5,60,50,12,45,24,85,55,-5
6,6,4,70,45,14,42,28,84,56,-6
7,7,3,80,40,16,39,32,83,57,-7
8,8,2,90,35,18,36,36,82,58,-8
9,9,1,100,30,20,33,40,81,59,-9


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0.0,10.0,10.0,75.0,2.0,60.0,4.0,90.0,50.0,0.0
1,1.0,,20.0,70.0,4.0,57.0,8.0,89.0,51.0,-1.0
2,2.0,8.0,30.0,65.0,,,,,,
3,3.0,7.0,40.0,60.0,8.0,51.0,16.0,87.0,53.0,-3.0
4,,,,,10.0,48.0,20.0,86.0,54.0,-4.0
5,5.0,5.0,60.0,50.0,12.0,45.0,24.0,85.0,55.0,-5.0
6,6.0,4.0,,45.0,14.0,42.0,28.0,84.0,56.0,-6.0
7,7.0,3.0,,40.0,16.0,39.0,32.0,83.0,57.0,-7.0
8,8.0,,,35.0,18.0,36.0,36.0,82.0,58.0,-8.0
9,9.0,1.0,,30.0,20.0,33.0,40.0,81.0,59.0,-9.0


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0.0,10.0,10.0,75.0,2.0,60.0,4.0,90.0,50.0,0.0
1,1.0,,20.0,70.0,4.0,57.0,8.0,89.0,51.0,-1.0
2,2.0,8.0,30.0,65.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
3,3.0,7.0,40.0,60.0,8.0,51.0,16.0,87.0,53.0,-3.0
4,,,,,10.0,48.0,20.0,86.0,54.0,-4.0
5,5.0,5.0,60.0,50.0,12.0,45.0,24.0,85.0,55.0,-5.0
6,6.0,4.0,,45.0,14.0,42.0,28.0,84.0,56.0,-6.0
7,7.0,3.0,,40.0,16.0,39.0,32.0,83.0,57.0,-7.0
8,8.0,,,35.0,18.0,36.0,36.0,82.0,58.0,-8.0
9,9.0,1.0,,30.0,20.0,33.0,40.0,81.0,59.0,-9.0


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0.0,10.0,10.0,75.0,2.0,60.0,4.0,90.0,50.0,0.0
1,1.0,,20.0,70.0,4.0,57.0,8.0,89.0,51.0,-1.0
2,2.0,8.0,30.0,65.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
3,3.0,7.0,40.0,60.0,8.0,51.0,16.0,87.0,53.0,-3.0
4,,,,0.0,10.0,48.0,20.0,86.0,54.0,-4.0
5,5.0,5.0,60.0,50.0,12.0,45.0,24.0,85.0,55.0,-5.0
6,6.0,4.0,,45.0,14.0,42.0,28.0,84.0,56.0,-6.0
7,7.0,3.0,,40.0,16.0,39.0,32.0,83.0,57.0,-7.0
8,8.0,,,35.0,18.0,36.0,36.0,82.0,58.0,-8.0
9,9.0,1.0,,30.0,20.0,33.0,40.0,81.0,59.0,-9.0


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0.0,10.0,10.0,75.0,2.0,60.0,4.0,90.0,50.0,0.0
1,1.0,99.0,20.0,70.0,4.0,57.0,8.0,89.0,51.0,-1.0
2,2.0,8.0,30.0,65.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
3,3.0,7.0,40.0,60.0,8.0,51.0,16.0,87.0,53.0,-3.0
4,,99.0,,0.0,10.0,48.0,20.0,86.0,54.0,-4.0
5,5.0,5.0,60.0,50.0,12.0,45.0,24.0,85.0,55.0,-5.0
6,6.0,4.0,,45.0,14.0,42.0,28.0,84.0,56.0,-6.0
7,7.0,3.0,,40.0,16.0,39.0,32.0,83.0,57.0,-7.0
8,8.0,99.0,,35.0,18.0,36.0,36.0,82.0,58.0,-8.0
9,9.0,1.0,,30.0,20.0,33.0,40.0,81.0,59.0,-9.0


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0.0,10.0,10.0,75.0,2.0,60.0,4.0,90.0,50.0,0.0
1,1.0,99.0,20.0,70.0,4.0,57.0,8.0,89.0,51.0,-1.0
2,2.0,8.0,30.0,65.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
3,3.0,7.0,40.0,60.0,8.0,51.0,16.0,87.0,53.0,-3.0
4,4.555556,99.0,32.0,0.0,10.0,48.0,20.0,86.0,54.0,-4.0
5,5.0,5.0,60.0,50.0,12.0,45.0,24.0,85.0,55.0,-5.0
6,6.0,4.0,32.0,45.0,14.0,42.0,28.0,84.0,56.0,-6.0
7,7.0,3.0,32.0,40.0,16.0,39.0,32.0,83.0,57.0,-7.0
8,8.0,99.0,32.0,35.0,18.0,36.0,36.0,82.0,58.0,-8.0
9,9.0,1.0,32.0,30.0,20.0,33.0,40.0,81.0,59.0,-9.0


## Exercise 4 ##
- load data from a file that includes some missing values into a `DataFrame`
- drop any rows with all values missing
- drop columns that have any values missing 

In [74]:
df3=pd.read_csv('data/national-participation-data.csv',
                     na_values={
                         'agency_count_nibrs_submitting':['NA'],
                         'agency_count_leoka_submitting':['NA','.','0',''],
                         'nibrs_population_covered':['']
                     })
df3

df3.loc[37:].fillna(df3.max(),inplace=True)
df3
df3=df3.dropna(how='all')
df3
df3=df3.dropna(how='any',axis=1)
df3

Unnamed: 0,data_year,population,total_agency_count,published_agency_count,active_agency_count,covered_agency_count,population_covered,agency_count_nibrs_submitting,agency_count_leoka_submitting,agency_count_pe_submitting,agency_count_srs_submitting,agency_count_supp_submitting,nibrs_population_covered
0,2018,339044592,18815,16609,22142,24,16441,7610.0,7158.0,16615,9236,15742,37.072662
1,2017,336664398,18636,16536,21960,29,66250,7098.0,6702.0,16538,9397,15744,32.005703
2,2016,333934559,19088,17547,22269,32,112713,7059.0,6730.0,17550,10042,16349,31.335455
3,2015,332627419,19136,17502,22302,21,57357,6804.0,6592.0,17504,10135,16274,29.822275
4,2014,328133558,19311,17439,22256,7,32102,6672.0,6407.0,17441,10064,16095,29.035229
5,2013,325359020,19248,17490,22137,19,28061,6468.0,5826.0,17492,10236,16175,28.74625
6,2012,323754843,18936,17248,21764,18,56701,6330.0,5366.0,17251,10381,16010,28.514172
7,2011,322276014,19078,17298,21828,11,28605,6069.0,5368.0,17301,10519,15874,27.729992
8,2010,318512605,19044,17342,21746,7,14172,5954.0,5156.0,17346,10608,15843,27.044128
9,2009,316298981,18721,16636,21345,40,16269,5903.0,4740.0,16640,10122,15635,26.298717


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Unnamed: 0,data_year,population,total_agency_count,published_agency_count,active_agency_count,covered_agency_count,population_covered,agency_count_nibrs_submitting,agency_count_leoka_submitting,agency_count_pe_submitting,agency_count_srs_submitting,agency_count_supp_submitting,nibrs_population_covered
0,2018,339044592,18815,16609,22142,24,16441,7610.0,7158.0,16615,9236,15742,37.072662
1,2017,336664398,18636,16536,21960,29,66250,7098.0,6702.0,16538,9397,15744,32.005703
2,2016,333934559,19088,17547,22269,32,112713,7059.0,6730.0,17550,10042,16349,31.335455
3,2015,332627419,19136,17502,22302,21,57357,6804.0,6592.0,17504,10135,16274,29.822275
4,2014,328133558,19311,17439,22256,7,32102,6672.0,6407.0,17441,10064,16095,29.035229
5,2013,325359020,19248,17490,22137,19,28061,6468.0,5826.0,17492,10236,16175,28.74625
6,2012,323754843,18936,17248,21764,18,56701,6330.0,5366.0,17251,10381,16010,28.514172
7,2011,322276014,19078,17298,21828,11,28605,6069.0,5368.0,17301,10519,15874,27.729992
8,2010,318512605,19044,17342,21746,7,14172,5954.0,5156.0,17346,10608,15843,27.044128
9,2009,316298981,18721,16636,21345,40,16269,5903.0,4740.0,16640,10122,15635,26.298717


Unnamed: 0,data_year,population,total_agency_count,published_agency_count,active_agency_count,covered_agency_count,population_covered,agency_count_nibrs_submitting,agency_count_leoka_submitting,agency_count_pe_submitting,agency_count_srs_submitting,agency_count_supp_submitting,nibrs_population_covered
0,2018,339044592,18815,16609,22142,24,16441,7610.0,7158.0,16615,9236,15742,37.072662
1,2017,336664398,18636,16536,21960,29,66250,7098.0,6702.0,16538,9397,15744,32.005703
2,2016,333934559,19088,17547,22269,32,112713,7059.0,6730.0,17550,10042,16349,31.335455
3,2015,332627419,19136,17502,22302,21,57357,6804.0,6592.0,17504,10135,16274,29.822275
4,2014,328133558,19311,17439,22256,7,32102,6672.0,6407.0,17441,10064,16095,29.035229
5,2013,325359020,19248,17490,22137,19,28061,6468.0,5826.0,17492,10236,16175,28.74625
6,2012,323754843,18936,17248,21764,18,56701,6330.0,5366.0,17251,10381,16010,28.514172
7,2011,322276014,19078,17298,21828,11,28605,6069.0,5368.0,17301,10519,15874,27.729992
8,2010,318512605,19044,17342,21746,7,14172,5954.0,5156.0,17346,10608,15843,27.044128
9,2009,316298981,18721,16636,21345,40,16269,5903.0,4740.0,16640,10122,15635,26.298717


Unnamed: 0,data_year,population,total_agency_count,published_agency_count,active_agency_count,covered_agency_count,population_covered,agency_count_pe_submitting,agency_count_srs_submitting,agency_count_supp_submitting
0,2018,339044592,18815,16609,22142,24,16441,16615,9236,15742
1,2017,336664398,18636,16536,21960,29,66250,16538,9397,15744
2,2016,333934559,19088,17547,22269,32,112713,17550,10042,16349
3,2015,332627419,19136,17502,22302,21,57357,17504,10135,16274
4,2014,328133558,19311,17439,22256,7,32102,17441,10064,16095
5,2013,325359020,19248,17490,22137,19,28061,17492,10236,16175
6,2012,323754843,18936,17248,21764,18,56701,17251,10381,16010
7,2011,322276014,19078,17298,21828,11,28605,17301,10519,15874
8,2010,318512605,19044,17342,21746,7,14172,17346,10608,15843
9,2009,316298981,18721,16636,21345,40,16269,16640,10122,15635


## Exercise 5 ##
- load data from a file that includes some missing values into a `DataFrame`
- use the `thresh=` parameter to drop records with less than a minimum number of non-missing values
- use a dictionary to replace in the rest of the missing values

In [16]:
df4=pd.read_csv('data/missing-values.csv',
                     na_values=np.nan)
df4
df4=df4.dropna(thresh=4)
df4
df4=df4.fillna({'count':0,'sum':df4['sum'].mean(),'exposure':df4['exposure'].min(),'users':1})
df4

Unnamed: 0,year,month,count,sum,exposure,users
0,2010,Jan,67830.0,621521.0,812240.0,335.0
1,2010,Feb,93099.0,178392.0,,
2,2010,Mar,310739.0,707974.0,396605.0,604.0
3,2010,Apr,116191.0,499533.0,700482.0,888.0
4,2010,May,48907.0,45271.0,546622.0,
5,2010,Jun,243890.0,467405.0,201393.0,703.0
6,2010,Jul,284681.0,431272.0,640114.0,964.0
7,2010,Aug,66161.0,,,
8,2010,Sep,54391.0,748262.0,475010.0,90.0
9,2010,Oct,28566.0,187409.0,409017.0,688.0


Unnamed: 0,year,month,count,sum,exposure,users
0,2010,Jan,67830.0,621521.0,812240.0,335.0
1,2010,Feb,93099.0,178392.0,,
2,2010,Mar,310739.0,707974.0,396605.0,604.0
3,2010,Apr,116191.0,499533.0,700482.0,888.0
4,2010,May,48907.0,45271.0,546622.0,
5,2010,Jun,243890.0,467405.0,201393.0,703.0
6,2010,Jul,284681.0,431272.0,640114.0,964.0
8,2010,Sep,54391.0,748262.0,475010.0,90.0
9,2010,Oct,28566.0,187409.0,409017.0,688.0
10,2010,Nov,406079.0,468649.0,59241.0,365.0


Unnamed: 0,year,month,count,sum,exposure,users
0,2010,Jan,67830.0,621521.0,812240.0,335.0
1,2010,Feb,93099.0,178392.0,59241.0,1.0
2,2010,Mar,310739.0,707974.0,396605.0,604.0
3,2010,Apr,116191.0,499533.0,700482.0,888.0
4,2010,May,48907.0,45271.0,546622.0,1.0
5,2010,Jun,243890.0,467405.0,201393.0,703.0
6,2010,Jul,284681.0,431272.0,640114.0,964.0
8,2010,Sep,54391.0,748262.0,475010.0,90.0
9,2010,Oct,28566.0,187409.0,409017.0,688.0
10,2010,Nov,406079.0,468649.0,59241.0,365.0


## Exercise 6 ##
- create a `Series` object with duplicate values
- create a new boolean series indicating duplicate values
- drop all instances of the duplicate records from the series
- create a `DataFrame` object with duplicate values in in multiple columns
- drop rows that have duplicate values in one the the columns, specifying which record to keep
- check for any remaining duplicates by creating a boolean series
- map one of the columns using a dictionary


In [33]:
s5=pd.Series([5,10,5,15,20,5,10,25,30,35,5])
s5
s5.duplicated()
s5=s5.drop_duplicates()
s5

df5=pd.DataFrame({
                'x1':[0,1,2,3,4,5,6,0,7],
                'x2':[0,0,2,2,4,4,5,0,7],
                'x3':[0,-1,2,-3,4,-5,6,0,7],
                'x4':[0,0,1,0,1,0,1,0,7]
                })
df5
df5=df5.drop_duplicates(['x2'],keep='first')
df5
df5.duplicated()
df5
df5['group']=df5['x4'].map({0:'new business',1:'renewal business',7:'other'})
df5

0      5
1     10
2      5
3     15
4     20
5      5
6     10
7     25
8     30
9     35
10     5
dtype: int64

0     False
1     False
2      True
3     False
4     False
5      True
6      True
7     False
8     False
9     False
10     True
dtype: bool

0     5
1    10
3    15
4    20
7    25
8    30
9    35
dtype: int64

Unnamed: 0,x1,x2,x3,x4
0,0,0,0,0
1,1,0,-1,0
2,2,2,2,1
3,3,2,-3,0
4,4,4,4,1
5,5,4,-5,0
6,6,5,6,1
7,0,0,0,0
8,7,7,7,7


Unnamed: 0,x1,x2,x3,x4
0,0,0,0,0
2,2,2,2,1
4,4,4,4,1
6,6,5,6,1
8,7,7,7,7


0    False
2    False
4    False
6    False
8    False
dtype: bool

Unnamed: 0,x1,x2,x3,x4
0,0,0,0,0
2,2,2,2,1
4,4,4,4,1
6,6,5,6,1
8,7,7,7,7


Unnamed: 0,x1,x2,x3,x4,group
0,0,0,0,0,new business
2,2,2,2,1,renewal business
4,4,4,4,1,renewal business
6,6,5,6,1,renewal business
8,7,7,7,7,other


## Exercise 7 ##
- create a `DataFrame` object and assign a column to be the index
- use a `lambda` function to map the values in one of the columns
- replace values in one column with a a list
- replace values in another column with a dictionary in place
- rename the index labels using `.map()`
- rename columns using `.rename()` and a string function

In [75]:
df6=pd.DataFrame({
    'First Name':['Fred','Wilma','Pebbles','Barney','Betty','Bambam'],
    'Last Name':['Flintstone','Flintstone','Flintstone','Rubble','Rubble','Rubble'],
    'Age':[42,39,15,42,40,16],
    'Gender':['Male','Female','Female','Male','Female','Male'],
    'Employment':['Construction','Sales','NA','Sales','Clerical','NA'],
    'Account Balance':[20000,35000,550,25000,20000,400]
    },
    index=[1,2,3,4,5,6])

df6['hundreds']=df6['Account Balance'].map(lambda x: x // 100)
df6
df6.replace(['Male','Female'],['M','F'])
df6
df6.replace({'Flintstone':'F.','Rubble':'R.','NA':'Unknown','Male':'M','Female':'F'},inplace=True)
df6
df6.index=df6.index.map(lambda x: df6['Last Name'].loc[x] + ', ' + df6['First Name'].loc[x])
df6
df6.rename(columns=str.upper, inplace=True)
df6

Unnamed: 0,First Name,Last Name,Age,Gender,Employment,Account Balance,hundreds
1,Fred,Flintstone,42,Male,Construction,20000,200
2,Wilma,Flintstone,39,Female,Sales,35000,350
3,Pebbles,Flintstone,15,Female,,550,5
4,Barney,Rubble,42,Male,Sales,25000,250
5,Betty,Rubble,40,Female,Clerical,20000,200
6,Bambam,Rubble,16,Male,,400,4


Unnamed: 0,First Name,Last Name,Age,Gender,Employment,Account Balance,hundreds
1,Fred,Flintstone,42,M,Construction,20000,200
2,Wilma,Flintstone,39,F,Sales,35000,350
3,Pebbles,Flintstone,15,F,,550,5
4,Barney,Rubble,42,M,Sales,25000,250
5,Betty,Rubble,40,F,Clerical,20000,200
6,Bambam,Rubble,16,M,,400,4


Unnamed: 0,First Name,Last Name,Age,Gender,Employment,Account Balance,hundreds
1,Fred,Flintstone,42,Male,Construction,20000,200
2,Wilma,Flintstone,39,Female,Sales,35000,350
3,Pebbles,Flintstone,15,Female,,550,5
4,Barney,Rubble,42,Male,Sales,25000,250
5,Betty,Rubble,40,Female,Clerical,20000,200
6,Bambam,Rubble,16,Male,,400,4


Unnamed: 0,First Name,Last Name,Age,Gender,Employment,Account Balance,hundreds
1,Fred,F.,42,M,Construction,20000,200
2,Wilma,F.,39,F,Sales,35000,350
3,Pebbles,F.,15,F,Unknown,550,5
4,Barney,R.,42,M,Sales,25000,250
5,Betty,R.,40,F,Clerical,20000,200
6,Bambam,R.,16,M,Unknown,400,4


Unnamed: 0,First Name,Last Name,Age,Gender,Employment,Account Balance,hundreds
"F., Fred",Fred,F.,42,M,Construction,20000,200
"F., Wilma",Wilma,F.,39,F,Sales,35000,350
"F., Pebbles",Pebbles,F.,15,F,Unknown,550,5
"R., Barney",Barney,R.,42,M,Sales,25000,250
"R., Betty",Betty,R.,40,F,Clerical,20000,200
"R., Bambam",Bambam,R.,16,M,Unknown,400,4


Unnamed: 0,FIRST NAME,LAST NAME,AGE,GENDER,EMPLOYMENT,ACCOUNT BALANCE,HUNDREDS
"F., Fred",Fred,F.,42,M,Construction,20000,200
"F., Wilma",Wilma,F.,39,F,Sales,35000,350
"F., Pebbles",Pebbles,F.,15,F,Unknown,550,5
"R., Barney",Barney,R.,42,M,Sales,25000,250
"R., Betty",Betty,R.,40,F,Clerical,20000,200
"R., Bambam",Bambam,R.,16,M,Unknown,400,4


## Exercise 8 ##
- create a `DataFrame` object with several numeric columns
- rename the columns and/or index labels using `.rename()` and a dictionary
- bin the values in one column into bins with even  ranges of values. Count the number the frequencies of the bins.
- bin the values from another column into bins with even counts and give the bins names. Count the number the frequencies of the bins. Append the bin values as an additional column.
- print the list of bin ranges
- print the list of binned values
- bin the values of a third variable into pre-defined ranges of values that are closed on the left, define the decimal precision, assign labels to the binned values and append to the dataframe.

In [107]:
df7=pd.DataFrame(np.random.rand(120).reshape(20,6))
df7
df7.rename(columns={0:'a',1:'b',2:'c',3:'d',4:'e',5:'f'}, inplace=True)
df7

a_bins=pd.cut(df7['a'],5)
a_bins.value_counts()
df7

b_bins=pd.qcut(df7['b'],5,labels=[1,2,3,4,5])
b_bins.value_counts()
df7

df7['b_bins']=b_bins
df7

b_bins.values.codes
b_bins.values.categories

df7['c_bins']=pd.cut(df7['c'],[0,.25,.5,.75,1],labels=['a','b','c','d'],right=False)
df7

Unnamed: 0,0,1,2,3,4,5
0,0.16124,0.129368,0.528593,0.817791,0.090607,0.968235
1,0.537308,0.644589,0.89554,0.739155,0.866538,0.299461
2,0.459378,0.343149,0.471871,0.954848,0.155495,0.148495
3,0.436575,0.111946,0.485396,0.730607,0.673111,0.87845
4,0.180403,0.78701,0.76229,0.175766,0.28403,0.032554
5,0.235004,0.752427,0.627117,0.620755,0.822563,0.78454
6,0.855922,0.194736,0.516422,0.609232,0.195999,0.763321
7,0.130268,0.59142,0.631075,0.534123,0.907258,0.987663
8,0.504496,0.091581,0.421952,0.079402,0.232388,0.142004
9,0.127416,0.061998,0.706848,0.095037,0.72213,0.086955


Unnamed: 0,a,b,c,d,e,f
0,0.16124,0.129368,0.528593,0.817791,0.090607,0.968235
1,0.537308,0.644589,0.89554,0.739155,0.866538,0.299461
2,0.459378,0.343149,0.471871,0.954848,0.155495,0.148495
3,0.436575,0.111946,0.485396,0.730607,0.673111,0.87845
4,0.180403,0.78701,0.76229,0.175766,0.28403,0.032554
5,0.235004,0.752427,0.627117,0.620755,0.822563,0.78454
6,0.855922,0.194736,0.516422,0.609232,0.195999,0.763321
7,0.130268,0.59142,0.631075,0.534123,0.907258,0.987663
8,0.504496,0.091581,0.421952,0.079402,0.232388,0.142004
9,0.127416,0.061998,0.706848,0.095037,0.72213,0.086955


(0.127, 0.273]    7
(0.419, 0.565]    6
(0.71, 0.856]     4
(0.565, 0.71]     3
(0.273, 0.419]    0
Name: a, dtype: int64

Unnamed: 0,a,b,c,d,e,f
0,0.16124,0.129368,0.528593,0.817791,0.090607,0.968235
1,0.537308,0.644589,0.89554,0.739155,0.866538,0.299461
2,0.459378,0.343149,0.471871,0.954848,0.155495,0.148495
3,0.436575,0.111946,0.485396,0.730607,0.673111,0.87845
4,0.180403,0.78701,0.76229,0.175766,0.28403,0.032554
5,0.235004,0.752427,0.627117,0.620755,0.822563,0.78454
6,0.855922,0.194736,0.516422,0.609232,0.195999,0.763321
7,0.130268,0.59142,0.631075,0.534123,0.907258,0.987663
8,0.504496,0.091581,0.421952,0.079402,0.232388,0.142004
9,0.127416,0.061998,0.706848,0.095037,0.72213,0.086955


5    4
4    4
3    4
2    4
1    4
Name: b, dtype: int64

Unnamed: 0,a,b,c,d,e,f
0,0.16124,0.129368,0.528593,0.817791,0.090607,0.968235
1,0.537308,0.644589,0.89554,0.739155,0.866538,0.299461
2,0.459378,0.343149,0.471871,0.954848,0.155495,0.148495
3,0.436575,0.111946,0.485396,0.730607,0.673111,0.87845
4,0.180403,0.78701,0.76229,0.175766,0.28403,0.032554
5,0.235004,0.752427,0.627117,0.620755,0.822563,0.78454
6,0.855922,0.194736,0.516422,0.609232,0.195999,0.763321
7,0.130268,0.59142,0.631075,0.534123,0.907258,0.987663
8,0.504496,0.091581,0.421952,0.079402,0.232388,0.142004
9,0.127416,0.061998,0.706848,0.095037,0.72213,0.086955


Unnamed: 0,a,b,c,d,e,f,b_bins
0,0.16124,0.129368,0.528593,0.817791,0.090607,0.968235,2
1,0.537308,0.644589,0.89554,0.739155,0.866538,0.299461,4
2,0.459378,0.343149,0.471871,0.954848,0.155495,0.148495,3
3,0.436575,0.111946,0.485396,0.730607,0.673111,0.87845,2
4,0.180403,0.78701,0.76229,0.175766,0.28403,0.032554,5
5,0.235004,0.752427,0.627117,0.620755,0.822563,0.78454,5
6,0.855922,0.194736,0.516422,0.609232,0.195999,0.763321,2
7,0.130268,0.59142,0.631075,0.534123,0.907258,0.987663,3
8,0.504496,0.091581,0.421952,0.079402,0.232388,0.142004,1
9,0.127416,0.061998,0.706848,0.095037,0.72213,0.086955,1


array([1, 3, 2, 1, 4, 4, 1, 2, 0, 0, 3, 0, 4, 3, 3, 2, 0, 2, 4, 1],
      dtype=int8)

Int64Index([1, 2, 3, 4, 5], dtype='int64')

Unnamed: 0,a,b,c,d,e,f,b_bins,c_bins
0,0.16124,0.129368,0.528593,0.817791,0.090607,0.968235,2,c
1,0.537308,0.644589,0.89554,0.739155,0.866538,0.299461,4,d
2,0.459378,0.343149,0.471871,0.954848,0.155495,0.148495,3,b
3,0.436575,0.111946,0.485396,0.730607,0.673111,0.87845,2,b
4,0.180403,0.78701,0.76229,0.175766,0.28403,0.032554,5,d
5,0.235004,0.752427,0.627117,0.620755,0.822563,0.78454,5,c
6,0.855922,0.194736,0.516422,0.609232,0.195999,0.763321,2,c
7,0.130268,0.59142,0.631075,0.534123,0.907258,0.987663,3,c
8,0.504496,0.091581,0.421952,0.079402,0.232388,0.142004,1,b
9,0.127416,0.061998,0.706848,0.095037,0.72213,0.086955,1,c


## Exercise 9 ##
- create a `DataFrame`, include some extreme positive and negative values
- use `.describe()` to identify range of values for each column
- cap values in one column using `np.abs()` and `np.sign()`
- use `np.abs()` and `.any()` to identify rows that  still have any outliers
- drop rows with outliers in any other columns
- take a random sample of rows with replacement

In [152]:
df8=pd.DataFrame(np.random.rand(120).reshape(20,6))
for r,c in list(zip([np.random.randint(20) for n in range(12)],
                    [np.random.randint(6) for n in range(12)])):
    df8.iloc[r,c]=2*(np.random.randint(9)-4)
df8

df8.describe()
col_0=df8[0]

col_0

df8[np.abs(col_0)> 1][0]=np.sign(df8[0])
df8

df8[(np.abs(df8)>1).any(1)]

df8=df8[~((np.abs(df8)>1).any(1))]
df8

df8.sample(n=50,replace=True)

Unnamed: 0,0,1,2,3,4,5
0,0.036106,2.0,0.723104,0.434613,0.853754,0.231158
1,0.656813,0.623728,0.018736,0.241709,-4.0,0.245952
2,0.157287,0.953883,0.12961,0.575121,0.476618,0.026387
3,0.614224,0.536563,0.825329,0.57439,0.08867,0.760072
4,0.6743,0.591059,0.124859,0.906694,0.316095,0.364408
5,0.002099,0.929765,0.520124,0.454142,0.052164,0.624744
6,0.591637,0.693124,0.601278,0.723066,0.847344,0.088823
7,0.569141,0.399574,0.748766,0.110729,0.510841,0.014307
8,0.01749,0.579605,0.681096,0.106189,0.691159,0.659467
9,0.026871,0.946067,0.005146,0.393782,0.561759,0.715087


Unnamed: 0,0,1,2,3,4,5
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.402381,0.62247,1.005569,0.665992,-0.048842,0.114075
std,0.320766,0.439987,1.848646,0.821037,1.742793,1.465286
min,0.0,0.0,0.005146,0.104952,-6.0,-6.0
25%,0.033798,0.342893,0.158088,0.355764,0.183145,0.195575
50%,0.51297,0.605873,0.560701,0.533367,0.498132,0.37919
75%,0.677229,0.825487,0.757636,0.723691,0.67193,0.664873
max,0.864982,2.0,8.0,4.0,0.869883,0.821819


0     0.036106
1     0.656813
2     0.157287
3     0.614224
4     0.674300
5     0.002099
6     0.591637
7     0.569141
8     0.017490
9     0.026871
10    0.686016
11    0.178080
12    0.000000
13    0.456800
14    0.259848
15    0.000000
16    0.811483
17    0.864982
18    0.735166
19    0.709272
Name: 0, dtype: float64

Unnamed: 0,0,1,2,3,4,5
0,0.036106,2.0,0.723104,0.434613,0.853754,0.231158
1,0.656813,0.623728,0.018736,0.241709,-4.0,0.245952
2,0.157287,0.953883,0.12961,0.575121,0.476618,0.026387
3,0.614224,0.536563,0.825329,0.57439,0.08867,0.760072
4,0.6743,0.591059,0.124859,0.906694,0.316095,0.364408
5,0.002099,0.929765,0.520124,0.454142,0.052164,0.624744
6,0.591637,0.693124,0.601278,0.723066,0.847344,0.088823
7,0.569141,0.399574,0.748766,0.110729,0.510841,0.014307
8,0.01749,0.579605,0.681096,0.106189,0.691159,0.659467
9,0.026871,0.946067,0.005146,0.393782,0.561759,0.715087


Unnamed: 0,0,1,2,3,4,5
0,0.036106,2.0,0.723104,0.434613,0.853754,0.231158
1,0.656813,0.623728,0.018736,0.241709,-4.0,0.245952
10,0.686016,0.006248,0.830167,4.0,0.485424,0.821819
16,0.811483,0.223298,4.0,0.725567,0.66552,0.23699
17,0.864982,0.0,0.164736,0.735341,-6.0,-6.0
19,0.709272,0.790728,8.0,0.811483,0.364994,0.645572


Unnamed: 0,0,1,2,3,4,5
2,0.157287,0.953883,0.12961,0.575121,0.476618,0.026387
3,0.614224,0.536563,0.825329,0.57439,0.08867,0.760072
4,0.6743,0.591059,0.124859,0.906694,0.316095,0.364408
5,0.002099,0.929765,0.520124,0.454142,0.052164,0.624744
6,0.591637,0.693124,0.601278,0.723066,0.847344,0.088823
7,0.569141,0.399574,0.748766,0.110729,0.510841,0.014307
8,0.01749,0.579605,0.681096,0.106189,0.691159,0.659467
9,0.026871,0.946067,0.005146,0.393782,0.561759,0.715087
11,0.17808,0.960101,0.645168,0.674295,0.604644,0.681091
12,0.0,0.346639,0.455534,0.492344,0.21428,0.009883


Unnamed: 0,0,1,2,3,4,5
7,0.569141,0.399574,0.748766,0.110729,0.510841,0.014307
14,0.259848,0.209983,0.784244,0.446416,0.089738,0.75564
2,0.157287,0.953883,0.12961,0.575121,0.476618,0.026387
9,0.026871,0.946067,0.005146,0.393782,0.561759,0.715087
5,0.002099,0.929765,0.520124,0.454142,0.052164,0.624744
11,0.17808,0.960101,0.645168,0.674295,0.604644,0.681091
4,0.6743,0.591059,0.124859,0.906694,0.316095,0.364408
14,0.259848,0.209983,0.784244,0.446416,0.089738,0.75564
8,0.01749,0.579605,0.681096,0.106189,0.691159,0.659467
7,0.569141,0.399574,0.748766,0.110729,0.510841,0.014307


## Exercise 10 ##
- create a `Series` object
- create a new series consisting of the binned values of the series
- create another series by taking a random subset of binned values using `np.random.permutation()`
- add columns containing indicator variables for all of the bin labels, specify a prefix

In [167]:
s5=pd.Series(np.random.rand(50)) * 100
s5
s6=pd.qcut(s5,10,labels=['a','b','c','d','e','f','g','h','i','j'])
s6
s7=s6.take(np.random.permutation(10))
s7
df9=pd.DataFrame(s7).join(pd.get_dummies(s7,prefix='b_'))
df9

0     78.231495
1     40.222946
2     98.059621
3     84.321113
4     87.700845
5     19.401902
6     97.328099
7     88.434203
8     73.358403
9     92.557213
10    69.656544
11    32.296437
12    84.787804
13    17.714413
14    26.326527
15    74.720955
16    70.526220
17    39.557883
18    38.796368
19    45.395153
20    50.950457
21    90.155014
22    94.962050
23    64.143512
24    71.321082
25     9.278708
26    53.387982
27     3.938554
28    68.901278
29     7.627533
30    55.555251
31    98.319289
32    88.523488
33    76.947970
34    56.954169
35    80.950835
36    24.212343
37    87.930624
38     6.828811
39    48.425077
40    62.909212
41    11.913968
42    44.937357
43    28.320496
44    11.964418
45     3.491488
46    86.979840
47     7.526268
48    50.540547
49    71.602285
dtype: float64

0     h
1     d
2     j
3     h
4     i
5     b
6     j
7     i
8     g
9     j
10    f
11    c
12    h
13    b
14    c
15    g
16    f
17    d
18    c
19    d
20    e
21    i
22    j
23    f
24    g
25    b
26    e
27    a
28    f
29    a
30    e
31    j
32    i
33    g
34    e
35    h
36    c
37    i
38    a
39    d
40    f
41    b
42    d
43    c
44    b
45    a
46    h
47    a
48    e
49    g
dtype: category
Categories (10, object): [a < b < c < d ... g < h < i < j]

9    j
5    b
3    h
6    j
0    h
4    i
1    d
2    j
7    i
8    g
dtype: category
Categories (10, object): [a < b < c < d ... g < h < i < j]

Unnamed: 0,0,b__a,b__b,b__c,b__d,b__e,b__f,b__g,b__h,b__i,b__j
9,j,0,0,0,0,0,0,0,0,0,1
5,b,0,1,0,0,0,0,0,0,0,0
3,h,0,0,0,0,0,0,0,1,0,0
6,j,0,0,0,0,0,0,0,0,0,1
0,h,0,0,0,0,0,0,0,1,0,0
4,i,0,0,0,0,0,0,0,0,1,0
1,d,0,0,0,1,0,0,0,0,0,0
2,j,0,0,0,0,0,0,0,0,0,1
7,i,0,0,0,0,0,0,0,0,1,0
8,g,0,0,0,0,0,0,1,0,0,0


## Exercise 11 ##
- create a `DataFrame` with several columns of sting values and some missing values
- use vectorized string functions to convert the case of one of the columns. Use another one to concatenate two columns
- create a boolean array indicating the presence of a substing in one of the columns
- create a regex pattern and use it to extract a substring from one of the columns into a new column

In [186]:
df10=pd.DataFrame({
    'FirstName':['Tim','Carol','Frank','Angela','Susan'],
    'LastName':['McKinney','Liu','West','LaTouret','Anderson'],
    'UserName':['tmc1965','cliu5','frankwest','ALT2000','trekkie4ever'],
    'password':['OU812','password','1234','HdU4j%s90v',''],
    'UserNumber':[24,82,np.nan,173,np.nan]
})
df10
df10['FirstName']=df10['FirstName'].str.upper()
df10['LastName']=df10['LastName'].str.upper()
df10

df10['password'].str.lower().str.contains('password')

import re

pattern=r'([0-9])'
df10['digits']=df10['UserName'].str.extract(pattern)
df10

Unnamed: 0,FirstName,LastName,UserName,password,UserNumber
0,Tim,McKinney,tmc1965,OU812,24.0
1,Carol,Liu,cliu5,password,82.0
2,Frank,West,frankwest,1234,
3,Angela,LaTouret,ALT2000,HdU4j%s90v,173.0
4,Susan,Anderson,trekkie4ever,,


Unnamed: 0,FirstName,LastName,UserName,password,UserNumber
0,TIM,MCKINNEY,tmc1965,OU812,24.0
1,CAROL,LIU,cliu5,password,82.0
2,FRANK,WEST,frankwest,1234,
3,ANGELA,LATOURET,ALT2000,HdU4j%s90v,173.0
4,SUSAN,ANDERSON,trekkie4ever,,


0    False
1     True
2    False
3    False
4    False
Name: password, dtype: bool

Unnamed: 0,FirstName,LastName,UserName,password,UserNumber,digits
0,TIM,MCKINNEY,tmc1965,OU812,24.0,1.0
1,CAROL,LIU,cliu5,password,82.0,5.0
2,FRANK,WEST,frankwest,1234,,
3,ANGELA,LATOURET,ALT2000,HdU4j%s90v,173.0,2.0
4,SUSAN,ANDERSON,trekkie4ever,,,4.0
