# Lab | Revisiting Machine Learning Case Study

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('learningSet.csv')

In [4]:
data.columns = [column.lower().replace(' ', '_') for column in data.columns]

In [19]:
categorical = data.select_dtypes(exclude=[np.number])

In [20]:
numeric = data.select_dtypes(include=[np.number])

In this lab, you will use learningSet.csv file which you already have cloned in today's activities.

### 1. Check for null values in all the columns

In [5]:
data.isna().sum()

odatedw       0
osource       0
tcode         0
state         0
zip           0
           ... 
mdmaud_r      0
mdmaud_f      0
mdmaud_a      0
cluster2    132
geocode2    132
Length: 481, dtype: int64

### 2. Exclude the following variables by looking at the definitions. Create a new empty list called drop_list. We will append this list and then drop all the columns in this list later:

- `OSOURCE` - symbol definitions not provided, too many categories
- `ZIP CODE` - we are including state already


In [6]:
drop_list = []

In [7]:
drop_list = ['osource', 'zip_code']

### 3. Identify columns that over 85% missing values

In [9]:
nulls_percent_data = (data.isna().sum().sort_values(ascending=False)*100/len(data)).reset_index

In [11]:
(data.isna().sum().sort_values(ascending=False)*100/len(data))[:30]

rdate_5     99.990567
ramnt_5     99.990567
rdate_3     99.746363
ramnt_3     99.746363
rdate_4     99.705488
ramnt_4     99.705488
ramnt_6     99.186685
rdate_6     99.186685
ramnt_15    92.388798
rdate_15    92.388798
rdate_23    91.763091
ramnt_23    91.763091
rdate_20    91.732696
ramnt_20    91.732696
ramnt_7     90.677273
rdate_7     90.677273
ramnt_17    90.146942
rdate_17    90.146942
rdate_21    90.029556
ramnt_21    90.029556
ramnt_10    89.035970
rdate_10    89.035970
rdate_13    87.160944
ramnt_13    87.160944
numchld     87.018404
rdate_11    84.551209
ramnt_11    84.551209
rdate_19    83.359535
ramnt_19    83.359535
ramnt_9     82.461326
dtype: float64

In [12]:
nulls = pd.DataFrame(data.isna().sum()*100/len(data), columns=['percentage'])
nulls.sort_values('percentage', ascending = False).head(80)

Unnamed: 0,percentage
rdate_5,99.990567
ramnt_5,99.990567
rdate_3,99.746363
ramnt_3,99.746363
rdate_4,99.705488
...,...
adate_16,21.343227
adate_14,19.774242
adate_9,11.785729
adate_11,10.923154


### 4. Remove those columns from the dataframe

In [15]:
cols_to_keep = list(nulls[nulls['percentage'] < 15].index)
cols_to_keep

['odatedw',
 'osource',
 'tcode',
 'state',
 'zip',
 'mailcode',
 'pvastate',
 'dob',
 'noexch',
 'recinhse',
 'recp3',
 'recpgvg',
 'recsweep',
 'mdmaud',
 'domain',
 'cluster',
 'ageflag',
 'homeownr',
 'child03',
 'child07',
 'child12',
 'child18',
 'gender',
 'hit',
 'datasrce',
 'malemili',
 'malevet',
 'vietvets',
 'wwiivets',
 'localgov',
 'stategov',
 'fedgov',
 'solp3',
 'solih',
 'major',
 'geocode',
 'collect1',
 'veterans',
 'bible',
 'catlg',
 'homee',
 'pets',
 'cdplay',
 'stereo',
 'pcowners',
 'photo',
 'crafts',
 'fisher',
 'gardenin',
 'boats',
 'walker',
 'kidstuff',
 'cards',
 'plates',
 'lifesrc',
 'pepstrfl',
 'pop901',
 'pop902',
 'pop903',
 'pop90c1',
 'pop90c2',
 'pop90c3',
 'pop90c4',
 'pop90c5',
 'eth1',
 'eth2',
 'eth3',
 'eth4',
 'eth5',
 'eth6',
 'eth7',
 'eth8',
 'eth9',
 'eth10',
 'eth11',
 'eth12',
 'eth13',
 'eth14',
 'eth15',
 'eth16',
 'age901',
 'age902',
 'age903',
 'age904',
 'age905',
 'age906',
 'age907',
 'chil1',
 'chil2',
 'chil3',
 'agec1',


In [16]:
drop_cols = []

for col in data:
    if len(data[col].unique()) > 85:
        display(data[col].value_counts())
        drop_cols.append(col)
        
len(drop_cols)

MBC    4539
SYN    3563
AML    3430
BHG    3324
IMP    2986
       ... 
OPR       1
AHA       1
BEL       1
MMK       1
INC       1
Name: osource, Length: 896, dtype: int64

85351     61
92653     59
85710     54
95608     50
92037     45
          ..
59732      1
73533-     1
91367-     1
64118-     1
92691-     1
Name: zip, Length: 19938, dtype: int64

0       23661
4801     1479
5001     1326
3001     1288
2801     1225
        ...  
7504        1
108         1
7312        1
7           1
7404        1
Name: dob, Length: 947, dtype: int64

50.0    1930
76.0    1885
72.0    1813
68.0    1809
74.0    1801
        ... 
15.0       1
6.0        1
10.0       1
9.0        1
8.0        1
Name: age, Length: 96, dtype: int64

0     73998
1      9292
2      4324
3      2063
4      1185
      ...  
91        1
57        1
58        1
66        1
99        1
Name: malemili, Length: 95, dtype: int64

31    4305
30    4295
32    4138
29    4119
33    4075
      ... 
81       3
83       2
80       1
90       1
98       1
Name: malevet, Length: 89, dtype: int64

0     5502
27    3075
28    3046
31    2956
29    2908
      ... 
90       5
91       4
88       3
92       2
93       2
Name: vietvets, Length: 95, dtype: int64

0     5264
32    2550
33    2477
28    2398
35    2334
      ... 
96      25
92      13
97       7
95       7
98       6
Name: wwiivets, Length: 100, dtype: int64

0        799
1086      78
923       77
1094      75
834       73
        ... 
4142       1
6191       1
12394      1
49053      1
7062       1
Name: pop901, Length: 9906, dtype: int64

0        842
296      225
265      217
261      217
281      215
        ... 
22237      1
3431       1
3495       1
5604       1
7469       1
Name: pop902, Length: 4786, dtype: int64

0        817
340      174
485      164
381      163
380      163
        ... 
5697       1
5927       1
12128      1
18285      1
4453       1
Name: pop903, Length: 5698, dtype: int64

99    50548
0     35415
98      510
97      453
95      331
      ...  
44       33
42       32
37       32
25       32
39       30
Name: pop90c1, Length: 100, dtype: int64

0     76382
99     6029
1       637
98      404
96      342
      ...  
18       65
29       63
27       62
24       56
20       53
Name: pop90c2, Length: 100, dtype: int64

0     56461
99    17284
1      1073
2       918
3       792
      ...  
68      100
75       99
82       97
73       94
71       88
Name: pop90c3, Length: 100, dtype: int64

99    15689
98     8662
97     7133
96     6092
95     4961
      ...  
14       71
20       70
19       69
17       65
18       62
Name: eth1, Length: 100, dtype: int64

0     31240
1     18193
2      9336
3      5785
4      4068
      ...  
76       44
83       42
77       38
85       37
88       37
Name: eth2, Length: 100, dtype: int64

0     37177
1     23722
2     10555
3      5691
4      3725
      ...  
85        3
91        3
99        2
93        1
95        1
Name: eth4, Length: 96, dtype: int64

1     22602
0     15803
2     11884
3      6999
4      4690
      ...  
70       34
82       32
83       23
97       22
99       13
Name: eth5, Length: 100, dtype: int64

0     34032
1     20591
2      8336
3      5230
4      3765
      ...  
94       13
93       12
95        8
97        1
96        1
Name: eth13, Length: 98, dtype: int64

11    10457
12    10058
10     9991
13     9025
9      8247
      ...  
90        2
94        2
99        1
68        1
96        1
Name: agec1, Length: 98, dtype: int64

15    12207
14    11813
16    11264
13     9905
17     8668
      ...  
91        1
92        1
95        1
65        1
98        1
Name: chilc5, Length: 97, dtype: int64

21    3390
23    3300
24    3298
22    3287
26    3262
      ... 
78      18
89      18
99      12
96       5
98       2
Name: hhage1, Length: 99, dtype: int64

19    3349
21    3305
20    3264
18    3256
22    3251
      ... 
87      14
99      11
95       7
97       2
96       2
Name: hhage3, Length: 99, dtype: int64

21    3997
17    3976
19    3950
18    3947
22    3918
      ... 
91       4
87       2
89       2
92       2
96       1
Name: hhn1, Length: 98, dtype: int64

45    3061
46    3049
47    2983
43    2976
44    2964
      ... 
90       3
92       3
91       2
93       2
94       1
Name: hhn3, Length: 96, dtype: int64

25    4130
26    3994
27    3980
23    3964
28    3885
      ... 
86       2
93       1
82       1
81       1
79       1
Name: hhn4, Length: 86, dtype: int64

66    4254
65    4208
67    4041
64    3984
63    3954
      ... 
4        5
1        5
91       3
93       2
95       2
Name: marr1, Length: 95, dtype: int64

19    7373
18    7348
20    7253
21    6694
17    5901
      ... 
87       3
84       3
90       2
85       2
86       1
Name: marr4, Length: 100, dtype: int64

179    1181
177    1162
180    1149
174    1140
178    1122
       ... 
464       1
423       1
403       1
469       1
447       1
Name: hhp1, Length: 393, dtype: int64

267    1079
260    1057
259    1056
264    1045
263    1040
       ... 
106       1
462       1
435       1
464       1
494       1
Name: hhp2, Length: 377, dtype: int64

99    9878
98    2604
77    1886
97    1883
75    1826
      ... 
5      207
7      196
9      196
10     188
6      180
Name: dw1, Length: 100, dtype: int64

99    6578
98    2531
71    1830
72    1772
74    1765
      ... 
7      273
5      270
10     267
6      261
13     253
Name: dw2, Length: 100, dtype: int64

0     16326
1      8331
2      4814
3      3788
4      3223
      ...  
89      160
91      155
83      152
90      136
93      132
Name: dw4, Length: 100, dtype: int64

0     24860
1      7401
2      4555
3      3608
4      3255
      ...  
94      124
79      122
92      119
90      115
89      104
Name: dw5, Length: 100, dtype: int64

0     34134
1      6766
2      4731
3      3722
4      3022
      ...  
84       90
91       86
88       80
99       72
92       67
Name: dw6, Length: 100, dtype: int64

0     66889
1      8319
2      5281
3      3030
4      2055
      ...  
89        1
74        1
91        1
94        1
96        1
Name: dw7, Length: 97, dtype: int64

0     74846
1      5827
2      4084
3      2289
4      1430
      ...  
67        1
81        1
82        1
86        1
94        1
Name: dw8, Length: 86, dtype: int64

0     83535
1      5681
2      1949
3       992
4       563
      ...  
80        1
91        1
72        1
94        1
75        1
Name: dw9, Length: 91, dtype: int64

0       1117
675      262
550      211
875      187
425      180
        ... 
5871       1
4012       1
110        1
4425       1
2047       1
Name: hv1, Length: 4434, dtype: int64

0       1117
625      146
547      138
642      136
571      135
        ... 
4456       1
3128       1
5272       1
5174       1
178        1
Name: hv2, Length: 4623, dtype: int64

84    2670
82    2638
85    2544
81    2530
83    2491
      ... 
3      124
5      110
4      110
2       94
6       94
Name: hu1, Length: 100, dtype: int64

16    2669
18    2652
15    2542
19    2531
14    2484
      ... 
91     124
95     110
96     109
98      94
94      94
Name: hu2, Length: 100, dtype: int64

97    10020
96     9729
95     8668
98     8416
94     7464
      ...  
16        2
10        2
13        1
5         1
12        1
Name: hu3, Length: 93, dtype: int64

3     10064
4      9718
5      8675
2      8375
6      7461
      ...  
84        2
90        2
88        1
95        1
87        1
Name: hu4, Length: 94, dtype: int64

0     29653
3      4975
2      4574
4      4439
5      3935
      ...  
99       71
96       53
95       50
97       33
98       28
Name: hu5, Length: 100, dtype: int64

38    3619
36    3424
35    3419
39    3403
37    3381
      ... 
88       2
89       2
93       2
98       1
90       1
Name: hhd1, Length: 95, dtype: int64

77    3814
78    3610
76    3605
79    3598
75    3533
      ... 
6        6
5        3
2        3
3        1
1        1
Name: hhd2, Length: 100, dtype: int64

66    2868
67    2832
65    2824
64    2820
63    2779
      ... 
96       6
3        5
2        5
97       3
1        3
Name: hhd3, Length: 99, dtype: int64

28    3653
27    3609
29    3478
25    3424
26    3413
      ... 
85       2
83       2
87       1
93       1
86       1
Name: hhd4, Length: 90, dtype: int64

89    5809
90    5732
88    5627
91    5535
87    5250
      ... 
5        7
13       7
7        6
2        6
1        5
Name: hhd5, Length: 100, dtype: int64

11    5809
10    5732
12    5627
9     5535
13    5250
      ... 
97       8
95       7
87       7
93       6
98       6
Name: hhd6, Length: 100, dtype: int64

59    4448
60    4387
58    4343
56    4172
55    4165
      ... 
87       7
89       6
91       3
93       2
92       1
Name: ethc2, Length: 95, dtype: int64

12    4383
13    4336
15    4308
14    4225
16    4145
      ... 
77      13
85      12
97       5
99       4
98       1
Name: ethc3, Length: 100, dtype: int64

0     40051
1     13769
2      5680
3      3391
4      2371
      ...  
58      140
55      134
62      123
64      120
59      115
Name: hvp1, Length: 100, dtype: int64

0     24097
1     13011
2      6773
3      4216
4      3075
      ...  
72      187
61      187
58      171
73      170
65      168
Name: hvp2, Length: 100, dtype: int64

0     8692
1     6520
99    6164
2     4937
3     3976
      ... 
77     303
62     302
70     297
72     297
66     282
Name: hvp3, Length: 100, dtype: int64

99    12148
98     3792
0      2955
97     2326
1      1811
      ...  
63      440
57      439
54      424
69      413
65      412
Name: hvp4, Length: 100, dtype: int64

99    26659
98     5650
97     3321
96     2679
95     2106
      ...  
6       212
5       208
3       191
2       167
1       106
Name: hvp5, Length: 100, dtype: int64

0     61118
1     10168
2      3653
3      2230
4      1565
      ...  
46       70
62       70
86       67
68       65
63       49
Name: hvp6, Length: 100, dtype: int64

1     18487
0     17095
2     13924
3      9691
4      6732
      ...  
80        1
93        1
79        1
96        1
97        1
Name: hur1, Length: 96, dtype: int64

41    2114
40    2091
39    2068
42    2065
36    2060
      ... 
97     286
1      278
96     276
98     220
99     163
Name: hur2, Length: 100, dtype: int64

0     17899
1      9897
2      6320
3      5162
4      4634
      ...  
99        8
92        7
90        4
93        3
95        2
Name: hupa1, Length: 96, dtype: int64

0     44554
1      6831
2      4362
3      3324
4      2617
      ...  
98       46
90       39
89       38
88       36
92       22
Name: hupa2, Length: 100, dtype: int64

0     48359
1      5028
2      2402
3      1818
4      1703
      ...  
94       16
96       16
88       15
92       15
95       12
Name: hupa3, Length: 100, dtype: int64

8     6111
9     5902
7     5902
10    5827
6     5641
      ... 
75       1
65       1
91       1
96       1
78       1
Name: hupa4, Length: 97, dtype: int64

0     36538
1      6863
2      4742
3      3918
4      3005
      ...  
93       39
90       37
99       36
96       32
98       28
Name: hupa6, Length: 100, dtype: int64

0     16042
1      7380
2      4655
3      3296
4      2615
      ...  
66      394
51      392
49      382
68      379
62      368
Name: rp1, Length: 100, dtype: int64

0     6483
1     4087
2     3250
3     2659
4     2238
      ... 
48     483
51     467
49     461
52     450
66     442
Name: rp2, Length: 100, dtype: int64

99    3826
97    3287
98    3088
96    2974
95    2647
      ... 
37     527
41     526
34     497
49     490
46     469
Name: rp3, Length: 100, dtype: int64

99    7684
98    5918
97    5305
96    4593
95    3977
      ... 
6       82
4       46
3       37
2       17
1        5
Name: rp4, Length: 100, dtype: int64

0.0       21333
4480.0     4606
1600.0     4059
2160.0     2586
520.0      1685
          ...  
2975.0        1
9140.0        1
9280.0        1
9000.0        1
1320.0        1
Name: msa, Length: 298, dtype: int64

13.0     7296
51.0     4622
65.0     3765
57.0     2836
105.0    2617
         ... 
147.0       1
161.0       1
651.0       1
103.0       1
601.0       1
Name: adi, Length: 204, dtype: int64

803.0    7296
602.0    4632
807.0    3765
505.0    2839
819.0    2588
         ... 
569.0       1
516.0       1
552.0       1
554.0       1
584.0       1
Name: dma, Length: 206, dtype: int64

0       878
263     426
313     411
213     409
258     402
       ... 
1189      1
1236      1
1300      1
1210      1
1330      1
Name: ic1, Length: 1134, dtype: int64

0       928
288     467
313     390
315     376
263     376
       ... 
1103      1
975       1
1126      1
1422      1
1407      1
Name: ic2, Length: 1213, dtype: int64

0       878
271     393
278     378
280     377
279     373
       ... 
1299      1
1008      1
49        1
83        1
1173      1
Name: ic3, Length: 1091, dtype: int64

0       928
344     370
320     350
325     347
346     346
       ... 
1258      1
1232      1
81        1
1206      1
1105      1
Name: ic4, Length: 1156, dtype: int64

0        860
22875     41
13103     27
10931     26
12577     26
        ... 
29884      1
4683       1
6730       1
21688      1
21188      1
Name: ic5, Length: 21514, dtype: int64

11    2887
10    2848
13    2789
6     2739
8     2730
      ... 
89       3
93       3
92       2
95       2
96       1
Name: ic6, Length: 99, dtype: int64

0     5579
4     4471
3     4345
5     4324
6     4195
      ... 
95       1
86       1
89       1
91       1
96       1
Name: ic15, Length: 96, dtype: int64

29    2979
24    2959
26    2940
28    2913
23    2861
      ... 
96       9
95       5
94       4
97       3
98       2
Name: hhas1, Length: 100, dtype: int64

44    2243
43    2195
42    2189
49    2128
46    2127
      ... 
91      17
93      16
95       9
97       6
96       1
Name: hhas3, Length: 99, dtype: int64

3     6631
4     6537
5     6276
2     6222
6     5281
      ... 
81       1
83       1
85       1
88       1
89       1
Name: hhas4, Length: 93, dtype: int64

41    2654
44    2637
43    2633
46    2628
40    2578
      ... 
7        7
5        4
3        2
4        1
1        1
Name: mc1, Length: 99, dtype: int64

59    2662
56    2637
57    2623
54    2619
52    2577
      ... 
94      10
93       7
95       4
97       2
96       1
Name: mc2, Length: 99, dtype: int64

5     6293
6     5913
4     5708
7     5661
8     5428
      ... 
94       3
91       2
92       1
93       1
80       1
Name: mc3, Length: 98, dtype: int64

81    4580
82    4418
80    4392
83    4280
79    4248
      ... 
6        4
3        3
2        2
4        2
13       2
Name: tpe1, Length: 99, dtype: int64

0     7250
3     5090
2     5073
4     4487
5     4274
      ... 
94       9
95       7
96       7
91       6
97       2
Name: pec2, Length: 99, dtype: int64

72    2695
71    2668
70    2665
74    2640
69    2629
      ... 
4        7
3        7
98       6
2        5
1        1
Name: tpe13, Length: 100, dtype: int64

66    3615
68    3607
67    3601
65    3569
70    3448
      ... 
2        8
98       7
97       7
3        5
1        1
Name: lfc1, Length: 100, dtype: int64

78    3724
76    3687
80    3670
79    3655
77    3621
      ... 
5       10
4        5
2        5
3        2
1        2
Name: lfc2, Length: 100, dtype: int64

58    3387
57    3304
59    3302
56    3187
55    3147
      ... 
95       8
94       6
97       5
96       5
98       1
Name: lfc3, Length: 99, dtype: int64

75    3464
76    3436
74    3313
77    3309
73    3292
      ... 
5       20
4        5
3        2
2        2
1        2
Name: lfc4, Length: 100, dtype: int64

55    3190
53    3171
54    3145
56    3099
57    3057
      ... 
92      11
96       5
94       4
95       3
97       1
Name: lfc5, Length: 98, dtype: int64

68    3152
69    3094
65    3076
67    2972
70    2955
      ... 
7        5
6        4
5        2
4        1
3        1
Name: lfc6, Length: 98, dtype: int64

55    2704
52    2659
50    2638
53    2600
54    2535
      ... 
2       20
96      17
97       8
1        8
98       1
Name: lfc7, Length: 100, dtype: int64

99    28256
0      9313
81     1355
76     1332
88     1312
      ...  
7        21
5         8
6         7
4         4
3         2
Name: lfc8, Length: 98, dtype: int64

99    46942
0     30500
89      597
92      551
93      548
      ...  
9         5
5         4
10        3
12        1
4         1
Name: lfc9, Length: 96, dtype: int64

0     28122
2      6627
3      6510
4      6019
5      5195
      ...  
83        1
72        1
86        1
89        1
91        1
Name: lfc10, Length: 91, dtype: int64

75    4134
74    4024
73    4022
72    3979
76    3951
      ... 
13       1
12       1
11       1
9        1
17       1
Name: oedc5, Length: 91, dtype: int64

23    6389
22    6349
24    6289
25    6046
21    5815
      ... 
88       2
89       2
98       1
92       1
82       1
Name: sec2, Length: 97, dtype: int64

5     12026
4     11946
6     10967
3      9960
7      9816
      ...  
85        1
91        1
95        1
96        1
87        1
Name: sec5, Length: 97, dtype: int64

0     73368
1      9538
2      4457
3      2140
4      1169
      ...  
68        1
71        1
72        1
57        1
63        1
Name: afc2, Length: 96, dtype: int64

30    4513
31    4463
32    4305
33    4227
29    4176
      ... 
81       5
84       4
83       3
90       1
80       1
Name: afc5, Length: 88, dtype: int64

0     3150
27    3103
28    3065
32    2994
29    2978
      ... 
87       5
89       4
93       2
92       1
91       1
Name: vc1, Length: 95, dtype: int64

0     6030
17    5101
18    5011
16    4995
19    4855
      ... 
86       1
79       1
82       1
84       1
85       1
Name: vc2, Length: 89, dtype: int64

0     2912
32    2572
33    2467
28    2452
36    2442
      ... 
96      27
92       8
95       7
98       6
97       5
Name: vc3, Length: 100, dtype: int64

0     16207
8      5094
7      4786
9      4663
6      4663
      ...  
89        2
94        2
93        2
91        1
96        1
Name: vc4, Length: 97, dtype: int64

0     16115
1     15847
2     10241
3      8171
4      6320
      ...  
83        3
81        2
85        2
90        2
99        1
Name: pobc1, Length: 88, dtype: int64

80    1769
79    1728
81    1712
82    1652
76    1629
      ... 
4      112
96     105
97      51
99      31
98      23
Name: pobc2, Length: 100, dtype: int64

98    9921
97    9830
99    8794
96    7976
95    6776
      ... 
6       27
4       24
3       19
2       13
1        5
Name: lsc1, Length: 100, dtype: int64

1     23054
0     19000
2     13228
3      7668
4      5246
      ...  
96       18
98       14
95       13
97       11
99        9
Name: lsc2, Length: 100, dtype: int64

99    21096
98     9549
97     8461
96     7763
95     6476
      ...  
20        3
21        2
11        2
12        1
9         1
Name: voc1, Length: 93, dtype: int64

62    2192
64    2178
61    2164
67    2159
65    2147
      ... 
3       59
98      46
4       44
2       30
1       30
Name: voc2, Length: 100, dtype: int64

0     26755
2      4281
3      4058
4      3634
5      3350
      ...  
81       36
83       30
90       29
94       27
96       25
Name: hc4, Length: 100, dtype: int64

0     18117
3      2556
2      2527
4      2397
5      2181
      ...  
86      149
84      149
87      146
91      139
97      121
Name: hc5, Length: 100, dtype: int64

0     8767
99    4613
3     1389
5     1284
4     1273
      ... 
17     671
90     655
85     636
78     634
1      392
Name: hc6, Length: 100, dtype: int64

99    11290
0      4089
98     2415
97     2067
96     1875
      ...  
24      472
25      467
21      464
2       453
1        62
Name: hc7, Length: 100, dtype: int64

0     10198
99     2880
2      2410
1      2359
3      2073
      ...  
76      471
75      468
66      468
79      465
98      457
Name: hc8, Length: 100, dtype: int64

0     69834
1      3926
2      2868
3      2300
4      1828
      ...  
77        2
84        2
78        2
90        1
80        1
Name: hc9, Length: 87, dtype: int64

0     9588
1     3464
99    2653
2     2224
94    1872
      ... 
24     482
21     480
35     467
18     443
17     416
Name: hc11, Length: 100, dtype: int64

0     35875
1     12192
2      9021
3      5248
4      3635
      ...  
81        2
91        1
88        1
89        1
87        1
Name: hc12, Length: 92, dtype: int64

0     5739
3     3077
2     3024
5     2898
4     2863
      ... 
81     311
86     308
76     299
78     289
83     285
Name: hc13, Length: 100, dtype: int64

0     52292
1      8143
2      5678
3      3861
4      2973
      ...  
82        1
83        1
85        1
92        1
99        1
Name: hc14, Length: 98, dtype: int64

0     42397
1     10201
2      7852
3      5273
4      3602
      ...  
94        1
91        1
84        1
99        1
87        1
Name: hc16, Length: 92, dtype: int64

99    49918
98     3988
0      2599
97     2545
96     1837
      ...  
16      172
21      171
11      164
14      160
20      158
Name: hc17, Length: 100, dtype: int64

0     46610
1      5254
2      3778
3      2496
4      1856
      ...  
76      162
72      160
75      157
63      157
83      131
Name: hc18, Length: 100, dtype: int64

99    35462
98     5007
0      3540
97     3445
96     2357
      ...  
44      296
20      296
35      288
49      284
39      279
Name: hc19, Length: 100, dtype: int64

A1F    15696
F1F     6704
A1G     6634
A1E     5429
A2F     5353
       ...  
U1C        1
A2B        1
A3B        1
P1A        1
I1D        1
Name: rfa_6, Length: 109, dtype: int64

A1F    10954
        8874
A1E     6602
A1G     4927
A2F     4830
       ...  
I4D        1
A2B        1
L3C        1
N4C        1
U1G        1
Name: rfa_7, Length: 106, dtype: int64

A1F    11312
A1E     6904
A1G     5063
A2F     4961
F1F     3984
       ...  
N4C        2
I3E        1
L3C        1
L4C        1
U1D        1
Name: rfa_8, Length: 109, dtype: int64

       11245
A1F     9629
A1E     7038
A1G     4259
A2F     4155
       ...  
A2B        2
A3B        2
U1D        1
N2A        1
I1E        1
Name: rfa_9, Length: 107, dtype: int64

       32748
A1F     9204
A1E     6153
A1G     3943
A2F     3516
       ...  
I4C        1
A2B        1
L3D        1
A4B        1
L4F        1
Name: rfa_10, Length: 94, dtype: int64

       10422
A1F     9745
A1E     7029
A1G     4144
A2F     3735
       ...  
S3B        8
A4B        4
S2B        4
A3B        3
A2B        2
Name: rfa_11, Length: 101, dtype: int64

A1F    9857
       8923
A1E    7121
A1G    4199
A2F    3783
       ... 
A4B       4
A3B       3
A2B       2
U1C       1
F1B       1
Name: rfa_12, Length: 107, dtype: int64

       40219
A1F     9066
A1G     3954
A2F     3281
A2E     2695
       ...  
I4C        1
F1B        1
U1F        1
U1G        1
L3G        1
Name: rfa_13, Length: 87, dtype: int64

       18867
A1F     8053
A1E     7766
A1G     3740
A1D     3577
       ...  
U1E        4
I3F        3
L2D        3
N2B        1
U1D        1
Name: rfa_14, Length: 95, dtype: int64

       20417
A1E     7080
A1F     6445
A1D     3322
A1G     3053
       ...  
U1C        2
S2A        1
I2C        1
P1C        1
I3C        1
Name: rfa_16, Length: 123, dtype: int64

       27650
A1E     6773
A1F     5328
A1D     3645
A1G     2305
       ...  
A4B        2
S4A        1
A3B        1
S2A        1
A4A        1
Name: rfa_17, Length: 118, dtype: int64

       21263
A1E     7186
A1F     5510
A1D     3941
A1G     2408
       ...  
N3B        1
A4A        1
S3A        1
P1B        1
S2A        1
Name: rfa_18, Length: 122, dtype: int64

       24492
A1E     7248
A1F     5347
A1D     4156
A1G     2330
       ...  
A4B        3
P1B        2
S4A        2
L4C        1
A4A        1
Name: rfa_19, Length: 108, dtype: int64

       35212
A1E     6729
A1F     5127
A1D     3607
A1G     2245
       ...  
S4A        2
P1B        2
S3A        2
A4B        2
A4A        1
Name: rfa_21, Length: 102, dtype: int64

       25648
A1E     7233
A1F     5324
A1D     4113
A1G     2318
       ...  
U1C        2
L4C        1
A4A        1
A1A        1
F1B        1
Name: rfa_22, Length: 117, dtype: int64

       56274
A1F     4607
A1E     4348
A1G     2401
S2E     2243
       ...  
I4F        1
L1D        1
U1C        1
U1F        1
2D         1
Name: rfa_23, Length: 87, dtype: int64

       36973
A1E     7227
A1F     5032
A1D     4563
F1D     3315
       ...  
A1C        2
U1C        2
N1C        2
L4C        1
U1G        1
Name: rfa_24, Length: 97, dtype: int64

13     2718
14     2513
15     1920
24     1824
25     1756
       ... 
167       1
144       1
169       1
177       1
166       1
Name: numprom, Length: 165, dtype: int64

15.00    3575
10.00    3387
20.00    3119
25.00    1838
5.00     1716
         ... 
5.40        1
12.94       1
21.80       1
30.91       1
31.65       1
Name: ramnt_8, Length: 108, dtype: int64

10.00     2797
15.00     2432
20.00     2063
5.00      1421
25.00     1105
          ... 
250.00       1
16.95        1
97.00        1
450.00       1
6.91         1
Name: ramnt_9, Length: 90, dtype: int64

10.00    1789
15.00    1359
20.00    1189
5.00      968
25.00     772
         ... 
66.00       1
97.00       1
61.00       1
25.28       1
80.00       1
Name: ramnt_10, Length: 92, dtype: int64

10.00    4351
15.00    3728
20.00    3180
5.00     2426
25.00    1886
         ... 
54.00       1
9.97        1
9.49        1
47.00       1
38.65       1
Name: ramnt_12, Length: 114, dtype: int64

10.00    4510
5.00     3453
15.00    3035
20.00    2525
25.00    1509
         ... 
14.90       1
4.01        1
51.00       1
76.00       1
10.01       1
Name: ramnt_14, Length: 102, dtype: int64

10.00     4332
15.00     3165
5.00      2915
20.00     2663
12.00     1561
          ... 
500.00       1
8.66         1
400.00       1
61.00        1
0.50         1
Name: ramnt_16, Length: 109, dtype: int64

10.00     4251
5.00      3343
15.00     1792
20.00     1513
7.00       968
          ... 
58.00        1
33.25        1
8.99         1
170.00       1
80.00        1
Name: ramnt_18, Length: 105, dtype: int64

10.00    4384
5.00     3116
15.00    1854
20.00    1512
7.00     1117
         ... 
88.00       1
66.00       1
20.50       1
28.42       1
83.00       1
Name: ramnt_22, Length: 92, dtype: int64

20.00      4206
25.00      3696
15.00      3395
30.00      2573
40.00      1595
           ... 
24.95         1
423.06        1
787.00        1
347.25        1
1174.00       1
Name: ramntall, Length: 2094, dtype: int64

1      10008
2       7722
3       7189
4       7019
5       6293
       ...  
85         1
86         1
237        1
87         1
116        1
Name: ngiftall, Length: 89, dtype: int64

5.00     34559
3.00     17763
10.00    14679
15.00     5735
20.00     5537
         ...  
99.00        1
8.50         1
22.67        1
3.03         1
5.07         1
Name: minramnt, Length: 191, dtype: int64

9602    3041
9601    3021
9512    2359
9509    2336
9510    2325
        ... 
8211       1
8604       1
8306       1
8410       1
8409       1
Name: minrdate, Length: 146, dtype: int64

15.00     18414
20.00     18300
25.00     12270
10.00      9405
16.00      3808
          ...  
16.59         1
24.95         1
63.33         1
177.00        1
10.01         1
Name: maxramnt, Length: 275, dtype: int64

9512    10563
9601     6812
9509     6120
9602     5875
9504     5218
        ...  
8601        1
8410        1
7910        1
8501        1
8602        1
Name: maxrdate, Length: 150, dtype: int64

15.00     17776
20.00     15838
10.00     14256
25.00     10149
5.00       4949
          ...  
137.00        1
119.47        1
20.50         1
450.00        1
15.45         1
Name: lastgift, Length: 231, dtype: int64

9501    2957
9401    2770
9310    2646
9410    2451
9601    2406
        ... 
8303       1
8305       1
8304       1
7408       1
7903       1
Name: fistdate, Length: 177, dtype: int64

9504.0    2253
9412.0    1970
8703.0    1959
9512.0    1870
8612.0    1688
          ... 
8303.0       1
8306.0       1
7705.0       1
8011.0       1
8409.0       1
Name: nextdate, Length: 188, dtype: int64

15.000000    6020
20.000000    4566
25.000000    2659
10.000000    2227
12.500000    1800
             ... 
5.959459        1
8.836735        1
10.604167       1
7.612903        1
5.518889        1
Name: avggift, Length: 7713, dtype: int64

131072    1
148834    1
134507    1
132458    1
10278     1
         ..
190984    1
168455    1
164357    1
51951     1
2047      1
Name: controln, Length: 95412, dtype: int64

165

### 5. Reduce the number of categories in the column GENDER. The column should only have either "M" for males, "F" for females, and "other" for all the rest

Note that there are a few null values in the column. We will first replace those null values using the code below:

```python
print(categorical['GENDER'].value_counts())
categorical['GENDER'] = categorical['GENDER'].fillna('F')
```

In [17]:
# Checking the data in gender column
data['gender'].value_counts()

F    51277
M    39094
      2957
U     1715
J      365
A        2
C        2
Name: gender, dtype: int64

In [23]:
print(categorical['gender'].value_counts())
categorical['gender'] = categorical['gender'].fillna('F')

F    51277
M    39094
      2957
U     1715
J      365
A        2
C        2
Name: gender, dtype: int64


In [25]:
# getting UJAC as other gender and keeping M F
data['gender'] = data['gender'].apply(lambda x: 'M' if str(x).lower().startswith('m')
                                                  else 'F' if str(x).lower().startswith('f')
                                                  else 'O')
data['gender'] = data['gender'].fillna('O')

data.gender.value_counts()

F    51277
M    39094
O     5041
Name: gender, dtype: int64