# Lab | Revisiting Machine Learning Case Study

In this lab, you will use the `learningSet.csv` file, which you have already cloned in today's activities.

In [31]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Silence all warnings for the rest of the session.
# NOTE(review): this hides every warning category, including ones raised while
# loading or transforming the data — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [90]:
# Load the learning set from a path relative to the working directory.
data = pd.read_csv('learningSet.csv')

In [91]:
# (rows, columns) of the raw frame.
data.shape

(95412, 481)

In [92]:
# Show every column name of the raw frame as a plain Python list.
data.columns.tolist()

['ODATEDW',
 'OSOURCE',
 'TCODE',
 'STATE',
 'ZIP',
 'MAILCODE',
 'PVASTATE',
 'DOB',
 'NOEXCH',
 'RECINHSE',
 'RECP3',
 'RECPGVG',
 'RECSWEEP',
 'MDMAUD',
 'DOMAIN',
 'CLUSTER',
 'AGE',
 'AGEFLAG',
 'HOMEOWNR',
 'CHILD03',
 'CHILD07',
 'CHILD12',
 'CHILD18',
 'NUMCHLD',
 'INCOME',
 'GENDER',
 'WEALTH1',
 'HIT',
 'MBCRAFT',
 'MBGARDEN',
 'MBBOOKS',
 'MBCOLECT',
 'MAGFAML',
 'MAGFEM',
 'MAGMALE',
 'PUBGARDN',
 'PUBCULIN',
 'PUBHLTH',
 'PUBDOITY',
 'PUBNEWFN',
 'PUBPHOTO',
 'PUBOPP',
 'DATASRCE',
 'MALEMILI',
 'MALEVET',
 'VIETVETS',
 'WWIIVETS',
 'LOCALGOV',
 'STATEGOV',
 'FEDGOV',
 'SOLP3',
 'SOLIH',
 'MAJOR',
 'WEALTH2',
 'GEOCODE',
 'COLLECT1',
 'VETERANS',
 'BIBLE',
 'CATLG',
 'HOMEE',
 'PETS',
 'CDPLAY',
 'STEREO',
 'PCOWNERS',
 'PHOTO',
 'CRAFTS',
 'FISHER',
 'GARDENIN',
 'BOATS',
 'WALKER',
 'KIDSTUFF',
 'CARDS',
 'PLATES',
 'LIFESRC',
 'PEPSTRFL',
 'POP901',
 'POP902',
 'POP903',
 'POP90C1',
 'POP90C2',
 'POP90C3',
 'POP90C4',
 'POP90C5',
 'ETH1',
 'ETH2',
 'ETH3',
 'ETH4',
 'ET

### 1. Check for null values in all the columns

In [57]:
# Count the missing entries of every column in one pass, then derive the
# percentage of missing rows from those same counts.
null_counts = data.isna().sum().to_numpy()
null_value = list(null_counts)
null_value_percent = list(null_counts * 100 / len(data))

In [58]:
# One row per column: its name, its missing-value count, and the percentage of
# rows that are missing.  The percentage list is named `null_value_percent`;
# the earlier spelling `null_values_percent` is not defined by any visible cell
# and raises NameError when the notebook is run from a fresh kernel.
df_null_values = pd.DataFrame(list(zip(data.columns, null_value, null_value_percent)),
                              columns = ['column','null_values', 'null_values_percent'])

In [59]:
# Display the summary with the columns that have the most missing values first.
df_null_values.sort_values(by='null_values',ascending=False)

Unnamed: 0,column,null_values,null_values_percent
414,RDATE_5,95403,99.990567
436,RAMNT_5,95403,99.990567
412,RDATE_3,95170,99.746363
434,RAMNT_3,95170,99.746363
413,RDATE_4,95131,99.705488
...,...,...,...
168,ETHC3,0,0.000000
167,ETHC2,0,0.000000
166,ETHC1,0,0.000000
165,HHD12,0,0.000000


### 2. Exclude the following variables by looking at the definitions. Create a new empty list called drop_list. We will append to this list and then drop all the columns in it later:

- `OSOURCE` - symbol definitions not provided, too many categories
- `ZIP` (zip code) - we are including state already


In [94]:
# Columns excluded after reading their definitions: OSOURCE has undocumented
# symbols and too many categories; ZIP is redundant because STATE is kept.
drop_list = []
drop_list += ['OSOURCE', 'ZIP']

### 3. Identify columns that have over 85% missing values

In [95]:
# Keep only the summary rows whose missing-value percentage exceeds 85,
# then show how many columns qualify.
mostly_missing_mask = df_null_values['null_values_percent'].gt(85)
too_many_missing = df_null_values.loc[mostly_missing_mask]
too_many_missing.shape

(25, 3)

In [96]:
# Add the mostly-empty columns to the manual exclusions.  dict.fromkeys keeps
# the first occurrence of each name in its original order, so running this
# cell a second time does not append the same columns again.
drop_list = list(dict.fromkeys(drop_list + list(too_many_missing['column'])))
drop_list

['OSOURCE',
 'ZIP',
 'NUMCHLD',
 'RDATE_3',
 'RDATE_4',
 'RDATE_5',
 'RDATE_6',
 'RDATE_7',
 'RDATE_10',
 'RDATE_13',
 'RDATE_15',
 'RDATE_17',
 'RDATE_20',
 'RDATE_21',
 'RDATE_23',
 'RAMNT_3',
 'RAMNT_4',
 'RAMNT_5',
 'RAMNT_6',
 'RAMNT_7',
 'RAMNT_10',
 'RAMNT_13',
 'RAMNT_15',
 'RAMNT_17',
 'RAMNT_20',
 'RAMNT_21',
 'RAMNT_23']

In [97]:
# Total number of columns scheduled for removal.
len(drop_list)

27

### 4. Remove those columns from the dataframe

In [186]:
# Work on a copy so the raw frame loaded from disk stays untouched.
data_2 = data.copy()

In [187]:
# Remove every listed column in a single call.  The result is derived from the
# untouched `data` frame instead of mutating `data_2` in place, so this cell
# yields the same frame on every run; the in-place per-column loop raised
# KeyError when re-run because the columns were already gone.
data_2 = data.drop(columns=drop_list)

In [188]:
# (rows, columns) after removing the excluded columns.
data_2.shape

(95412, 454)

### 5. Reduce the number of categories in the column GENDER. The column should only have either "M" for males, "F" for females, and "other" for all the rest

Note that there are a few null values in the column. We will first replace those null values using the code below:

```python
print(categorical['GENDER'].value_counts())
categorical['GENDER'] = categorical['GENDER'].fillna('F')
```

In [189]:
# Inspect the raw GENDER categories before consolidating them.
data_2['GENDER'].value_counts()

F    51277
M    39094
      2957
U     1715
J      365
C        2
A        2
Name: GENDER, dtype: int64

In [190]:
# Treat missing GENDER values as 'F'.
# NOTE(review): the displayed counts list F as 51277 both before and after
# this step, so no NaN appears to be filled here — the 2957 unlabeled entries
# look like blank strings rather than nulls; verify against the data.
data_2['GENDER'] = data_2['GENDER'].fillna('F')

In [191]:
# Rare GENDER codes that are folded into a single 'other' category.
others = [
    'U',
    'J',
    'C',
    'A',
]

In [192]:
# Collapse every rare code into 'other' with one list-based replace.
data_2['GENDER'] = data_2['GENDER'].replace(others, 'other')

In [193]:
# Check the categories after folding the rare codes into 'other'.
data_2['GENDER'].value_counts()

F        51277
M        39094
          2957
other     2084
Name: GENDER, dtype: int64

In [194]:
# Blank GENDER entries are assigned to 'F' (the largest category in the counts
# shown above), then the final category counts are displayed.
gender_filled = data_2['GENDER'].replace({' ': 'F'})
data_2['GENDER'] = gender_filled
data_2['GENDER'].value_counts()

F        54234
M        39094
other     2084
Name: GENDER, dtype: int64