In [39]:
import numpy as np
import pandas as pd
import re

In [41]:
# A character class matches any ONE of the listed characters, so '[er]'
# returns every single 'e' or 'r'. The literal pattern 'er' would instead
# match only the two-letter sequence "er".
text = "That person wears marvelous trousers."
pattern = '[er]'
re.compile(pattern).findall(text)

['e', 'r', 'e', 'r', 'r', 'e', 'r', 'e', 'r']

In [52]:
# '*' means "zero or more", so 'gr[ae]*y' accepts 'gry' (no vowel at all)
# as well as 'grey'. With the sentence "Is it spelled gray or grey?" it
# would return both spellings.
text = "Is it spelled gry or grey?"
pattern = 'gr[ae]*y'
re.compile(pattern).findall(text)

['gry', 'grey']

In [None]:
A-Z -> 1-26
a-z -> 27-52

In [53]:
# Sample sentence used to extract capitalised words.
# The apostrophe in "administration's" was a mis-encoded replacement
# character in the original text and has been restored.
text = '''TKerraPower, A nuclear-energy company founded by Bill Gates,
        is unlikely to follow through on building a demonstration reactor in China,
        due largely to the Trump administration's crackdown on the country'''

# pattern = '[A-Z][a-z]*'   # '*' would also return lone capitals such as 'T' and 'A'
# pattern = '[a-Z]+'        # invalid: 'a' sorts after 'Z', so re raises "bad character range"
# One capital letter followed by at least one lowercase letter.
pattern = '[A-Z][a-z]+'
print(re.findall(pattern, text))

['Kerra', 'Power', 'Bill', 'Gates', 'China', 'Trump']


### More complex regex

- `\w`: Any alphanumeric character or underscore (a "word" character).
- `\W`: Any character that is not alphanumeric or an underscore.
- `\d`: Any numeric character.
- `\D`: Any non-numeric character.
- `\s`: Any whitespace characters.
- `\S`: Any non-whitespace characters.

In [65]:
text = "If you tell the truth, you don't have to remember anything 100."

# Regex patterns are written as raw strings (r'...') so that the backslash
# reaches the regex engine untouched; '\w' in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
pattern = r'\w'        # one word character per match
# pattern = r'\w+'     # runs of one or more word characters
# pattern = r'\w*'     # zero or more: also yields empty matches
# pattern = r'\w?'     # zero or one: single characters plus empty matches
# pattern = '.'        # any character except a newline
print(re.findall(pattern, text))

['I', 'f', 'y', 'o', 'u', 't', 'e', 'l', 'l', 't', 'h', 'e', 't', 'r', 'u', 't', 'h', 'y', 'o', 'u', 'd', 'o', 'n', 't', 'h', 'a', 'v', 'e', 't', 'o', 'r', 'e', 'm', 'e', 'm', 'b', 'e', 'r', 'a', 'n', 'y', 't', 'h', 'i', 'n', 'g', '1', '0', '0']


In [61]:
text = "If you tell the truth, you don't have to remember anything 100."
# Raw strings keep the backslash literal for the regex engine ('\w' in a
# normal string is an invalid escape sequence).
# pattern = r'\w{4}'   # exactly four word characters per match
pattern = r'\w{4,}'    # runs of four or more word characters
print(re.findall(pattern, text))

['tell', 'truth', 'have', 'remember', 'anything']


In [63]:
text = """
Aeromexico 800 -237- 6639
Air Canada 888-247-2262
Air Canada Rouge 888-247-2262
Air Creebec 800-567-6567
Air Inuit 800-361-2965
Air North 800-661-0407
Air Tindi 888-545-6794"""
# Three digit groups joined by hyphens. Written as a raw string so '\d' is
# not treated as an (invalid) string escape sequence.
# The Aeromexico number is not matched because of the spaces around its hyphens.
pattern = r'\d+-\d+-\d+'
re.findall(pattern, text)

['888-247-2262',
 '888-247-2262',
 '800-567-6567',
 '800-361-2965',
 '800-661-0407',
 '888-545-6794']

### Feature Engineering

- **Feature engineering** is the process of transforming the given data into a form that is easier to interpret. All the techniques that we have used before, including data transformation techniques, encoding categorical variables, scaling numerical features, imputing missing values, cleaning categorical columns, using regular expressions, using DateTime, and string functions, are feature engineering techniques. The key idea is that we are manipulating the information that is available to us to be able to better understand it and improve the model.

- **Feature extraction/feature generation**: It is the process of extracting relevant information from the existing available information. For example, you are provided with the date of birth of customers in the data. You are not interested in when they were born but more interested in their age. In that case, you can use the DateTime functions to calculate the age or extract other relevant information such as the year they were born, the month they were born.


- **Feature selection**: This is the process of selecting the features/columns in your data that are relevant to the model. For example, if it is a prediction problem, you will be interested only in those variables that have an impact on your target, and not in other columns/features that provide no information about the target. Adding variables that don't add value to the model degrades the performance of the model, both in terms of accuracy and in terms of efficiency.

Some of the feature selection methods that we took a look at earlier include:

- Checking null values to drop a column
- Sense check to drop columns that are not significant
- Using heat maps to check multicollinearity for numerical variables
- Chi-square tests for categorical variables

In [66]:
# Read the healthcare dataset from a CSV file (path is relative to the working directory).
data = pd.read_csv('4.01_healthcare.csv')

In [67]:
data.shape

(90569, 28)

In [69]:
data.head()

Unnamed: 0,STATE,PVASTATE,DOB,MDMAUD,RECP3,GENDER,DOMAIN,INCOME,HOMEOWNR,HV1,...,VETERANS,NUMPROM,CARDPROM,CARDPM12,NUMPRM12,MAXADATE,RFA_2,NGIFTALL,TIMELAG,AVGGIFT
0,IL,,3712,XXXX,,F,T2,,,479,...,,74,27,6,14,9702,L4E,31,4.0,7.741935
1,CA,,5202,XXXX,,M,S1,6.0,H,5468,...,,32,12,6,13,9702,L2G,3,18.0,15.666667
2,NC,,0,XXXX,,M,R2,3.0,U,497,...,,63,26,6,14,9702,L4E,27,12.0,7.481481
3,CA,,2801,XXXX,,F,R2,1.0,U,1000,...,,66,27,6,14,9702,L4E,16,9.0,6.8125
4,FL,,2001,XXXX,X,F,S2,3.0,H,576,...,,113,43,10,25,9702,L2F,37,14.0,6.864865


In [68]:
# Number of rows whose DOB value is 0.
len(data.loc[data['DOB'] == 0])

22507

In [79]:
'-'.join(['a','b','c'])

'a-b-c'

In [77]:
x = '2-019'
# Raw string so that '\d' reaches the regex engine unchanged. findall scans
# left to right for non-overlapping pairs of adjacent digits; here the only
# pair is '01'.
pattern = r'\d\d'
re.findall(pattern,x)

['01']

In [70]:
def year(x):
    """Return the first pair of adjacent digits found in ``x`` as an integer.

    ``x`` is converted with ``str()`` before it is inspected. Values whose
    text is shorter than four characters (for example ``0``) are treated as
    missing.

    Parameters
    ----------
    x : Any
        Raw value, e.g. one entry of the ``DOB`` column (presumably encoded
        as YYMM -- TODO confirm against the data dictionary).

    Returns
    -------
    int or float
        The matched two digits as an ``int``, or ``np.nan`` when the text is
        shorter than four characters or contains no two adjacent digits.
    """
    x = str(x)
    if len(x) < 4:
        return np.nan
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # re.search returns the same first match as findall(...)[0], but gives
    # None instead of raising IndexError when nothing matches.
    match = re.search(r'\d\d', x)
    if match is None:
        return np.nan
    return int(match.group())


Note that our reference year is 1997 here as the data is from that study.


In [71]:
# Convert the two-digit value extracted from DOB into a number of years
# relative to the 1997 reference year (97 - value). Rows for which year()
# returned NaN are filled with the mean of the remaining rows.
age = 97 - data['DOB'].apply(year)
data['year'] = age.fillna(age.mean())

# DOB is no longer needed once the information has been extracted.
data = data.drop(columns=['DOB'])

In [72]:
data.head()

Unnamed: 0,STATE,PVASTATE,MDMAUD,RECP3,GENDER,DOMAIN,INCOME,HOMEOWNR,HV1,HV2,...,NUMPROM,CARDPROM,CARDPM12,NUMPRM12,MAXADATE,RFA_2,NGIFTALL,TIMELAG,AVGGIFT,year
0,IL,,XXXX,,F,T2,,,479,635,...,74,27,6,14,9702,L4E,31,4.0,7.741935,60.0
1,CA,,XXXX,,M,S1,6.0,H,5468,5218,...,32,12,6,13,9702,L2G,3,18.0,15.666667,45.0
2,NC,,XXXX,,M,R2,3.0,U,497,546,...,63,26,6,14,9702,L4E,27,12.0,7.481481,59.51171
3,CA,,XXXX,,F,R2,1.0,U,1000,1263,...,66,27,6,14,9702,L4E,16,9.0,6.8125,69.0
4,FL,,XXXX,X,F,S2,3.0,H,576,594,...,113,43,10,25,9702,L2F,37,14.0,6.864865,77.0


## Multicollinearity

Through multivariate linear regression, we are trying to assess the influence of each of the predictor variables on the target variable. This influence/relationship is linear and is represented by a mathematical equation. The equation is given as: `Y=β0+β1X1+β2X2+β3X3+β4X4+........+βnXn`.
Here, each variable is trying to explain some information about the nature of `Y`, i.e. how `Y` changes with each of the predictor variables. The change in `Y` with `X` is technically variation. When the predictor variables are all independent of each other, each variable explains some information on the change in `Y`. Multicollinearity arises when the predictor variables are highly correlated. Hence some predictors are redundant as they do not reveal any new information on the change in `Y` with a change in `X`.

The correlation matrix using the heat maps helps us understand the correlation between the independent variables.
Using the `sklearn.metrics` module we calculate `R` square statistic. It measures the proportion of variance in the dependent variable that is explained by all of the independent variables.

For checking multicollinearity, we calculate $R_k^2$ (written `R2k` below) and the `VIF` (variance inflation factor) for each of the `k` independent variables. We do this by regressing the `k`-th independent variable on all of the other independent variables. That is, we treat $X_k$ as the dependent variable and use the other independent variables to predict $X_k$.

For eg. `Y=β0+β1X1+β2X2+β3X3+β4X4`

Build a model `X1` vs. `X2 X3 X4`, find `R^2`, call it `R1`.
Build a model `X2` vs. `X1 X3 X4`, find `R^2`, call it `R2`.
and so on and so forth

**Interpreting R square k** - If `R2k` equals zero, variable `k` is not correlated with any other independent variable.
Usually, multicollinearity is a potential problem when `R2k` is greater than `0.75` and, a serious problem when `R2k` is greater than `0.9`.

For each variable for which we compute an individual `R2k`, `VIF = 1 / ( 1 - R2k )` (for each independent variable `k`). It is used to assess multicollinearity.

**Interpretation of the variance inflation factor**: If `VIFk = 1`, variable k is not correlated with any other independent variable. Multicollinearity is a potential problem when `VIFk` is greater than 4 and, a serious problem when it is greater than 10.

### Effects of Multicollinearity:

1. It makes it harder to interpret the significance of variables in the regression model (we will talk about statistical significance/p-value later).
2. It might give good enough results due to over-fitting, but those will not be very reliable (we will talk about over-fitting and under-fitting in more detail later). For now, over-fitting can be explained to the students as a model that is more complicated and not very generalized, i.e. it might work very well on the data at hand but not so well on unknown/out-of-the-box data.
3. It is also important to note that it does not severely impact the model in terms of predictive power, so it matters less if prediction is the only goal of the analysis.

Dealing with High Multicollinearity:

1. Centering/standardizing/normalizing variables may help reduce multicollinearity.
2. Removing one or more of the variables that are highly correlated with each other.


## VIF (Variance Inflation Factor)
Additional resource [here](https://etav.github.io/python/vif_factor_python.html)

In [126]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


# This is the code to show how to use the __variance_inflation_factor__ function
# We use add_constant here because the VIF function expects a constant (intercept) column to be present in the X features: it fits an OLS regression internally but does not add the constant itself.
# data_corr = data.copy().dropna()

# # vif = {}
# data_corr = add_constant(data_corr)
# print(data_corr.shape)
# for i in np.arange(data_corr.shape[1]):
#     column_name = data_corr.columns[i]
#     print(i, column_name)
#     value = variance_inflation_factor(np.array(data_corr), i)
#     vif[column_name] = value

# Code to use the variance_inflation_factor technique to remove highly correlated columns

# flag = True
# threshold = 50
# data_corr = add_constant(data_corr)
# while flag is True:
#     #print(data_corr.head())
#     flag = False
#     values = [variance_inflation_factor(np.array(data_corr), i) for i in np.arange(data_corr.shape[1])]
#     #print(values)
#     if max(values)> threshold:
#         col_index = values.index(max(values))
#         column_name = data_corr.columns[col_index]
#         data_corr = data_corr.drop([column_name], axis=1)
#         flag = True


In [111]:
vif

{'const': 0.0,
 'INCOME': inf,
 'HV1': inf,
 'HV2': inf,
 'HV3': inf,
 'HV4': inf,
 'IC1': inf,
 'IC2': inf,
 'IC3': inf,
 'IC4': inf,
 'IC5': inf,
 'NUMPROM': inf,
 'CARDPROM': inf,
 'CARDPM12': inf,
 'NUMPRM12': inf,
 'MAXADATE': inf,
 'NGIFTALL': inf,
 'TIMELAG': inf,
 'AVGGIFT': inf,
 'year': inf}

In [91]:
data.select_dtypes(include=np.number).columns

Index(['INCOME', 'HV1', 'HV2', 'HV3', 'HV4', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5',
       'NUMPROM', 'CARDPROM', 'CARDPM12', 'NUMPRM12', 'MAXADATE', 'NGIFTALL',
       'TIMELAG', 'AVGGIFT', 'year'],
      dtype='object')

In [101]:
# Keep only the rows of `data` that have no missing value in ANY column, then
# narrow the frame to the columns used in the regression formula.
# dropna() already returns a new frame, so `data` itself is left untouched.
df = data.dropna()[['INCOME', 'HV1', 'HV2', 'HV3', 'HV4','AVGGIFT']]

In [105]:
from patsy import dmatrices

# Right-hand side of the formula: the predictor names joined with '+'.
features = "+".join(['INCOME', 'HV1', 'HV2', 'HV3', 'HV4'])
formula = 'AVGGIFT ~' + features

# dmatrices returns the response (y) and the design matrix (X) as DataFrames;
# X includes the Intercept column added by the formula.
y, X = dmatrices(formula, df, return_type='dataframe')

In [122]:
X

Unnamed: 0,Intercept,INCOME,HV1,HV2,HV3,HV4
1,1.0,6.0,5468.0,5218.0,12.0,10.0
2,1.0,3.0,497.0,546.0,2.0,1.0
3,1.0,1.0,1000.0,1263.0,2.0,1.0
4,1.0,3.0,576.0,594.0,4.0,3.0
6,1.0,4.0,484.0,519.0,3.0,3.0
...,...,...,...,...,...,...
90559,1.0,4.0,977.0,1180.0,5.0,5.0
90560,1.0,6.0,922.0,1244.0,4.0,4.0
90562,1.0,1.0,341.0,421.0,2.0,1.0
90564,1.0,6.0,733.0,875.0,4.0,3.0


In [106]:
# One VIF per column of the design matrix X (Intercept included), stored
# next to the column name it belongs to.
vif_values = [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])]
vif = pd.DataFrame({"VIF Factor": vif_values, "features": X.columns})

In [107]:
vif.round(1)

Unnamed: 0,VIF Factor,features
0,7.2,Intercept
1,1.2,INCOME
2,76.3,HV1
3,77.3,HV2
4,9.3,HV3
5,8.7,HV4
