In [1]:
import pandas as pd

donors_tar = pd.read_csv('target.csv')
donors_num = pd.read_csv('categorical.csv')
donors_cat = pd.read_csv('numerical.csv')

Donors = pd.concat([donors_tar,donors_cat, donors_num], axis=1)

In [2]:
donors_tar

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
...,...,...
95407,0,0.0
95408,0,0.0
95409,0,0.0
95410,1,18.0


In [3]:
X = Donors.drop(['TARGET_B','TARGET_D'], axis = 1) #boston features
y = Donors['TARGET_B'] #Labels 

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
train_set = pd.concat([X_train, y_train], axis=1)

#### Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

In [6]:
from sklearn.utils import resample # WHY we are just oversampling the train one?

no_donate = train_set[train_set['TARGET_B'] == 0] #we are splitting in the majority and the minority 
yes_donate = train_set[train_set['TARGET_B'] == 1] # this is the minority 

print(no_donate.shape, yes_donate.shape)

(63369, 338) (3419, 338)


In [7]:
from sklearn.utils import resample
yes_donate_upsampled = resample(yes_donate, 
                                    replace=True,
                                    n_samples = len(no_donate),
                                    random_state=42)

In [8]:
print(no_donate.shape)
print(yes_donate_upsampled.shape)

(63369, 338)
(63369, 338)


In [9]:
# Now we have the same numbers of values in both categories. 0 and 1. 

In [10]:
trainset_ups = pd.concat([no_donate, yes_donate_upsampled], axis=0)

In [11]:
trainset_ups.shape

(126738, 338)

In [12]:
X_trainset_ups = trainset_ups.drop(['TARGET_B'], axis = 1) #boston features
y_trainset_ups = trainset_ups['TARGET_B'] #Labels 

In [13]:
categorial_heads = X_trainset_ups.select_dtypes(include=['object']).columns.tolist()
categorial_heads

['STATE', 'HOMEOWNR', 'GENDER', 'RFA_2R', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A']

In [14]:
display(X_trainset_ups.shape)
display(y_trainset_ups.shape)
display(X_test.shape) # We oversampled just in train set, why?
display(y_test.shape) # We oversampled just in train set, why?

(126738, 337)

(126738,)

(28624, 337)

(28624,)

In [15]:
import numpy as np

In [16]:
X_trainset_ups_num = X_trainset_ups.select_dtypes(np.number)
X_trainset_ups_cat = X_trainset_ups.select_dtypes(object) 
### Why Should not create a test_set? test_set = X_test y_test
## Because we are not going to feed the model with that, right?
# The model will be trained with an upbanlance and proccesed(encode and scaled). WHY? 

In [17]:
X_test_num = X_test.select_dtypes(np.number) 
X_test_cat = X_test.select_dtypes(object)

In [18]:
X_trainset_ups_num

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
33476,1,54.0,7,2,4,0,34,23,49,2,...,44,1,95,10,92,10,95,10,92,1
87878,28,28.0,6,9,0,3,36,39,18,7,...,70,1,96,3,96,3,96,3,96,3
4784,28,74.0,5,9,0,5,36,32,33,4,...,24,1,93,12,96,3,96,3,87,1
85573,1,60.0,6,8,0,1,37,40,22,5,...,38,1,95,2,95,10,95,10,95,2
68933,0,80.0,4,9,12,0,45,17,63,5,...,18,1,95,6,95,6,95,6,95,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39226,0,75.0,6,6,21,6,41,46,27,14,...,23,1,96,2,96,2,96,2,96,2
23926,0,60.0,5,9,0,0,23,36,30,4,...,38,1,92,5,92,3,95,6,86,7
12881,2,62.0,1,9,0,0,37,31,48,8,...,36,1,87,5,92,4,96,2,86,8
62494,0,71.0,2,9,0,0,19,39,25,3,...,27,2,88,9,94,6,95,8,88,9


#### One Hot Encode 

In [19]:
#ENCODING Train
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_trainset_ups_cat)
encoded_categorical = encoder.transform(X_trainset_ups_cat).toarray()
X_trainset_ups_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out()) # needed to avoid error

In [20]:
#ENCODING TEST
encoded_categorical = encoder.transform(X_test_cat).toarray()
X_test_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out())

#### StandardScaler

In [21]:
from sklearn.preprocessing import StandardScaler

In [23]:
# SCALING Train
transformer_train = StandardScaler().fit(X_trainset_ups_num)
x_standardized = transformer_train.transform(X_trainset_ups_num)

X_trainset_ups_num_scaled = pd.DataFrame(x_standardized, columns=X_trainset_ups_num.columns)

In [25]:
# SCALING Test
# Why we don't use here another transfromer. transformer_test = StandardScaler().fit(X_test_num)?
x_standardized = transformer_train.transform(X_test_num)

X_test_num_scaled = pd.DataFrame(x_standardized, columns=X_test_num.columns)

In [26]:
X_train_processed = pd.concat([X_trainset_ups_cat_encoded, X_trainset_ups_num_scaled], axis=1)
X_test_processed = pd.concat([X_test_cat_encoded, X_test_num_scaled], axis=1)

In [27]:
y_trainset_ups.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [28]:
X_train_processed.shape

(126738, 354)

In [29]:
X_test_processed.shape

(28624, 354)

### RANDOM FOREST WITHOUT SELECTING FEATURES. 

In [None]:
#DecisionTree does not need be feed by scaled data. 

In [30]:
y_trainset_ups.shape

(126738,)

In [32]:
y_train.shape

(66788,)

In [33]:
y_test.shape

(28624,)

In [31]:
X_train_processed.shape

(126738, 354)

In [None]:
X_test_processed.shape

In [34]:
## ValueError: Found input variables with inconsistent numbers of samples: [126738, 66788]
# 1124 check_consistent_length(X, y)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train_processed, y_trainset_ups)
print('TrainSet = ',clf.score(X_train_processed, y_trainset_ups))
print('TestSet = ',clf.score(X_test_processed, y_test))

TrainSet =  0.6262525840710758
TestSet =  0.6072177193963108


In [None]:
# If the score is close to each other, it is working pretty well. 

### FEATURES SELECTION

In [None]:
# Variance: is the average of square of the average distance to the mean

In [None]:
#### If I insert the X2 it is already upbalance so the mean variance is so so low. 
### So, threshold is not goint to remove anything
## I have to scalate the columns (just numericals?) from the dataframe NOT upsammpled. In the lesson just numerical are fitted. 
# I need to split the train_set (should I include the test one?, I mean the whole dataset?) in cat and num to scaleted them and fit in the threshold. 

In [None]:
# I did everything with the ups ones, that's why I need to 

In [None]:
## I SHOULD NOT APPLY THE FEATURE SELECTION METHOD WITH UPBALANCE SET, 
# because it would be based on artificial set instead the real one. 
# I need to repeat again esc and endcode becasuse I did before after upbalance. 

In [35]:
X_train_set_num = train_set.select_dtypes(np.number)

In [36]:
X_train_set_num = X_train_set_num.drop(['TARGET_B'], axis=1)

In [None]:
#SCALATION. 

In [38]:
transformer_2 = StandardScaler().fit(X_train_set_num)
x_standardized = transformer_2.transform(X_train_set_num)
X_train_set_num_scaled = pd.DataFrame(x_standardized, columns=X_train_set_num.columns)

In [1]:
# I am goint to use just numerical because in the lesson Jan applied just numericals. 
# Maybe because in that dataset has just numericals? 

In [39]:
from sklearn.feature_selection import VarianceThreshold
var_threshold = 0.02

# by default var_treshold is 0. But we want to remove the rows are constant and the ones are almost constant too.   
# In practise we would scale, the columns first, and then apply threshold, or apply different treshold for different columns.
sel = VarianceThreshold(threshold=var_threshold) 

#### two parameters: threshold and variance_threshold.
### threshold parameter is the value that will be used to determine whether or not a feature should be included in the model.          
## which means that any features with an absolute difference between their mean and standard deviation greater than 
# 0.02 (at least 2% of variation among all features) are not included in the model, they will be removed.

sel = sel.fit(X_train_set_num_scaled) 
### Reviewing the lesson. I nor sure if I should to insert just the train or whole numericals scales 
## with a variable that concat. X_train_num_scaled + X_test_num_scaled. 
# However after encode categorical they are al well numbers, so....I am confused.

## Next, we create another variable called temp using sel as its input function and 
# then transform our training data into a DataFrame object using pd-learn library functions like fit() and transform().

temp = sel.transform(X_train_set_num_scaled)
temp = pd.DataFrame(temp)

print(X_train_set_num_scaled.shape)
print(temp.shape)

(66788, 330)
(66788, 330)


In [40]:
pd.DataFrame(temp).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,320,321,322,323,324,325,326,327,328,329
0,-0.0547,-0.52446,1.686015,-1.873265,0.08026,-0.207965,0.310681,-0.444573,0.92477,-1.110277,...,0.687803,-0.622058,0.9489,0.93245,-1.346385,0.766742,-0.788747,0.784328,0.219517,-1.279678
1,-0.027146,-2.323864,1.097775,0.717605,-0.361279,0.402718,0.485408,0.61562,-0.829593,0.03643,...,1.864638,-0.622058,1.322158,-0.864799,0.950893,-0.98446,1.116868,-0.864639,1.46101,-0.776055
2,-0.055721,-0.040005,1.097775,0.717605,-0.361279,-0.207965,-0.475592,-0.312049,0.698401,0.03643,...,0.325699,2.67941,-0.917391,0.93245,-0.197746,0.51657,-0.788747,1.019894,-1.332349,-0.524243
3,-0.027146,0.859697,0.509535,0.717605,-0.361279,0.80984,0.485408,0.151785,0.019292,-0.651594,...,-0.217456,-0.622058,0.202384,1.44595,0.950893,-0.98446,1.116868,-0.864639,-1.332349,-1.279678
4,-0.0547,-0.109213,1.097775,0.347481,-0.361279,-0.004404,0.572771,0.681882,-0.603224,-0.422253,...,0.416225,-0.622058,0.9489,-1.121549,0.376574,0.766742,-0.788747,0.784328,1.150637,-1.027866


#### I am going to run the same columns from the file review because I got 0.

In [41]:
#sel.variances_ > var_threshold
#sel.get_support()
#var_list = list(sel.get_support())
#len(var_list)
##  The code would return a list of all the variances that are less than or equal to the threshold.
# I don't understand this section. 

330

In [43]:
#len(X_train_set_num_scaled.columns)

330

In [44]:
#len(list(zip(X_train_set_num_scaled.columns,var_list)))

330

In [45]:
#drop_list = [col[0] for col in zip(X_train_set_num_scaled.columns,var_list) if col[1] == False]
#len(drop_list)

##The code attempts to be used to find a list of columns with support that 
# are greater than the variable threshold.

0

In [None]:
#??????

In [None]:
['OSOURCE','SOLIH','VETERANS','ZIP','Unnamed: 0']

In [48]:
X_train_processed2 = X_train_processed.drop(drop_list, axis = 1)
X_train_processed2

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.719928,-0.637765,0.971393,0.930427,-1.248993,0.764108,-0.839699,0.806585,0.293274,-1.293844
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.924882,-0.637765,1.343881,-0.878347,0.961829,-0.985003,1.007773,-0.821131,1.540983,-0.786380
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.206961,-0.637765,0.226418,1.447219,0.961829,-0.985003,1.007773,-0.821131,-1.266362,-1.293844
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.441861,-0.637765,0.971393,-1.136743,0.409124,0.764108,-0.839699,0.806585,1.229055,-1.040112
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.485027,-0.637765,0.971393,-0.103158,0.409124,-0.235384,-0.839699,-0.123538,1.229055,-0.025183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.253305,-0.637765,1.343881,-1.136743,0.961829,-1.234876,1.007773,-1.053662,1.540983,-1.040112
126734,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.441861,-0.637765,-0.146070,-0.361555,-1.248993,-0.985003,-0.839699,-0.123538,-1.578289,0.228549
126735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.349172,-0.637765,-2.008509,-0.361555,-1.248993,-0.735130,1.007773,-1.053662,-1.578289,0.482281
126736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.067927,-0.315202,-1.636021,0.672030,-0.143582,-0.235384,-0.839699,0.341523,-0.954435,0.736013


In [49]:
X_test_processed2 = X_test_processed.drop(drop_list, axis = 1)
X_test_processed2.

SyntaxError: invalid syntax (1523850200.py, line 2)

## Looking at multicolinearity

In [55]:
Xy_train= pd.concat([X_train_processed2, y_train], axis=1)
Xy_train.shape

(126738, 355)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

corr_matrix = Xy_train.corr(method = 'pearson')
fig, ax = plt.subplots(figsize=(20,15))
ax = sns.heatmap(corr_matrix, annot = True)
plt.show()

In [None]:
col_to_drop = ['HVP1','HVP2','HVP3','HVP5','HVP6','HV2']

In [None]:
X_train = X_train_all.drop(['HVP1','HVP2','HVP3','HVP5','HVP6','HV2'], axis = 1)
X_train.shape
X_test = X_test_all.drop(['HVP1','HVP2','HVP3','HVP5','HVP6','HV2'], axis = 1)
X_test

In [None]:
X_train.columns

In [None]:
X_test.columns

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print('TrainSet = ',clf.score(X_train, y_train))
print('TestSet = ',clf.score(X_test, y_test))

In [None]:
y_train.value_counts()

In [None]:
plot_confusion_matrix(clf, X_test, y_test,cmap=plt.cm.Blues)  
plt.show()

In [None]:
X_train_num_scaled = X_train_num_scaled.drop(drop_list, axis = 1)
X_train_num_scaled

In [None]:
X2 = pd.concat([X_train_processed,y_trainset_ups], axis=1)
X2.reset_index(drop=True, inplace=True)

In [None]:
X = Donors.drop(['TARGET_B','TARGET_D'], axis = 1) #boston features
y = Donors['TARGET_B'] #Labels 

In [None]:
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object) 

In [None]:
#ONE HOT ENCODED
encoded_categorical = encoder.transform(X_cat).toarray()
X_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out()) 

In [None]:
#
x_standardized = transformer.transform(X_num)
X_num_scaled = pd.DataFrame(x_standardized, columns=X_num.columns)

In [None]:
X_trans = pd.concat([X_cat_encoded,X_num_scaled], axis=1)

In [None]:
print('TrainSet = ',clf.score(X_trans, y))

In [None]:
predictions = clf.predict(X_trans) # Here I am using the random forest. 
predictions

In [None]:
X_trans['Predictions'] = predictions

In [None]:
X_trans

#### Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

In [None]:
# Run the random fetures #selecting feature by treshold #rerun random fores 

#### Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.

#### Discuss the output and its impact in the business scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?