In [1]:
import pandas as pd

donors_tar = pd.read_csv('target.csv')
donors_num = pd.read_csv('categorical.csv')
donors_cat = pd.read_csv('numerical.csv')

Donors = pd.concat([donors_tar,donors_cat, donors_num], axis=1)

In [2]:
donors_tar

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
...,...,...
95407,0,0.0
95408,0,0.0
95409,0,0.0
95410,1,18.0


In [3]:
X = Donors.drop(['TARGET_B','TARGET_D'], axis = 1) #boston features
y = Donors['TARGET_B'] #Labels 

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
train_set = pd.concat([X_train, y_train], axis=1)

#### Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

In [6]:
from sklearn.utils import resample # WHY we are just oversampling the train one?

no_donate = train_set[train_set['TARGET_B'] == 0] #we are splitting in the majority and the minority 
yes_donate = train_set[train_set['TARGET_B'] == 1] # this is the minority 

print(no_donate.shape, yes_donate.shape)

(63369, 338) (3419, 338)


In [7]:
from sklearn.utils import resample
yes_donate_upsampled = resample(yes_donate, 
                                    replace=True,
                                    n_samples = len(no_donate),
                                    random_state=42)

In [8]:
print(no_donate.shape)
print(yes_donate_upsampled.shape)

(63369, 338)
(63369, 338)


In [9]:
# Now we have the same numbers of values in both categories. 0 and 1. 

In [10]:
trainset_ups = pd.concat([no_donate, yes_donate_upsampled], axis=0)

In [11]:
trainset_ups.shape

(126738, 338)

In [12]:
X_trainset_ups = trainset_ups.drop(['TARGET_B'], axis = 1) #boston features
y_trainset_ups = trainset_ups['TARGET_B'] #Labels 

In [13]:
categorial_heads = X_trainset_ups.select_dtypes(include=['object']).columns.tolist()
categorial_heads

['STATE', 'HOMEOWNR', 'GENDER', 'RFA_2R', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A']

In [14]:
display(X_trainset_ups.shape)
display(y_trainset_ups.shape)
display(X_test.shape) # We oversampled just in train set, why?
display(y_test.shape) # We oversampled just in train set, why?

(126738, 337)

(126738,)

(28624, 337)

(28624,)

In [15]:
import numpy as np

In [16]:
X_trainset_ups_num = X_trainset_ups.select_dtypes(np.number)
X_trainset_ups_cat = X_trainset_ups.select_dtypes(object) 
### Why Should not create a test_set? test_set = X_test y_test
## Because we are not going to feed the model with that, right?
# The model will be trained with an upbanlance and proccesed(encode and scaled). WHY? 

In [17]:
X_test_num = X_test.select_dtypes(np.number) 
X_test_cat = X_test.select_dtypes(object)

In [18]:
X_trainset_ups_num

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
33476,1,54.0,7,2,4,0,34,23,49,2,...,44,1,95,10,92,10,95,10,92,1
87878,28,28.0,6,9,0,3,36,39,18,7,...,70,1,96,3,96,3,96,3,96,3
4784,28,74.0,5,9,0,5,36,32,33,4,...,24,1,93,12,96,3,96,3,87,1
85573,1,60.0,6,8,0,1,37,40,22,5,...,38,1,95,2,95,10,95,10,95,2
68933,0,80.0,4,9,12,0,45,17,63,5,...,18,1,95,6,95,6,95,6,95,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39226,0,75.0,6,6,21,6,41,46,27,14,...,23,1,96,2,96,2,96,2,96,2
23926,0,60.0,5,9,0,0,23,36,30,4,...,38,1,92,5,92,3,95,6,86,7
12881,2,62.0,1,9,0,0,37,31,48,8,...,36,1,87,5,92,4,96,2,86,8
62494,0,71.0,2,9,0,0,19,39,25,3,...,27,2,88,9,94,6,95,8,88,9


#### One Hot Encode 

In [19]:
#ENCODING Train
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_trainset_ups_cat)
encoded_categorical = encoder.transform(X_trainset_ups_cat).toarray()
X_trainset_ups_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out()) # needed to avoid error

In [20]:
#ENCODING TEST
encoded_categorical = encoder.transform(X_test_cat).toarray()
X_test_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out())

#### StandardScaler

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# SCALING Train
transformer_train = StandardScaler().fit(X_trainset_ups_num)
x_standardized = transformer_train.transform(X_trainset_ups_num)

X_trainset_ups_num_scaled = pd.DataFrame(x_standardized, columns=X_trainset_ups_num.columns)

In [23]:
# SCALING Test
# Why we don't use here another transfromer. transformer_test = StandardScaler().fit(X_test_num)?
x_standardized = transformer_train.transform(X_test_num)

X_test_num_scaled = pd.DataFrame(x_standardized, columns=X_test_num.columns)

In [24]:
X_train_ups_processed = pd.concat([X_trainset_ups_cat_encoded, X_trainset_ups_num_scaled], axis=1)
X_test_processed = pd.concat([X_test_cat_encoded, X_test_num_scaled], axis=1)

In [25]:
X_train_ups_processed.reset_index(drop=True, inplace=True)
X_test_processed.reset_index(drop=True, inplace=True)

In [26]:
y_trainset_ups.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [27]:
X_train_ups_processed.shape

(126738, 354)

In [28]:
X_test_processed.shape

(28624, 354)

### RANDOM FOREST UPBALANCE AND PROCESSED WITHOUT SELECTING FEATURES. 

In [29]:
# DecisionTree does not need be feed by scaled data. 

In [30]:
y_trainset_ups.shape

(126738,)

In [31]:
y_train.shape

(66788,)

In [32]:
y_test.shape

(28624,)

In [35]:
X_train_ups_processed.shape

(126738, 354)

In [34]:
X_test_processed.shape

(28624, 354)

In [38]:
## ValueError: Found input variables with inconsistent numbers of samples: [126738, 66788]
# 1124 check_consistent_length(X, y)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train_ups_processed, y_trainset_ups)
print('TrainSet = ',clf.score(X_train_ups_processed, y_trainset_ups))
print('TestSet = ',clf.score(X_test_processed, y_test))

TrainSet =  0.6244457068913822
TestSet =  0.6038638904415875


In [None]:
# If the score is close to each other, it is working pretty well. 

In [46]:
X_train_set_clean_num = X_train_set_clean.select_dtypes(np.number)

In [47]:
X_train_set_clean_cat = X_train_set_clean.select_dtypes(include=object)

In [None]:
# DO I NEED THE TEST SPLIT HERE AGAIN BEFORE SCALE AND ENCODE?

In [49]:
#SCALATION. 
transformer_2 = StandardScaler().fit(X_train_set_clean_num)
x_standardized = transformer_2.transform(X_train_set_clean_num)
X_train_set_clean_num_scaled = pd.DataFrame(x_standardized, columns= X_train_set_clean_num.columns)

In [52]:
X_train_set_clean_cat.reset_index(drop=True, inplace=True)

In [53]:
#ENCODED.
encoder2 = OneHotEncoder(drop='first').fit(X_train_set_clean_cat)
encoded_categorical = encoder2.transform(X_train_set_clean_cat).toarray()
X_train_set_clean_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out())

ValueError: Shape of passed values is (86786, 31), indices imply (86786, 24)

In [54]:
X_train_processed_final = pd.concat([X_train_set_clean_num_scaled, X_train_set_clean_cat_encoded], axis=1)

NameError: name 'X_train_set_clean_cat_encoded' is not defined

#### Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

In [None]:
# I am goint to use just numerical because in the lesson Jan applied just numericals. 
# Maybe because in that dataset has just numericals? 

In [55]:
from sklearn.feature_selection import VarianceThreshold
var_threshold = 0.02

# by default var_treshold is 0. But we want to remove the rows are constant and the ones are almost constant too.   
# In practise we would scale, the columns first, and then apply threshold, or apply different treshold for different columns.
sel = VarianceThreshold(threshold=var_threshold) 

#### two parameters: threshold and variance_threshold.
### threshold parameter is the value that will be used to determine whether or not a feature should be included in the model.          
## which means that any features with an absolute difference between their mean and standard deviation greater than 
# 0.02 (at least 2% of variation among all features) are not included in the model, they will be removed.

sel = sel.fit(X_train_set_num_scaled) 
### Reviewing the lesson. I nor sure if I should to insert just the train or whole numericals scales 
## with a variable that concat. X_train_num_scaled + X_test_num_scaled. 
# However after encode categorical they are al well numbers, so....I am confused.

## Next, we create another variable called temp using sel as its input function and 
# then transform our training data into a DataFrame object using pd-learn library functions like fit() and transform().

temp = sel.transform(X_train_set_num_scaled)
temp = pd.DataFrame(temp)

print(X_train_set_num_scaled.shape)
print(temp.shape)

NameError: name 'X_train_set_num_scaled' is not defined

In [None]:
pd.DataFrame(temp).head()

In [None]:
#sel.variances_ > var_threshold
#sel.get_support()
#var_list = list(sel.get_support())
#len(var_list)
##  The code would return a list of all the variances that are less than or equal to the threshold.
# I don't understand this section. 

In [None]:
#len(X_train_set_num_scaled.columns)

In [None]:
#len(list(zip(X_train_set_num_scaled.columns,var_list)))

In [None]:
#drop_list = [col[0] for col in zip(X_train_set_num_scaled.columns,var_list) if col[1] == False]
#len(drop_list)

##The code attempts to be used to find a list of columns with support that 
# are greater than the variable threshold.

In [None]:
# I WILL DROP MANUALLY THE SAME COLUMNS THAN IN THE ERIN NOTEBOOK. 

In [None]:
#WHICH COLULMNS ARE IN MY DATASET?
column_names = ['OSOURCE', 'SOLIH', 'VETERANS', 'ZIP', 'Unnamed: 0']

missing_columns = []
for column in column_names:
    if column not in Donors.columns:
        missing_columns.append(column)

if len(missing_columns) == 0:
    print("All columns are present in the dataset.")
else:
    print("The following columns are missing from the dataset:")
    print(missing_columns)

In [None]:
X_train_set_num2 = X_train_set_num.drop(drop_list, axis = 1)
X_train_processed2

In [None]:
X_test_processed2 = X_test_processed.drop(drop_list, axis = 1)
X_test_processed2.

In [None]:
col_to_drop = ['HVP1','HVP2','HVP3','HVP5','HVP6','HV2']

In [None]:
X_train = X_train_all.drop(['HVP1','HVP2','HVP3','HVP5','HVP6','HV2'], axis = 1)
X_train.shape
X_test = X_test_all.drop(['HVP1','HVP2','HVP3','HVP5','HVP6','HV2'], axis = 1)
X_test

## Looking at multicolinearity

In [None]:
Xy_train= pd.concat([X_train_processed2, y_train], axis=1)
Xy_train.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

corr_matrix = Xy_train.corr(method = 'pearson')
fig, ax = plt.subplots(figsize=(20,15))
ax = sns.heatmap(corr_matrix, annot = True)
plt.show()

In [None]:
X_train.columns

In [None]:
X_test.columns

#### Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print('TrainSet = ',clf.score(X_train, y_train))
print('TestSet = ',clf.score(X_test, y_test))

In [None]:
y_train.value_counts()

In [None]:
plot_confusion_matrix(clf, X_test, y_test,cmap=plt.cm.Blues)  
plt.show()

In [None]:
X_train_num_scaled = X_train_num_scaled.drop(drop_list, axis = 1)
X_train_num_scaled

In [None]:
X2 = pd.concat([X_train_processed,y_trainset_ups], axis=1)
X2.reset_index(drop=True, inplace=True)

In [None]:
X = Donors.drop(['TARGET_B','TARGET_D'], axis = 1) #boston features
y = Donors['TARGET_B'] #Labels 

In [None]:
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object) 

In [None]:
#ONE HOT ENCODED
encoded_categorical = encoder.transform(X_cat).toarray()
X_cat_encoded = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out()) 

In [None]:
#
x_standardized = transformer.transform(X_num)
X_num_scaled = pd.DataFrame(x_standardized, columns=X_num.columns)

In [None]:
X_trans = pd.concat([X_cat_encoded,X_num_scaled], axis=1)

In [None]:
print('TrainSet = ',clf.score(X_trans, y))

In [None]:
predictions = clf.predict(X_trans) # Here I am using the random forest. 
predictions

In [None]:
X_trans['Predictions'] = predictions

In [None]:
X_trans

In [None]:
# Run the random fetures #selecting feature by treshold #rerun random fores 

#### Discuss the output and its impact in the business scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?