In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
numerical=pd.read_csv('numerical.csv')
categorical=pd.read_csv('categorical.csv')
target=pd.read_csv('target.csv')

In [3]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET_B  95412 non-null  int64  
 1   TARGET_D  95412 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [4]:
data=pd.concat([numerical, categorical, target], axis=1)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Columns: 339 entries, TCODE to TARGET_D
dtypes: float64(10), int64(322), object(7)
memory usage: 246.8+ MB


In [6]:
data['TARGET_B'].value_counts()

# imbalance in data

0    90569
1     4843
Name: TARGET_B, dtype: int64

# imbalance management

In [7]:
category_0=data[data['TARGET_B']==0].sample(len(data[data['TARGET_B']==1]))

In [8]:
category_1=data[data['TARGET_B']==1]

In [9]:
data=pd.concat([category_0, category_1], axis=0)

In [10]:
data.shape

(9686, 339)

In [11]:
data=data.reset_index(drop=True)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9686 entries, 0 to 9685
Columns: 339 entries, TCODE to TARGET_D
dtypes: float64(10), int64(322), object(7)
memory usage: 25.1+ MB


In [13]:
#set dependent variable
y=data['TARGET_B']
X=data.drop(['TARGET_B'],axis=1)

In [14]:
X_num=X.select_dtypes(np.number)
X_cat=X.select_dtypes(object)

In [15]:
# encoding

encoder = OneHotEncoder(drop='first').fit(X_cat)

In [16]:
X_cat_en=encoder.transform(X_cat).toarray() # lost categorical headers

In [17]:
X_cat_en_cols=encoder.get_feature_names_out(X_cat.columns)

In [18]:
X_cat_en_cols

array(['STATE_FL', 'STATE_GA', 'STATE_IL', 'STATE_IN', 'STATE_MI',
       'STATE_MO', 'STATE_NC', 'STATE_TX', 'STATE_WA', 'STATE_WI',
       'STATE_other', 'HOMEOWNR_U', 'GENDER_M', 'GENDER_other',
       'RFA_2A_E', 'RFA_2A_F', 'RFA_2A_G', 'GEOCODE2_B', 'GEOCODE2_C',
       'GEOCODE2_D', 'DOMAIN_A_R', 'DOMAIN_A_S', 'DOMAIN_A_T',
       'DOMAIN_A_U'], dtype=object)

In [21]:
X_cat_en_df=pd.DataFrame(X_cat_en, columns = X_cat_en_cols) # restore categorical headers

In [22]:
X_cat_en_df

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [23]:
X_full=pd.concat([X_num, X_cat_en_df], axis=1) # normaly we would lose numerical headers... but as we kept the categoricals, we keep the numericals

In [27]:
X_full.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0,61.611649,5,9,0,0,19,28,42,6,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0,50.0,3,9,0,0,23,18,33,13,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,46.0,5,9,0,0,33,23,63,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,28,61.611649,5,9,0,0,36,23,47,10,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,28,52.0,5,3,17,1,15,36,0,9,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test=train_test_split(X_full,y,test_size=0.3, random_state=40)

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
clf=RandomForestClassifier(n_estimators=200, max_depth=2, random_state=40)
clf.fit(X_train,y_train)
print(clf.score(X_test,y_test))

0.983826565726084


# with ensemble you should use more than one scoring metric

In [31]:
from sklearn.model_selection import cross_val_score

In [32]:
cross_val_scores=cross_val_score(clf, X_train, y_train)

In [33]:
cross_val_scores

array([0.97271386, 0.97640118, 0.97418879, 0.97713864, 0.9800885 ])

# feature selection with random forest



In [36]:
feature_importances=clf.feature_importances_

In [37]:
feature_names = X_full.columns

In [38]:
forest_importances=pd.Series(feature_importances, index=feature_names)

In [39]:
forest_importances

TCODE         0.000000
AGE           0.003974
INCOME        0.000000
WEALTH1       0.000000
HIT           0.000000
                ...   
GEOCODE2_D    0.000000
DOMAIN_A_R    0.000000
DOMAIN_A_S    0.000000
DOMAIN_A_T    0.000000
DOMAIN_A_U    0.000000
Length: 355, dtype: float64

In [41]:
with pd.option_context('display.max_rows', None):
    print (forest_importances.sort_values(ascending=False))

TARGET_D        0.195336
RFA_2F          0.065136
CARDGIFT        0.061522
NGIFTALL        0.052254
MINRAMNT        0.049803
AVGGIFT         0.044058
MAXRAMNT        0.043755
LASTGIFT        0.040794
RAMNTALL        0.025018
CARDPROM        0.019810
NUMPROM         0.018933
NUMPRM12        0.017792
HHN2            0.016191
RFA_2A_G        0.016131
MAXRDATE_YR     0.014536
CONTROLN        0.011950
FIRSTDATE_YR    0.010566
LASTDATE_YR     0.009760
RFA_2A_F        0.009741
HV1             0.009617
TIMELAG         0.009030
HV2             0.008992
ODATEW_YR       0.008265
LASTDATE_MM     0.008225
HVP6            0.007085
ETH1            0.006965
DMA             0.005895
HVP3            0.005605
ETHC5           0.005558
EC7             0.005149
POBC2           0.004995
CARDPM12        0.004702
WWIIVETS        0.004616
IC2             0.004539
HVP4            0.004474
AGE             0.003974
EC6             0.003792
ETHC4           0.003735
RFA_2A_E        0.003642
ETHC3           0.003587


In [42]:
Thiago = [7., 7., 7., 8., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 8.,
       7., 7., 7., 8., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 8., 7., 7., 7., 8., 8., 7., 8., 7., 7., 7., 7., 8., 7., 8., 7.,
       7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 8., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 8., 7., 7., 8., 7., 7., 8., 7., 8., 7., 7., 7., 7.,
       8., 7., 7., 7., 7., 7., 8., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 8., 8., 7., 7., 8., 8., 7., 7., 7., 8., 7., 7., 7., 8., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 8., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 8., 7., 7., 7., 8., 7., 7.,
       8., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 8., 7., 7., 8., 7., 8., 8., 7., 7., 7., 8., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 8., 7., 7., 7., 8., 7., 8., 8., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 8., 7., 7., 7.,
       8., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7.,
       7., 7., 7., 7., 7., 8., 8., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7.,
       7., 8., 7., 8., 8., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7.,
       8., 8., 8., 8., 7., 7., 7., 7., 8., 7., 8., 7., 7., 7., 8., 8., 8.,
       7., 7., 7., 7., 8., 7., 8., 7., 7., 7., 7., 7., 7., 8., 8., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7.,
       7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 8., 7.,
       7., 7., 7., 7., 7., 7., 8., 8., 8., 7., 7., 8., 7., 7., 7., 7., 7.,
       8., 8., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 8., 7., 7.,
       7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 8.,
       7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8.,
       8., 7., 8., 7., 8., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7.,
       8., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 8., 7.,
       7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7.,
       8., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 8., 8., 7., 7., 7., 8., 7., 7., 7., 7., 7.,
       8., 7., 7., 7., 8., 8., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 8., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 8., 7., 7., 7., 7., 7., 7., 7., 8., 8., 7., 7., 7., 7., 7., 8.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 8., 8., 8., 7., 7.,
       7., 8., 8., 7., 8., 7., 7., 7., 7., 8., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
       7., 7., 7., 7., 7., 7., 7., 7., 7., 6., 6., 6., 6., 6., 6., 6., 6.,
       6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
       6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
       6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
       6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.]

In [55]:
print(Thiago.count(8), Thiago.count(7), Thiago.count(6))

120 586 73


TypeError: unsupported operand type(s) for -: 'str' and 'int'