In [2]:
#importing packages 

#Packages related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')

#Packages related to data importing, manipulation, exploratory data analysis, data understanding
import numpy as np
import pandas as pd
import pandas_profiling
import scipy.stats as stats

#Packages related to data visualization
import seaborn as sns
import matplotlib.pyplot as plt

#Modules related to split the data & gridsearch
from sklearn.model_selection import train_test_split, GridSearchCV

#Module related to calculation of metrics
from sklearn import metrics

#Modules related to key techniques of supervised learning
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

#Modules for scaling the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
#importing data: one DataFrame per source workbook
#(file names are relative to the notebook's working directory)
Credit, Behavior, Demographics = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        'CreditConsumptionData.xlsx',
        'CustomerBehaviorData.xlsx',
        'CustomerDemographics.xlsx',
    )
)

In [4]:
#checking data information:
#Credit.info()
#Behavior.info()
#Demographics.info()

#Credit.describe()
#Behavior.describe()
#Demographics.describe()

In [5]:
#checking data types:
#Credit.dtypes
#Behavior.dtypes
#Demographics.dtypes

In [6]:
#checking null values:
#Credit.isna().sum()
#Behavior.isna().sum()
#Demographics.isna().sum() 

In [7]:
#merging data: demographics are attached to the behaviour table first,
#then the combined table is attached to the credit table; both merges key on 'ID'
#and use merge's default join type
behavior_with_demographics = Behavior.merge(Demographics, on='ID')
Customer = Credit.merge(behavior_with_demographics, on='ID')

In [8]:
#preview the first five rows of the merged table (head() defaults to 5 rows)
Customer.head()

Unnamed: 0,ID,cc_cons,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,dc_cons_jun,cc_count_apr,cc_count_may,...,emi_active,account_type,gender,age,Income,Emp_Tenure_Years,Tenure_with_Bank,region_code,NetBanking_Flag,Avg_days_between_transaction
0,12554,20014.0,24893.0,378.0,10288.0,29664.0,16291.4,11432.0,2.0,26,...,1674.09,current,M,35,MEDIUM,15.0,1,708.0,0,17.0
1,17645,10173.0,18941.62,966.0,20672.0,287.0,4217.0,885.0,1.0,7,...,13043.34,current,M,35,HIGH,15.0,6,123.0,0,14.0
2,7604,16095.0,5678.87,2724.0,1964.5,3933.11,23956.25,5168.0,43.0,32,...,25375.27,current,M,55,MEDIUM,24.5,1,802.0,1,3.0
3,1590,7707.0,30489.5,1236.0,12609.88,9138.14,17521.0,13650.3,53.0,1,...,3544.33,current,F,29,MEDIUM,4.5,6,867.0,1,4.0
4,16556,96408.0,7441.4,6906.04,4364.0,1939.0,2121.0,6829.18,67.0,47,...,12780.44,current,M,34,MEDIUM,12.6,4,802.0,1,7.0


In [9]:
#Customer.isna().sum()

In [10]:
#separating rows by whether the target variable (cc_cons) is present:
#  Customer_0 -> rows where cc_cons is available
#  Customer_1 -> rows where cc_cons is missing
target_is_missing = Customer['cc_cons'].isna()
Customer_0 = Customer[~target_is_missing]
Customer_1 = Customer[target_is_missing]

In [None]:
#keep only the object-dtype columns of the rows that have a target value
is_object_column = Customer_0.dtypes == object
encoding = Customer_0.loc[:, is_object_column]

#One Hot Encoding:(method-1) - pandas builds one indicator column per category
categorical_columns = ['loan_enq', 'account_type', 'gender', 'Income']
one_hot = pd.get_dummies(encoding, columns=categorical_columns)

#One Hot Encoding using the scikit-learn library: (method-2), kept for reference only
#from sklearn.preprocessing import OneHotEncoder
#Create an instance of One-hot-encoder
#enc=OneHotEncoder()
#enc_data=pd.DataFrame(enc.fit_transform(encoding[['loan_enq', 'account_type', 'gender', 'Income']]).toarray())

In [50]:
#Merge with main: append the indicator columns to the labelled rows, aligned on the row index
NewCustomer_0 = Customer_0.join(one_hot, how='left')

In [80]:
#column-wise cleaning: each helper receives one column (a Series) at a time.
#missing_treatment and outlier_capping are not defined in this part of the notebook.
#NOTE(review): the recorded output of this cell is
#  AttributeError: 'Series' object has no attribute 'quntile'
#which presumably comes from a misspelt `.quantile` call inside one of the helpers - confirm and fix there.
NewCustomer_0 = NewCustomer_0.apply(missing_treatment)
NewCustomer_0 = NewCustomer_0.apply(outlier_capping)

AttributeError: 'Series' object has no attribute 'quntile'

In [85]:
#columns whose Pearson correlation with the target cc_cons is above 0.001
#NOTE(review): the comparison is signed, so negatively correlated columns are never listed - confirm that is intended
correlation_with_target = NewCustomer_0.corrwith(
    NewCustomer_0['cc_cons'],
    axis=0,
    drop=False,
    method='pearson',
)
NewCustomer_0.columns[correlation_with_target > 0.001]

Index(['ID', 'cc_cons', 'cc_cons_apr', 'cc_cons_may', 'dc_cons_jun',
       'cc_count_may', 'dc_count_may', 'investment_1', 'investment_2',
       'investment_4', 'debit_amount_apr', 'credit_amount_apr',
       'credit_count_may', 'max_credit_amount_may', 'credit_count_jun',
       'max_credit_amount_jun', 'Tenure_with_Bank', 'NetBanking_Flag',
       'account_type_saving', 'gender_M', 'Income_HIGH'],
      dtype='object')

In [88]:
#display the target column
NewCustomer_0['cc_cons']

0         20014.0
1         10173.0
2         16095.0
3          7707.0
4         96408.0
           ...   
14995      4263.0
14996     11019.0
14997    404227.0
14998     28813.0
14999     16252.0
Name: cc_cons, Length: 15000, dtype: float64

In [92]:
#create a box plot of the target distribution
#cc_cons is passed on one axis only: supplying the same numeric series as both x and y
#makes seaborn treat one axis as a grouping variable instead of drawing a single box.
fig, ax = plt.subplots()
sns.boxplot(x=NewCustomer_0.cc_cons, ax=ax)
ax.set_title('Distribution of cc_cons')
plt.show()

<AxesSubplot:xlabel='cc_cons', ylabel='cc_cons'>

In [None]:
#There are three types of feature selection: 
#1.    Wrapper methods- (forward, backward, and stepwise selection), 
#2.    Filter methods- (ANOVA, Pearson correlation, variance thresholding), 
#3.    Embedded methods- (Lasso, Ridge, Decision Tree)

In [None]:
#Feature engineering tech
#Feature Selection
#less variance feature reduction:
#flag every column of `x` whose coefficient of variation (std / |mean|) is below 0.05,
#print the ratio per column, and build X_new without the flagged columns.
l = []
for col_name in x.columns:
    col_mean = x[col_name].mean()
    if col_mean == 0:
        #the ratio is undefined for a zero mean; it is treated as 0, so the column is flagged
        cv = 0
    else:
        #abs() keeps the ratio non-negative: with a signed mean, every column whose mean is
        #negative would fall under the threshold regardless of how spread out it is
        cv = x[col_name].std() / abs(col_mean)
    if cv < 0.05:
        l.append(col_name)
    print(col_name, cv)
#columns.difference returns the remaining column names in sorted order
X_new = x[x.columns.difference(l)]
l

In [None]:
#assigning x-y(Target): cc_cons is the target, every other column of `x` is a feature
y = x['cc_cons']
feature_names = x.columns.difference(['cc_cons'])
X = x[feature_names]
#Split the data into train and test data (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
#Finding the correlated features
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. A column is reported when the absolute correlation
    between it and at least one column positioned before it is strictly
    greater than ``threshold``; the earlier column of each pair is not
    reported on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Column names flagged as correlated.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for position, column_name in enumerate(corr_matrix.columns):
        # only the entries left of the diagonal, i.e. columns that come earlier
        against_earlier = corr_matrix.iloc[position, :position]
        if (against_earlier.abs() > threshold).any():
            flagged.add(column_name)
    return flagged

In [None]:
#training features whose absolute correlation with an earlier feature is above 0.7
corr_features = correlation(dataset=X_train, threshold=0.7)

In [None]:
#Chi-square
#Perform chi2 test
from sklearn.feature_selection import chi2
#chi2 rejects negative feature values, so the test only runs when X_train is entirely non-negative;
#otherwise it is skipped instead of raising.
#NOTE(review): chi2 is meant for categorical targets and cc_cons looks continuous - confirm this test is appropriate.
if (X_train < 0).any().any():
    f_p_values = None
    print('chi2 skipped: X_train contains negative values')
else:
    #tuple of (chi2 statistics, p-values), one entry per feature
    f_p_values = chi2(X_train, y_train)
f_p_values

In [None]:
#Mutual information gain
#Importing mutual information gain
from sklearn.feature_selection import mutual_info_regression
#determine the mutual information between every training feature and the target;
#random_state pins the estimator's internal randomness so the ranking is identical on every run
#(100 matches the seed used for the train/test split)
mutual_info = pd.Series(
    mutual_info_regression(X_train, y_train, random_state=100),
    index=X_train.columns,
)
#mutual_info.sort_values(ascending=False)
#Plotting the graph - mutual information per feature, highest first
fig, ax = plt.subplots(figsize=(20, 8))
mutual_info.sort_values(ascending=False).plot.bar(ax=ax)
ax.set_title('Mutual information with cc_cons')
ax.set_ylabel('mutual information')
plt.show()

In [None]:
#Selecting best N features
from sklearn.feature_selection import SelectKBest
#keep the 15 features with the highest mutual-information score
#(the variable name says "five", but k is 15)
sel_five_cols = SelectKBest(score_func=mutual_info_regression, k=15)
sel_five_cols.fit(X_train, y_train)
selected_mask = sel_five_cols.get_support()
X_train.columns[selected_mask]

In [None]:
#Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
#from sklearn.ensemble import RandomForestRegressor
#a linear-kernel SVR exposes coefficients, which RFE uses to rank the features
estimator = SVR(kernel="linear")
r = RFE(estimator, n_features_to_select=15, importance_getter='auto')
#fit on the training split only: `x` still contains the target column cc_cons
#(y is taken from x.cc_cons), so fitting on `x` would leak the target into the
#candidate features and would also use the held-out test rows
r = r.fit(X_train, y_train)
#list([r.support_])
#boolean mask over X_train.columns: True for the 15 retained features
r.support_

In [None]:
#Feature selection
#https://scikit-learn.org/stable/modules/feature_selection.html
#.1. Removing features with low variance
#.2. Univariate feature selection
#.3. Recursive feature elimination
#.4. Feature selection using SelectFromModel
#.5. Sequential Feature Selection
#.6. Feature selection as part of a pipeline

In [None]:
###Filter methods
#Each section scores the columns of X_train (most of them against y_train).
#The per-feature tables that share a 'col_name' column are combined into `Filter` at the end.

#--------------------------------------------------------------------------------

#1.Information Gain:
import sklearn.feature_selection as feature_selection
#Importing mutual information gain
#The higher the value, the more the target depends on that feature.
#mutual_info_regression (random_state fixed so the scores are reproducible between runs):
mutual_info=feature_selection.mutual_info_regression(X_train,y_train,n_neighbors=3,random_state=100)
mutual_info=pd.Series(mutual_info,index=X_train.columns)
#r_regression:
r_regression=feature_selection.r_regression(X_train,y_train)
r_regression=pd.Series(r_regression,index=X_train.columns)
#comparing [mutual_info-r_regression]: one row per feature, keyed by 'col_name'
df1=pd.DataFrame(mutual_info).reset_index()
df2=pd.DataFrame(r_regression).reset_index()
Information_Gain=pd.merge(
    df1.rename(columns={'index':'col_name',0:'mutual_info'}),
    df2.rename(columns={'index':'col_name',0:'r_regression'}),
    on='col_name',
)
#Information_Gain
#--------------------------------------------------------------------------------
#2.Chi-square:
#Perform chi2 test
from sklearn.feature_selection import chi2
#Calculating Fscore and p value
#f_p_values=chi2(X_train,y_train)
#f_p_values
#data is not non-negative, maybe chi2 is not a good method.


#--------------------------------------------------------------------------------

#3.Fisher's Score:
#!pip install skfeature-chappers
from skfeature.function.similarity_based import fisher_score
#fisher_score=fisher_score.fisher_score(X_train,y_train)

#--------------------------------------------------------------------------------

#4.Correlation Coefficient:
#correlation coefficients 3 method:
#i.pearsonr() [num-num]
pearson=X_train.corrwith(y_train,method='pearson').reset_index()
#ii.spearmanr() [cat(ord)-num]
spearman=X_train.corrwith(y_train,method='spearman').reset_index()
#iii.kendalltau() [cat(ord)-num(Conti)]
kendall=X_train.corrwith(y_train,method='kendall').reset_index()

#each frame has the columns ['index', 0]; give them meaningful names first,
#then merge on the feature name so no suffixed helper columns need dropping
correlations=(
    pearson.rename(columns={'index':'col_name',0:'pearson'})
    .merge(spearman.rename(columns={'index':'col_name',0:'spearman'}),on='col_name')
    .merge(kendall.rename(columns={'index':'col_name',0:'kendall'}),on='col_name')
)
#correlations

#--------------------------------------------------------------------------------

#5.Variance Threshold:
selector=feature_selection.VarianceThreshold()
transf=selector.fit_transform(X_train,y_train)
#label the output with the retained columns only: using all of X_train.columns
#fails with a shape mismatch as soon as the selector drops a column
Variance_Threshold=pd.DataFrame(transf,columns=X_train.columns[selector.get_support()]).T

#--------------------------------------------------------------------------------

#6.Mean Absolute Difference (MAD)-[X_train]
#explicit axis=0 reductions: np.sum on a DataFrame relies on an implicit axis
#whose behaviour is deprecated in recent pandas versions
Mean_Absolute_Difference=(X_train-X_train.mean(axis=0)).abs().sum(axis=0)/X_train.shape[0]
Mean_Absolute_Difference=Mean_Absolute_Difference.reset_index()
Mean_Absolute_Difference=Mean_Absolute_Difference.rename(columns={'index':'col_name',0:'MAD'})
#Mean_Absolute_Difference-[y_train]
Mean_Absolute_Difference_y=(y_train-y_train.mean()).abs().sum()/y_train.shape[0] #-[y_train]
#NOTE(review): the MAD table is computed but not merged into `Filter` below - confirm whether it should be.

#--------------------------------------------------------------------------------

#7.Dispersion ratio = [arithmetic_mean (AM) /geometric_mean (GM)]
XX=X_train+1 #to avoid a 0 denominator we add 1 here
AM=np.mean(XX,axis=0)
#geometric mean via exp(mean(log)): multiplying every row together overflows to inf
#on a frame with thousands of rows, which would force every ratio to 0.
#skipna=False makes a column with values below -1 (log undefined) come out as NaN
#instead of being averaged over its valid rows only.
GM=np.exp(np.log(XX).mean(axis=0,skipna=False))
Dispersion_ratio=AM/GM
Dispersion_ratio=Dispersion_ratio.reset_index()
Dispersion_ratio=Dispersion_ratio.rename(columns={'index':'col_name',0:'Dispersion_ratio'})
#Dispersion_ratio

#--------------------------------------------------------------------------------

#8.ANOVA
#stats.f_oneway(X_train,y_train)
#--------------------------------------------------------------------------------

#one row per feature: mutual_info, r_regression, pearson, spearman, kendall, Dispersion_ratio
Filter=pd.merge(pd.merge(Information_Gain,correlations,on='col_name'),Dispersion_ratio,on='col_name')
