# <font size='10' color='#1ABC9C'> Feature Selection and Model Training </font>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
## When using the 'inline' backend, 
## your matplotlib graphs will be included in your notebook, next to the code.
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import statsmodels.formula.api as smf
import statsmodels.api as sm
import itertools
import time

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Read the dataset from CSV and report its (rows, columns) shape.
dataset = pd.read_csv(filepath_or_buffer='dataset_cleaned_assumption_OK.csv')
print(dataset.shape)

(3922, 14)


In [3]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,yrsempl,income,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue,2,3,4,5
0,0.60206,1.230449,-0.43002,0.112096,0,4,1,22,0.0,0.959041,0,0,0,0
1,1.176091,1.079181,-0.42533,-0.406218,0,3,1,28,0.0,0.770852,0,0,0,0
2,0.845098,0.954243,-0.696869,-0.102906,0,5,0,21,0.0,0.763428,0,0,0,0
3,1.544068,1.20412,-0.502938,-0.120386,0,4,0,11,0.0,0.763428,0,0,0,0
4,0.90309,1.568202,-0.844087,-0.385303,0,3,0,11,0.0,1.344392,0,0,0,0


In [4]:
# `income` is the regression target; every remaining column is a candidate predictor.
X = dataset.drop(columns='income').copy()
y = dataset.loc[:, 'income'].copy()

In [5]:
 # Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.33,
     random_state=42,
 )

## Selecting the best Subset for model training 
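Algorithm 1: best subset selection — fit a linear model on every possible combination of features and compare the results.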

#### Reference = https://xavierbourretsicotte.github.io/subset_selection.html

In [6]:
def fit_linear_reg(X, Y, X_test, Y_test=None):
    """Fit a linear regression on (X, Y) and score it on held-out data.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Its column labels also select which columns of
        ``X_test`` are used for scoring.
    Y : array-like
        Training target.
    X_test : pandas.DataFrame
        Held-out features. Must contain every column of ``X``; any other
        columns are ignored.
    Y_test : array-like, optional
        Held-out target aligned with ``X_test``. When omitted, the
        notebook-level ``y_test`` is used, which is what this function
        always read before this parameter existed.

    Returns
    -------
    tuple
        ``(RSS, R_squared)``, both computed on the held-out data (not on the
        training data).
    """
    if Y_test is None:
        # Backward-compatible fallback to the notebook global.
        Y_test = y_test

    model_k = linear_model.LinearRegression(fit_intercept=True)
    model_k.fit(X, Y)

    # Score on exactly the columns the model was trained on, in the same order.
    # Selecting with a list already returns a new frame, so no extra copy is needed.
    X_eval = X_test[list(X.columns)]

    # Predict once and reuse the predictions for both metrics.
    predictions = model_k.predict(X_eval)
    # Mean squared error times the number of rows gives the residual sum of squares.
    RSS = mean_squared_error(Y_test, predictions) * len(Y_test)
    R_squared = r2_score(Y_test, predictions)
    return RSS, R_squared

from tqdm import tnrange, tqdm_notebook

# Exhaustive best-subset search: fit one linear model for every non-empty
# combination of the training columns and record its held-out RSS and R^2.
from tqdm.auto import trange  # replaces the deprecated tqdm.tnrange

#Initialization variables
Y = y_train.copy()
X = X_train.copy()
RSS_list, R_squared_list, feature_list = [], [], []
numb_features = []

# NOTE(review): every candidate is scored on X_test / y_test, so choosing the
# best subset from these scores reuses the test split for model selection.
# Confirm this is intended, or score on a separate validation split.

#Looping over every subset size k = 1 .. number of columns in X
for k in trange(1, len(X.columns) + 1, desc='Loop...'):

    #Looping over all possible combinations: (number of columns) choose k
    for combo in itertools.combinations(X.columns, k):
        # X_test is passed as-is: fit_linear_reg selects the columns it needs
        # into a new frame, so a defensive copy per combination is unnecessary.
        tmp_result = fit_linear_reg(X[list(combo)], Y, X_test)   #Store temp result
        RSS_list.append(tmp_result[0])                  #Append lists
        R_squared_list.append(tmp_result[1])
        feature_list.append(combo)
        numb_features.append(len(combo))

#Store in DataFrame (one row per evaluated feature combination)
df = pd.DataFrame({'numb_features': numb_features,'RSS': RSS_list, 'R_squared':R_squared_list,'features':feature_list})


  for k in tnrange(1,len(X.columns) + 1, desc = 'Loop...'):


Loop...:   0%|          | 0/13 [00:00<?, ?it/s]

In [21]:
# Best score at each subset size, broadcast back onto every row of that size.
# The string names select pandas' own groupby reductions; passing the Python
# builtins `min` / `max` is deprecated in recent pandas releases.
df['min_RSS'] = df.groupby('numb_features')['RSS'].transform('min')
df['max_R_squared'] = df.groupby('numb_features')['R_squared'].transform('max')

# Print every feature subset whose held-out R^2 exceeds the cut-off.
R_SQUARED_CUTOFF = 0.9253
above_cutoff = df.loc[df['R_squared'] > R_SQUARED_CUTOFF, ['R_squared', 'features']]
for i, r_squared, features in above_cutoff.itertuples(index=True, name=None):
    print(i , " >>> " , r_squared, " - ", features)

4808  >>>  0.9253164830722064  -  ('yrsempl', 'othdebt', 'carvalue', '2', '3', '4', '5')
6482  >>>  0.9253333505953888  -  ('yrsempl', 'othdebt', 'cars', 'carvalue', '2', '3', '4', '5')


### With the feature subsets above, the model reaches a test-set R² above 0.92, i.e. it explains more than 90% of the variance in `income`

# We will proceed to model testing before finalizing our final model

In [23]:
# Persist each split to its own CSV file, leaving the index out.
for csv_name, split in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    split.to_csv(csv_name, index=False)