In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
from time import time

In [2]:
#Read in data
# Start a wall-clock timer; the last cell prints the time elapsed since this point.
t0 = time()
# Training features and training labels, read from CSV files in the working directory.
data = pd.read_csv('train_values_prepped.csv')
labels = pd.read_csv('train_labels_prepped.csv')


In [3]:
# Names of the columns handled as numerical features; only loan_amount is listed.
numerical = ['loan_amount']
# Sub-frame of the training data restricted to those columns.
num = data[numerical]


In [4]:
#Impute missing values for loan_amount
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22. Its replacement,
# sklearn.impute.SimpleImputer, accepts the same arguments used below; fall back
# to the old class only on releases that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
# Replace NaN entries with the most frequent value of each column.
imp = Imputer(missing_values = np.nan, strategy = 'most_frequent')
imp = imp.fit(num)
num_imputed = imp.transform(num)
# transform() returns a bare array, so put the column names back.
num_imputed = pd.DataFrame(data=num_imputed, columns=numerical)
num = num_imputed


In [5]:
#Scales the data. Makes individual scalers for each numerical column
from sklearn.preprocessing import MinMaxScaler
# One fitted scaler per column, keyed by column name, so the same scaling
# can be looked up and re-applied to other data later on.
scalers = {}
for col in num.columns:
    # The scaler is fed a 2-D (n_samples, 1) array built from the column.
    temp = np.array(num[col]).reshape(-1, 1)
    scaler = MinMaxScaler()
    scaler.fit(temp)
    scalers[col] = scaler
    num_scaled = scaler.transform(temp)
    num[col] = num_scaled

# From here on num_scaled names the whole scaled frame, not a single column.
num_scaled = num
for summary in (num_scaled.mean(axis=0), num_scaled.std(axis=0)):
    print(summary)

loan_amount    0.345986
dtype: float64
loan_amount    0.212585
dtype: float64


In [6]:
#Encodes the categorical features puts them into cat_features

# Temporarily overwrite row 0 so that property_type contains the value 3 while
# the encoders are fitted (presumably so that a column is produced for that
# category -- TODO confirm). The value is written back after the encoding loop.
# The write goes through .loc: the chained form data['property_type'][0] = ...
# may assign into a temporary copy and raises SettingWithCopyWarning.
data.loc[0, 'property_type'] = 3


# Categorical columns to one-hot encode; loan_type is encoded separately as the
# first block of cat_features.
cat = ['property_type','loan_purpose','occupancy','preapproval','applicant_ethnicity','applicant_race','applicant_sex',
      'co_applicant']

def encode_string(cat_feature, categories=None):
    """One-hot encode a 1-D categorical feature.

    Parameters
    ----------
    cat_feature : array-like (list, numpy array or pandas Series)
        The raw category value of every row.
    categories : array-like, optional
        The full set of category values that defines the output columns.
        When omitted (the default) the columns are the sorted unique values
        found in ``cat_feature`` itself. Pass the categories seen at training
        time when encoding another data set, so that both matrices share the
        same column layout.

    Returns
    -------
    numpy.ndarray of float, shape (n_rows, n_categories)
        One column per category in sorted order. Each row holds a single 1.0
        in the column of its category. When ``categories`` is given, rows
        whose value is not among them are left as all zeros.
    """
    values = np.asarray(cat_feature).ravel()
    if categories is None:
        # np.unique returns the sorted classes together with each row's class
        # index, which is all that is needed to build the indicator matrix.
        classes, codes = np.unique(values, return_inverse=True)
        codes = np.asarray(codes).ravel()
        rows = np.arange(values.shape[0])
    else:
        classes = np.unique(np.asarray(categories).ravel())
        if classes.shape[0] == 0:
            return np.zeros((values.shape[0], 0))
        # searchsorted gives the insertion point; clip it so it is always a
        # valid index, then keep only the rows whose value really matches.
        codes = np.clip(np.searchsorted(classes, values), 0, classes.shape[0] - 1)
        rows = np.flatnonzero(classes[codes] == values)
        codes = codes[rows]
    encoded = np.zeros((values.shape[0], classes.shape[0]))
    encoded[rows, codes] = 1.0
    return encoded
    
# Encode loan_type first, then every column listed in cat, and join all the
# one-hot blocks side by side with a single concatenate at the end (instead of
# re-copying the growing matrix on every iteration).
encoded_blocks = [encode_string(data['loan_type'])]
for col in cat:
    temp = encode_string(data[col])
    encoded_blocks.append(temp)
    print(str(col) + " " + str(temp.shape))
cat_features = np.concatenate(encoded_blocks, axis = 1)

# Write row 0 of property_type back after the temporary override made before
# encoding. .loc is used because the chained form data['property_type'][0] = ...
# may assign into a temporary copy and raises SettingWithCopyWarning.
data.loc[0, 'property_type'] = 1

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


property_type (191426, 3)
loan_purpose (191426, 3)
occupancy (191426, 3)
preapproval (191426, 3)
applicant_ethnicity (191426, 4)
applicant_race (191426, 7)
applicant_sex (191426, 4)
co_applicant (191426, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
#Concats numerical scaled features and cat_features. Then uses kbest to select the best 15
from sklearn.feature_selection import SelectKBest, chi2

# Full design matrix: one-hot categorical columns first, scaled numerical columns last.
all_features = np.concatenate([cat_features, num_scaled], axis = 1)
y = labels['rate_spread']

# Score every column against the target with chi2 and keep the 15 best-scoring ones.
bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(all_features, y)

# One row per column of all_features (positional index) holding its chi2 score.
dfscores = pd.DataFrame(fit.scores_)
featureScores = dfscores.rename(columns={0: 'Score'}).round(3)
print(featureScores)
print(featureScores.nlargest(15,'Score'))  #print the 15 best features
Features_reduced = fit.transform(all_features)
print(Features_reduced.shape)

        Score
0   25023.661
1   19447.141
2     239.789
3     860.994
4    9267.669
5   48731.214
6     279.382
7    1772.746
8   12772.889
9    2102.274
10     27.762
11    404.318
12     86.377
13   1264.106
14   6718.921
15   2547.010
16   1045.627
17    534.360
18   8261.287
19     61.903
20    302.082
21    114.095
22    404.256
23     32.172
24    144.379
25   1398.455
26     62.109
27     64.268
28    103.877
29    801.604
30    112.995
31    422.173
32    697.318
33   4500.966
        Score
5   48731.214
0   25023.661
1   19447.141
8   12772.889
4    9267.669
18   8261.287
14   6718.921
33   4500.966
15   2547.010
9    2102.274
7    1772.746
25   1398.455
13   1264.106
16   1045.627
3     860.994
(191426, 15)


In [8]:
#Reduces features using the SelectKBest algorithm to 15
# Applies the already-fitted selector again; this recomputes the same
# Features_reduced that the feature-selection cell assigned.
Features_reduced = fit.transform(all_features)
print(Features_reduced.shape)

(191426, 15)


In [9]:
#Splits data into train and test data
import numpy.random as nr

# Seed numpy's global generator before the split is drawn.
nr.seed(9988)
labels = y
# Split the row positions rather than the data: indx[0] holds the training
# rows and indx[1] the 40000 rows held out for testing.
indx = ms.train_test_split(range(Features_reduced.shape[0]), test_size = 40000)
x_train, x_test = (Features_reduced[rows, :] for rows in indx)
y_train, y_test = (np.ravel(labels[rows]) for rows in indx)




In [None]:
#Fits the regression model on the training split. The active estimator is a
#support vector regressor (svm.SVR) with default settings; the commented-out
#lines are alternative estimators bound to the same lin_mod name.
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn import svm

# lin_mod = linear_model.LinearRegression(fit_intercept = False)
# lin_mod = tree.DecisionTreeRegressor(min_samples_split=500)
# lin_mod = GradientBoostingRegressor()
# lin_mod = linear_model.Ridge(alpha=.5)
# lin_mod = linear_model.BayesianRidge()
# x_train = x_train[:len(x_train)/10]
# y_train = y_train[:len(y_train)/10]
# lin_mod = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
lin_mod = svm.SVR()
lin_mod.fit(x_train, y_train)


  from numpy.core.umath_tests import inner1d


In [None]:
#Prints the r2 values
import math

def print_metrics(y_true, y_predicted, n_parameters):
    """Print regression error metrics together with R^2 and adjusted R^2.

    y_true       -- array of observed target values (its shape[0] is the sample count)
    y_predicted  -- array of predicted values, same length as y_true
    n_parameters -- number of model parameters used in the adjusted R^2 penalty
    """
    ## R^2 first, then shrink it by the parameter-count penalty
    r2 = sklm.r2_score(y_true, y_predicted)
    penalty = (n_parameters - 1)/(y_true.shape[0] - n_parameters)
    r2_adj = r2 - penalty * (1 - r2)

    ## Error metrics; the MSE is computed once and reused for the RMSE
    mse = sklm.mean_squared_error(y_true, y_predicted)
    print('Mean Square Error      = ' + str(mse))
    print('Root Mean Square Error = ' + str(math.sqrt(mse)))
    print('Mean Absolute Error    = ' + str(sklm.mean_absolute_error(y_true, y_predicted)))
    print('Median Absolute Error  = ' + str(sklm.median_absolute_error(y_true, y_predicted)))

    ## Goodness-of-fit values
    print('R^2                    = ' + str(r2))
    print('Adjusted R^2           = ' + str(r2_adj))
   
y_score = lin_mod.predict(x_test) 
# The adjusted R^2 penalty needs the number of parameters that matches the
# feature matrix actually used: one per column of x_test plus one for the
# intercept (the formula in print_metrics counts the intercept as a parameter).
# A hard-coded 28 does not correspond to the width of x_test.
print_metrics(y_test, y_score, x_test.shape[1] + 1)    

In [None]:
def hist_resids(y_test, y_score):
    """Plot the distribution of the residuals (y_test minus y_score).

    y_test  -- numpy array of observed target values
    y_score -- numpy array of predicted values, same length as y_test
    """
    ## both inputs become column vectors so the subtraction is element-wise
    actual = y_test.reshape(-1,1)
    predicted = y_score.reshape(-1,1)
    resids = np.subtract(actual, predicted)
    ## distplot draws on the current axes and hands them back for labelling
    ax = sns.distplot(resids)
    ax.set_title('Histogram of residuals')
    ax.set_xlabel('Residual value')
    ax.set_ylabel('count')
    
# Residual distribution for the held-out test rows and the model's predictions on them.
hist_resids(y_test, y_score)    

In [None]:
import scipy.stats as ss
def resid_qq(y_test, y_score):
    """Draw a normal Q-Q (probability) plot of the residuals.

    y_test  -- numpy array of observed target values
    y_score -- numpy array of predicted values, same length as y_test

    The residuals (y_test minus y_score) are compared with the quantiles of a
    normal distribution, which is probplot's default.
    """
    ## first compute vector of residuals. 
    resids = np.subtract(y_test.reshape(-1,1), y_score.reshape(-1,1))
    ## now make the Q-Q plot; probplot wants a flat 1-D sample
    ss.probplot(resids.flatten(), plot = plt)
    ## label the axes for what a probability plot shows: theoretical quantiles
    ## on x and the ordered sample values on y
    plt.title('Normal Q-Q plot of residuals')
    plt.xlabel('Theoretical quantiles')
    plt.ylabel('Ordered residuals')
    
# Probability plot of the residuals on the held-out test rows.
resid_qq(y_test, y_score)   

In [None]:
def resid_plot(y_test, y_score):
    """Scatter the residuals (y_test minus y_score) against the predicted values.

    y_test  -- array of observed target values
    y_score -- array of predicted values, same length as y_test
    """
    ## first compute vector of residuals, kept 1-D so x and y have the same shape
    predicted = np.ravel(y_score)
    resids = np.subtract(np.ravel(y_test), predicted)
    ## now make the residual plots; x and y are passed by keyword because
    ## recent seaborn releases no longer accept them positionally
    sns.regplot(x=predicted, y=resids, fit_reg=False)
    plt.title('Residuals vs. predicted values')
    plt.xlabel('Predicted values')
    plt.ylabel('Residual')

# Residuals against predictions for the held-out test rows.
resid_plot(y_test, y_score) 

In [None]:
# Exponentiate predictions and targets, then redraw the residual plot on that scale.
# NOTE(review): no log transform of rate_spread is visible in this notebook --
# confirm the prepped labels are log-scaled before relying on this plot.
y_score_untransform = np.exp(y_score)
y_test_untransform = np.exp(y_test)
resid_plot(y_test_untransform, y_score_untransform) 

In [None]:
def plot_scatter(data, col_x, col_y):
    """Scatter data[col_y] against data[col_x] with a reference diagonal.

    data  -- pandas DataFrame holding both columns
    col_x -- name of the column drawn on the x axis (the axis is labelled 'actual')
    col_y -- name of the column drawn on the y axis (the axis is labelled 'predicted')

    Both axes are fixed to the range 0..15 with equal aspect, and the line
    y = x is drawn over that range for comparison.
    """
    fig, ax = plt.subplots(figsize=(7,6))
    data.plot.scatter(x = col_x, y = col_y, ax = ax)
    ax.set_title('Scatter plot of predicted vs. actual')
    ax.set_ylabel('predicted')
    ax.set_xlabel('actual')

    # Reference diagonal and a square 0..15 viewport.
    ax.plot(range(15))
    ax.set_xlim(0, 15)
    ax.set_ylim(0, 15)
    ax.set_aspect('equal', adjustable='box')
    plt.draw()

    plt.show()

# Build the frame with exactly the two named columns the plot needs.
df = pd.DataFrame({'y_test': y_test, 'y_score': y_score})

# plot_scatter labels the x axis 'actual' and the y axis 'predicted', so the
# observed values must be passed as the x column and the predictions as the
# y column; the reverse order puts each series under the wrong axis label.
plot_scatter(df,'y_test','y_score')


In [None]:
#Gets Test values and scales the numerical columns
test = pd.read_csv('test_values.csv')
print(test.isna().sum())

# Work on an independent float copy of the numerical columns. Assigning through
# .loc into a bare test[numerical] selection raises SettingWithCopyWarning, and
# the scaled values are floats, so the columns need a float dtype to hold them.
test_num = test[numerical].astype('float64')
for col in test_num.columns:
    # Scale only the rows that have a value, using the scaler fitted for this
    # column on the training data; missing entries stay NaN here.
    null_values = test_num[col].isnull()
    test_num.loc[~null_values, [col]] = scalers[col].transform(test_num.loc[~null_values, [col]] )


In [None]:
#Creates the one-hot categorical features for the test set (no imputation is done in this cell)
# NOTE(review): encode_string is called on the test columns alone, so the column
# layout follows the values present in the test file -- confirm it matches the
# layout produced for the training data.
test_blocks = [encode_string(test['loan_type'])]
for col in cat:
    temp = encode_string(test[col])
    test_blocks.append(temp)
    print(str(col) + " " + str(temp.shape))
# loan_type block first, then one block per column of cat, side by side.
test_cat_features = np.concatenate(test_blocks, axis = 1)

    
# test_all_features = np.concatenate([test_cat_features,test_num_scaled],axis = 1)

In [None]:
#imputes for numerical variables
test_num_scaled = test_num
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22. Its replacement,
# sklearn.impute.SimpleImputer, accepts the same arguments used below; fall back
# to the old class only on releases that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
# NOTE(review): this imputer is fitted on the test data with strategy 'mean',
# while the training cell imputes with 'most_frequent' -- confirm the
# difference is intended.
imp = Imputer(missing_values = np.nan, strategy = 'mean')
imp = imp.fit(test_num_scaled)
# transform() returns a numpy array, which is what the later concatenate expects.
test_num_scaled = imp.transform(test_num_scaled)

In [None]:
# Full test design matrix: one-hot categorical columns first, scaled numerical
# columns last (the same order used for the training matrix). The printed shape
# lets the column count be checked by eye.
test_all_features = np.concatenate([test_cat_features,test_num_scaled],axis = 1)
print(test_all_features.shape)

In [None]:
# Rebuild the test design matrix and keep only the columns chosen by the
# selector fitted on the training data (fit is the fitted SelectKBest object).
# NOTE(review): this relies on test_all_features having the same columns, in the
# same order, as the training matrix -- confirm the encoded widths match.
test_all_features = np.concatenate([test_cat_features,test_num_scaled],axis = 1)
Test_Features_reduced = fit.transform(test_all_features)
print(Test_Features_reduced.shape)

In [None]:
#Makes predictions and outputs it into a csv file
predictions = lin_mod.predict(Test_Features_reduced) 

# Pair every test row_id with its predicted rate_spread and write the two
# columns to predictions.csv with a header row and no index column.
df = pd.DataFrame(predictions)
ids = pd.DataFrame(test['row_id']).assign(rate_spread=df)
ids.to_csv('predictions.csv', index = None, header=True)
# Seconds elapsed since t0 was taken in the data-loading cell.
print("training time:", round(time() - t0, 3))
