In [143]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt
from sklearn.metrics import mean_squared_error

In [144]:
# Read from csv.
train_data = pd.read_csv('./predict_student_loans/train_values.csv')
train_labels = pd.read_csv('./predict_student_loans/train_labels.csv')
test_data = pd.read_csv('./predict_student_loans/test_values.csv')

In [145]:
target = ['repayment_rate']

In [146]:
pd.options.display.max_rows = 4000

In [147]:
train_data.head().transpose()

Unnamed: 0,0,1,2,3,4
row_id,3,4,5,6,7
academics__program_assoc_agriculture,0,0,0,0,1
academics__program_assoc_architecture,0,0,0,0,0
academics__program_assoc_biological,0,0,0,0,0
academics__program_assoc_business_marketing,2,0,0,1,2
academics__program_assoc_communication,0,0,0,0,0
academics__program_assoc_communications_technology,0,0,0,0,1
academics__program_assoc_computer,0,0,0,1,2
academics__program_assoc_construction,0,0,0,0,0
academics__program_assoc_education,0,0,0,0,1


In [148]:
#train_data.report_year.unique()

### Determine data types

In [149]:
def get_type_lists(data_frame=train_data, rejects=['row_id']):

    """Creates lists of numeric and categorical variables.
    
    :param frame: The frame from which to determine types.
    :param rejects: Variable names not to be included in returned lists.
    :return: Tuple of lists for numeric and categorical variables in the frame.

    """
    nums, cats = [], []

    for col in data_frame.columns:
        if col not in rejects:
            type=data_frame[col].dtype
            if type in ('int64','int32','float64') :
                nums.append(col)
            elif str(type) in ('object')  :
                cats.append(col)
                
    print('Numeric =', nums)                
    print()
    print('Categorical =', cats)
    
    return nums, cats

In [150]:
original_nums, cats = get_type_lists()

Numeric = ['academics__program_assoc_agriculture', 'academics__program_assoc_architecture', 'academics__program_assoc_biological', 'academics__program_assoc_business_marketing', 'academics__program_assoc_communication', 'academics__program_assoc_communications_technology', 'academics__program_assoc_computer', 'academics__program_assoc_construction', 'academics__program_assoc_education', 'academics__program_assoc_engineering', 'academics__program_assoc_engineering_technology', 'academics__program_assoc_english', 'academics__program_assoc_ethnic_cultural_gender', 'academics__program_assoc_family_consumer_science', 'academics__program_assoc_health', 'academics__program_assoc_history', 'academics__program_assoc_humanities', 'academics__program_assoc_language', 'academics__program_assoc_legal', 'academics__program_assoc_library', 'academics__program_assoc_mathematics', 'academics__program_assoc_mechanic_repair_technology', 'academics__program_assoc_military', 'academics__program_assoc_multi

# Feature Engineering

In [151]:
# missing values with Zero
#test_data_x = test_data
for col in original_nums:
    train_data[col].fillna(0,inplace=True)
    test_data[col].fillna(0,inplace=True)

In [152]:
for col in cats:
    train_data[col].fillna('default',inplace=True)
    test_data[col].fillna('default',inplace=True)

In [153]:
train_data.shape, test_data.shape

((8705, 444), (6391, 444))

In [158]:
# Encoding categorical data in train set
for column in ['report_year', 'school__carnegie_basic', 'school__carnegie_size_setting', 'school__carnegie_undergrad', 'school__degrees_awarded_highest', 'school__degrees_awarded_predominant', 'school__institutional_characteristics_level', 'school__locale', 'school__main_campus', 'school__men_only', 'school__minority_serving_aanipi', 'school__minority_serving_annh', 'school__minority_serving_hispanic', 'school__minority_serving_historically_black', 'school__minority_serving_nant', 'school__minority_serving_predominantly_black', 'school__minority_serving_tribal', 'school__online_only', 'school__ownership', 'school__region_id', 'school__religious_affiliation', 'school__state', 'school__women_only']:
    dummies = pd.get_dummies(train_data[column],drop_first=True)
    train_data[dummies.columns] = dummies

In [159]:
# Encoding categorical data in test set
for column in ['report_year', 'school__carnegie_basic', 'school__carnegie_size_setting', 'school__carnegie_undergrad', 'school__degrees_awarded_highest', 'school__degrees_awarded_predominant', 'school__institutional_characteristics_level', 'school__locale', 'school__main_campus', 'school__men_only', 'school__minority_serving_aanipi', 'school__minority_serving_annh', 'school__minority_serving_hispanic', 'school__minority_serving_historically_black', 'school__minority_serving_nant', 'school__minority_serving_predominantly_black', 'school__minority_serving_tribal', 'school__online_only', 'school__ownership', 'school__region_id', 'school__religious_affiliation', 'school__state', 'school__women_only']:
    dummies = pd.get_dummies(test_data[column],drop_first=True)
    test_data[dummies.columns] = dummies

In [160]:
train_data.shape, test_data.shape

((8705, 650), (6391, 644))

In [161]:
#drop encoded categorical columns in train set
train_data=train_data.drop(['report_year', 'school__carnegie_basic', 'school__carnegie_size_setting', 'school__carnegie_undergrad', 'school__degrees_awarded_highest', 'school__degrees_awarded_predominant', 'school__institutional_characteristics_level', 'school__locale', 'school__main_campus', 'school__men_only', 'school__minority_serving_aanipi', 'school__minority_serving_annh', 'school__minority_serving_hispanic', 'school__minority_serving_historically_black', 'school__minority_serving_nant', 'school__minority_serving_predominantly_black', 'school__minority_serving_tribal', 'school__online_only', 'school__ownership', 'school__region_id', 'school__religious_affiliation', 'school__state', 'school__women_only'], axis=1)

In [162]:
#drop encoded categorical columns in test set
test_data=test_data.drop(['report_year', 'school__carnegie_basic', 'school__carnegie_size_setting', 'school__carnegie_undergrad', 'school__degrees_awarded_highest', 'school__degrees_awarded_predominant', 'school__institutional_characteristics_level', 'school__locale', 'school__main_campus', 'school__men_only', 'school__minority_serving_aanipi', 'school__minority_serving_annh', 'school__minority_serving_hispanic', 'school__minority_serving_historically_black', 'school__minority_serving_nant', 'school__minority_serving_predominantly_black', 'school__minority_serving_tribal', 'school__online_only', 'school__ownership', 'school__region_id', 'school__religious_affiliation', 'school__state', 'school__women_only'], axis=1)

In [163]:
train_data.shape, test_data.shape

((8705, 627), (6391, 621))

In [164]:
train_data = train_data.merge(train_labels, on='row_id', how='inner')

In [166]:
target = ['repayment_rate']
train_labels = train_labels[target]

In [168]:
# after using onehotencoder the features in train and test do not match because of different categories in both sets
# source: https://stackoverflow.com/questions/34170413/possible-ways-to-do-one-hot-encoding-in-scikit-learn
# get the columns in train that are not in test
col_to_add = np.setdiff1d(train_data.columns, test_data.columns)

# add these columns to test, setting them equal to zero
for c in col_to_add:
    test_data[c] = 0

# select and reorder the test columns using the train columns
test_data = test_data[train_data.columns]

In [169]:
#missing columns added
col_to_add

array(['Four-year, medium full-time, selective, lower transfer-in', 'das',
       'dyk', 'hdy', 'jbi', 'nre', 'qzo', 'repayment_rate', 'rgp', 'rvp',
       'spy', 'xds', 'zug'], dtype=object)

In [170]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [171]:
train_data.shape, test_data.shape

((8705, 628), (6391, 628))

In [172]:
X_train=train_data
X_test=test_data

In [173]:
X_train.columns.values.tolist()

['row_id',
 'academics__program_assoc_agriculture',
 'academics__program_assoc_architecture',
 'academics__program_assoc_biological',
 'academics__program_assoc_business_marketing',
 'academics__program_assoc_communication',
 'academics__program_assoc_communications_technology',
 'academics__program_assoc_computer',
 'academics__program_assoc_construction',
 'academics__program_assoc_education',
 'academics__program_assoc_engineering',
 'academics__program_assoc_engineering_technology',
 'academics__program_assoc_english',
 'academics__program_assoc_ethnic_cultural_gender',
 'academics__program_assoc_family_consumer_science',
 'academics__program_assoc_health',
 'academics__program_assoc_history',
 'academics__program_assoc_humanities',
 'academics__program_assoc_language',
 'academics__program_assoc_legal',
 'academics__program_assoc_library',
 'academics__program_assoc_mathematics',
 'academics__program_assoc_mechanic_repair_technology',
 'academics__program_assoc_military',
 'academ

In [174]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0)
sel.fit(X_train)  # fit finds the features with zero variance

VarianceThreshold(threshold=0)

In [175]:
# get_support is a boolean vector that indicates which features are retained
# if we sum over get_support, we get the number of features that are not constant
sum(sel.get_support())

628

In [176]:
# another way of finding non-constant features:
len(X_train.columns[sel.get_support()])

628

In [177]:
# print the constant features
print(
    len([
        x for x in X_train.columns
        if x not in X_train.columns[sel.get_support()]
    ]))

[x for x in X_train.columns if x not in X_train.columns[sel.get_support()]]

0


[]

In [178]:
X_train.columns.values.tolist()

['row_id',
 'academics__program_assoc_agriculture',
 'academics__program_assoc_architecture',
 'academics__program_assoc_biological',
 'academics__program_assoc_business_marketing',
 'academics__program_assoc_communication',
 'academics__program_assoc_communications_technology',
 'academics__program_assoc_computer',
 'academics__program_assoc_construction',
 'academics__program_assoc_education',
 'academics__program_assoc_engineering',
 'academics__program_assoc_engineering_technology',
 'academics__program_assoc_english',
 'academics__program_assoc_ethnic_cultural_gender',
 'academics__program_assoc_family_consumer_science',
 'academics__program_assoc_health',
 'academics__program_assoc_history',
 'academics__program_assoc_humanities',
 'academics__program_assoc_language',
 'academics__program_assoc_legal',
 'academics__program_assoc_library',
 'academics__program_assoc_mathematics',
 'academics__program_assoc_mechanic_repair_technology',
 'academics__program_assoc_military',
 'academ

In [179]:
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

  if np.issubdtype(mask.dtype, np.int):


((8705, 628), (6391, 628))

In [180]:
# re-assemble the dataframe
X_train = pd.DataFrame(data=X_train)
X_test = pd.DataFrame(data=X_test)

## Removing quasi-constant features
### Using variance threshold from sklearn

Variance threshold from sklearn is a simple baseline approach to feature selection. It removes all features which variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

Here, I will change the default threshold to remove almost / quasi-constant features.

In [182]:
sel = VarianceThreshold(threshold=0.01)  # 0.1 indicates 99% of observations approximately

sel.fit(X_train)  # fit finds the features with low variance

VarianceThreshold(threshold=0.01)

In [183]:
# get_support is a boolean vector that indicates which features 
# are retained. If we sum over get_support, we get the number
# of features that are not quasi-constant
sum(sel.get_support())

431

In [184]:
# finally we can print the quasi-constant features
print(
    len([
        x for x in X_train.columns
        if x not in X_train.columns[sel.get_support()]
    ]))

[x for x in X_train.columns if x not in X_train.columns[sel.get_support()]]

197


[23,
 46,
 61,
 67,
 71,
 78,
 96,
 99,
 116,
 127,
 134,
 137,
 153,
 154,
 155,
 157,
 158,
 159,
 161,
 162,
 164,
 165,
 166,
 168,
 169,
 170,
 171,
 172,
 173,
 175,
 176,
 177,
 179,
 180,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 191,
 192,
 193,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 206,
 208,
 209,
 210,
 211,
 213,
 214,
 215,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 321,
 322,
 375,
 376,
 379,
 380,
 381,
 382,
 384,
 399,
 400,
 402,
 403,
 406,
 408,
 409,
 410,
 411,
 413,
 414,
 416,
 417,
 425,
 426,
 427,
 428,
 429,
 434,
 435,
 436,
 442,
 443,
 444,
 445,
 446,
 448,
 449,
 450,
 452,
 453,
 454,
 456,
 467,
 471,
 474,
 482,
 483,
 500,
 502,
 505,
 507,
 510,
 517,
 522,
 523,
 524,
 525,
 526,
 527,
 528,
 529,
 530,
 531,
 532,
 533,
 534,
 535,
 536,
 537,
 538,
 539,
 540,
 541,
 542,
 544,
 545,
 546,
 547,
 548,
 549,
 550,
 552,
 553,
 554,
 555,
 556,
 557,
 558,
 559,
 560,
 561,
 562,
 563,
 564,


In [185]:
# remove the features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

  if np.issubdtype(mask.dtype, np.int):


((8705, 431), (6391, 431))

## Removing duplicate features

In [186]:
# re-assemble the dataframe
X_train = pd.DataFrame(data=X_train)
X_test = pd.DataFrame(data=X_test)

In [187]:
type(X_train)

pandas.core.frame.DataFrame

In [188]:
# transpose the dataframe, so that the columns are the rows of the new dataframe
data_t = X_train.T
data_t.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704
0,3.0,4.0,5.0,6.0,7.0,8.0,11.0,12.0,13.0,15.0,...,15083.0,15084.0,15085.0,15086.0,15088.0,15089.0,15091.0,15093.0,15094.0,15095.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,2.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0


In [189]:
# check if there are duplicated rows (the columns of the original dataframe)
# this is a computionally expensive operation, so it might take a while
# sum indicates how many rows are duplicated

data_t.duplicated().sum()

4

In [190]:
# visualise the duplicated rows (the columns of the original dataframe)
data_t[data_t.duplicated()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704
212,10604.0,152.0,235.0,153.0,11355.0,1716.0,3323.0,47411.0,6387.0,406.0,...,584.0,34065.0,1820.0,57.0,71.0,690.0,2167.0,4158.0,368.0,1903.0
292,18610.767905,20467.472813,18326.021726,9369.671537,18655.059417,12579.423606,19982.453351,25085.193592,24373.211259,12198.823309,...,17186.04986,34837.485752,31537.188222,10693.911128,14154.246713,16091.738195,47509.879577,17742.19208,25676.411985,27446.3894
309,11099.0,173.0,359.0,198.0,22631.0,2078.0,5459.0,31442.0,6235.0,635.0,...,1070.0,29657.0,1433.0,59.0,84.0,805.0,1472.0,5132.0,299.0,3569.0
324,11099.0,173.0,359.0,198.0,22631.0,2078.0,5459.0,31442.0,6235.0,635.0,...,1070.0,29657.0,1433.0,59.0,84.0,805.0,1472.0,5132.0,299.0,3569.0


In [191]:
# we can capture the duplicated features, by capturing the
# index values of the transposed dataframe like this:
duplicated_features = data_t[data_t.duplicated()].index.values
duplicated_features

array([212, 292, 309, 324])

In [192]:
# alternatively, we can remove the duplicated rows,
# transpose the dataframe back to the variables as columns
# keep first indicates that we keep the first of a set of
# duplicated variables

data_unique = data_t.drop_duplicates(keep='first').T
X_train=data_unique
X_train.shape

(8705, 427)

In [193]:
X_test.drop(X_test.columns[duplicated_features],axis=1,inplace=True)
X_test.shape

(6391, 427)

In [194]:
# to find those columns in the original dataframe that were removed:

duplicated_features = [col for col in X_train.columns if col not in data_unique.columns]
duplicated_features 

[]

In [195]:
# find and remove correlated features
# to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  110


In [196]:
# re-assemble the dataframe
X_train = pd.DataFrame(data=X_train)
X_test = pd.DataFrame(data=X_test)

X_train.shape, X_test.shape

((8705, 427), (6391, 427))

In [197]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((8705, 317), (6391, 317))

In [198]:
X_train.columns.values.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 49,
 50,
 51,
 53,
 55,
 56,
 58,
 59,
 60,
 61,
 65,
 66,
 67,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 158,
 167,
 171,
 180,
 181,
 182,
 184,
 185,
 189,
 199,
 201,
 215,
 216,
 217,
 218,
 220,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 244

In [199]:
# step backward greedy selection algorithm

#sfs1 = SFS(RandomForestRegressor(n_jobs=4), 
#           k_features=200, 
#           forward=False, 
#           floating=False, 
#           verbose=2,
#           scoring='r2',
#           cv=3)

#sfs1 = sfs1.fit(np.array(X_train), y_train.values.ravel())

In [200]:
#sfs1.k_feature_idx_

In [201]:
#X_train.columns[list(sfs1.k_feature_idx_)]

### Split the Dataset into Training and Test Datasets

In [202]:
from sklearn.model_selection import train_test_split
X_train, X_traintest, y_train, y_test = train_test_split(X_train, train_labels, test_size=0.3, random_state=456)

In [203]:
# Feature Scaling
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)
#sc_y = StandardScaler()
#y_train = sc_y.fit_transform(y_train)

In [204]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression(normalize=True)
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [205]:
y_pred = regressor.predict(X_traintest)

In [206]:
print(regressor.score(X_traintest,y_test))

0.8246751671135095


In [207]:
test_prediction = regressor.predict(X_test)

In [208]:
test_prediction

array([[44.31018109],
       [45.38797907],
       [32.67038799],
       ...,
       [37.95381325],
       [63.00279539],
       [28.94882813]])

In [209]:
print('LinearRegression train mse: {}'.format(mean_squared_error(y_test, y_pred)))

LinearRegression train mse: 77.81448743231384


In [210]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

In [211]:
xgb_model = xgb.XGBRegressor()

#eval_set = [(X_test[training_vars], y_test)]
xgb_model.fit(X_train, y_train, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [212]:
print(xgb_model.score(X_traintest,y_test))

0.8605276464919013


In [213]:
pred = xgb_model.predict(X_traintest)
print('xgb train mse: {}'.format(mean_squared_error(y_test, pred)))

xgb train mse: 61.90206783905878


In [214]:
test_prediction = xgb_model.predict(X_test)
#print('xgb test mse: {}'.format(mean_squared_error(y_test, pred)))

In [215]:
test_prediction

array([45.124306, 47.91629 , 33.356853, ..., 37.434345, 61.834267,
       25.052172], dtype=float32)

In [216]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [217]:
print(rf_model.score(X_traintest,y_test))

0.8538374251791658


In [218]:
pred = rf_model.predict(X_traintest)
print('rf train mse: {}'.format(mean_squared_error(y_test, pred)))

rf train mse: 64.87139131530758


In [219]:
pred = rf_model.predict(X_test)

In [220]:
pred

array([48.41492616, 48.28351698, 33.24210618, ..., 35.8343924 ,
       72.16162643, 19.95148062])

In [163]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
print(RMSE)

8.821252033147779


In [164]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
accuracies.mean()

0.8246889166644744

In [165]:
accuracies.std()

0.009227013669294955

In [172]:
test_prediction

array([[44.31018109],
       [45.38797907],
       [32.67038799],
       ...,
       [37.95381325],
       [63.00279539],
       [28.94882813]])

#### Function to generate submission file

In [177]:
import re
import time

def gen_submission():

    """ Generates submission file for contest.
    
    :param model: Model with which to score test data.
    :param test: Test data.
    
    """
    
    # create time stamp
    time_stamp = re.sub('[: ]', '_', time.asctime())

    # create predictions column
    #sub = test['row_id'].cbind(model.predict(test).exp())
    #sub.columns = ['row_id', 'repayment_rate']
    
    # save file for submission
    sub_fname = 'submission_' + time_stamp + '.csv'

    # Create a submission file
    #submission = pd.DataFrame({"row_id":test_data["row_id"],"repayment_rate":test_prediction})
    
    #submission = pd.DataFrame({"repayment_rate":test_prediction,index=test_data["row_id"]})
    # Save submission to CSV
    #submission.to_csv(sub_fname, index=False)        # Do not save index values
    
    i=0
    file=open(sub_fname,'w')
    header="row_id,repayment_rate"
    header=header+'\n'
    file.write(header)
    for id in (X_test[0]):
        #predval=test_prediction[i].item()
        str="{},{}".format(id,test_prediction[i].item())
        str=str+'\n'
        #print(predval)
        file.write(str)
        i+=1




In [178]:
gen_submission()