In [54]:
# Set up the libraries used to load the credit-risk CSV dataset
# (CreditRisk_Clean_1-3_NewFeatures.csv) into a dataframe and to build
# and evaluate models on it.

# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Allows plots to appear directly in the notebook.
%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [55]:
# Read the cleaned credit-risk CSV dataset into a dataframe.
# The preview below shows a RiskPerformance column (values 'Good' / 'Bad')
# alongside numeric trade and delinquency features.
# NOTE(review): RiskPerformance looks like the target feature - confirm.


# Read csv file into a dataframe.
df = pd.read_csv('CreditRisk_Clean_1-3_NewFeatures.csv')
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,PercentRiskPerformance,IncorrectRiskPerformancePercentage,IncorrectTotalTrades,IncorrectRiskPerformanceNum,Avrg60And90Days
0,Bad,61,246,20,95,18,2,2,86,11,...,0,5,1,2,88,3.333333,Correct,Correct,Correct,2.0
1,Good,72,156,2,53,16,2,0,89,50,...,50,2,2,1,44,1.298701,Incorrect,Correct,Correct,1.0
2,Bad,73,132,9,47,4,0,0,100,0,...,0,1,1,1,100,0.230415,Incorrect,Correct,Incorrect,0.0
3,Good,62,21,12,17,4,0,0,100,0,...,83,1,2,1,75,0.230415,Correct,Correct,Correct,0.0
4,Good,85,320,17,104,18,0,0,94,46,...,75,1,2,0,27,0.359712,Incorrect,Correct,Correct,0.0
5,Good,89,150,19,69,12,0,0,100,0,...,55,1,2,0,43,0.230415,Correct,Correct,Correct,0.0
6,Good,86,337,1,102,21,0,0,100,0,...,100,2,4,0,50,0.230415,Correct,Correct,Correct,0.0
7,Good,67,160,4,49,27,0,0,97,48,...,76,5,2,0,70,0.359712,Incorrect,Correct,Correct,0.0
8,Bad,70,0,11,36,7,0,0,88,20,...,100,1,2,0,75,0.359712,Correct,Correct,Correct,0.0
9,Bad,74,172,0,82,14,0,0,81,5,...,0,2,1,0,43,0.359712,Correct,Correct,Correct,0.0


In [56]:
# Count the rows so the training and testing dataset sizes can be worked out as percentages.
df.shape[0]

937

# (1.1) Randomly shuffle the rows of the dataset and split it into two datasets: 70% training and 30% test. Keep the test set aside.

In [57]:
# Randomly shuffle the rows of the dataframe so that the selection of the
# training and test data is not biased by the order of the rows in the CSV.
#
# A fixed seed makes the shuffle (and therefore the split that follows)
# reproducible across kernel restarts. Sorting by index first puts the rows
# back in the order of the default integer index assigned by read_csv, so
# re-running this cell gives the same shuffle instead of reshuffling an
# already shuffled frame.
SHUFFLE_SEED = 42

# sample(frac=1) returns every row in random order and keeps the original
# index labels, so each row can still be traced back to its CSV position.
df = df.sort_index().sample(frac=1, random_state=SHUFFLE_SEED)
df.head(10)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,PercentRiskPerformance,IncorrectRiskPerformancePercentage,IncorrectTotalTrades,IncorrectRiskPerformanceNum,Avrg60And90Days
647,Bad,64,175,20,115,13,2,0,56,35,...,74,9,1,6,91,1.298701,Correct,Correct,Correct,1.0
435,Good,73,138,15,83,19,1,1,95,17,...,0,2,1,0,43,3.333333,Incorrect,Correct,Correct,1.0
89,Good,79,171,34,95,6,0,0,100,0,...,0,0,1,0,100,0.230415,Correct,Correct,Correct,0.0
660,Good,86,241,4,79,25,0,0,100,0,...,0,2,0,0,22,0.230415,Correct,Correct,Correct,0.0
851,Good,73,187,11,77,24,0,0,100,0,...,87,4,3,2,100,0.230415,Correct,Correct,Correct,0.0
276,Bad,61,210,4,64,28,0,0,93,9,...,95,10,2,4,86,0.359712,Correct,Correct,Incorrect,0.0
599,Bad,65,148,1,65,16,0,0,100,0,...,69,7,5,6,92,0.230415,Incorrect,Correct,Correct,0.0
710,Bad,54,187,3,75,30,0,0,87,0,...,98,4,1,2,55,0.359712,Correct,Correct,Incorrect,0.0
116,Good,84,339,3,115,27,0,0,100,0,...,0,2,1,0,23,0.230415,Correct,Correct,Correct,0.0
781,Good,89,200,1,69,15,0,0,100,0,...,0,2,1,0,75,0.230415,Correct,Correct,Correct,0.0


In [58]:
# Number of rows that make up 70% of the dataset, rounded to a whole row.
seventy_percent = round(df.shape[0] / 100 * 70)

# The training data is the first 70% of the (shuffled) rows, selected by
# position; preview its first 10 rows.
df_train = df.iloc[:seventy_percent]
df_train.head(10)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,PercentRiskPerformance,IncorrectRiskPerformancePercentage,IncorrectTotalTrades,IncorrectRiskPerformanceNum,Avrg60And90Days
647,Bad,64,175,20,115,13,2,0,56,35,...,74,9,1,6,91,1.298701,Correct,Correct,Correct,1.0
435,Good,73,138,15,83,19,1,1,95,17,...,0,2,1,0,43,3.333333,Incorrect,Correct,Correct,1.0
89,Good,79,171,34,95,6,0,0,100,0,...,0,0,1,0,100,0.230415,Correct,Correct,Correct,0.0
660,Good,86,241,4,79,25,0,0,100,0,...,0,2,0,0,22,0.230415,Correct,Correct,Correct,0.0
851,Good,73,187,11,77,24,0,0,100,0,...,87,4,3,2,100,0.230415,Correct,Correct,Correct,0.0


In [80]:
# Double check that the training dataframe holds exactly 70 percent of the rows.
# Uses df_train, the name the training dataframe is created under; the
# previous name (df_training) is never defined in this notebook and raises a
# NameError on a fresh kernel.
print(len(df_train.index) == seventy_percent)
print(len(df_train.index))

True
656


In [87]:
# Everything after the first 70% of the rows becomes df_test: the remaining
# 30% of the dataframe, kept for testing the model.

df_test = df.iloc[seventy_percent:]
print(df_test.shape[0])
df_test.head(10)

281


Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,PercentRiskPerformance,IncorrectRiskPerformancePercentage,IncorrectTotalTrades,IncorrectRiskPerformanceNum,Avrg60And90Days
494,Good,84,397,71,199,8,0,0,88,65,...,0,0,0,0,50,0.359712,Incorrect,Correct,Correct,0.0
766,Good,77,368,29,130,16,1,1,71,29,...,31,4,1,0,50,2.777778,Incorrect,Correct,Correct,1.0
149,Bad,74,158,6,73,14,0,0,93,9,...,28,2,3,0,83,0.359712,Correct,Correct,Incorrect,0.0
686,Bad,58,375,72,137,7,4,3,30,2,...,0,2,0,1,50,1.351351,Correct,Correct,Correct,3.5
209,Good,83,193,9,94,24,0,0,100,0,...,47,2,5,0,80,0.230415,Correct,Correct,Correct,0.0
884,Good,86,193,8,73,25,0,0,100,0,...,0,3,1,0,50,0.230415,Correct,Correct,Correct,0.0
808,Bad,58,54,3,25,26,0,0,100,0,...,97,8,6,5,74,0.230415,Incorrect,Correct,Incorrect,0.0
603,Bad,65,196,1,54,40,0,0,100,0,...,71,12,3,2,63,0.230415,Incorrect,Correct,Correct,0.0
839,Good,76,230,6,76,22,0,0,82,38,...,0,3,1,1,33,0.359712,Incorrect,Correct,Correct,0.0
251,Good,79,196,3,50,13,0,0,100,0,...,0,2,1,2,60,0.230415,Correct,Correct,Correct,0.0


In [86]:
# Ensure that the training and test dataframes together account for every row
# of the dataframe (937 rows in total: 656 rows for the 70% training dataframe
# and 281 rows for the 30% test dataframe).
#
# The expected total is read from the dataframe itself rather than built by
# adding a separately rounded 30% to seventy_percent: two independent
# roundings are not guaranteed to add up to the real number of rows.

total_rows = len(df.index)
split_rows = len(df_test) + len(df_train)

print(total_rows)
print(split_rows)

# Fail loudly if the split ever loses or duplicates rows.
assert split_rows == total_rows, "training and test rows do not add up to the full dataframe"

937
937
