In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Render and save figures at 128 dpi.
mpl.rcParams.update({'savefig.dpi': 128, 'figure.dpi': 128})

# Default plot size: 14" x 7"
mpl.rc('figure', figsize=(14, 7))
# Default font size: 14
mpl.rc('font', size=14)
# Hide the top and right frame lines
mpl.rc('axes.spines', top=False, right=False)
# No grid lines, white axes background
mpl.rc('axes', grid=False, facecolor='white')

In [2]:
# Load the raw enquiries CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("2011-2017.csv")

## Get features and target

In [3]:
from sklearn.preprocessing import LabelEncoder

# Name of the label column; every other column of df is treated as a feature.
target = 'enquiry status'
# df = df.iloc[:number_of_rows]

# Feature frame: all columns except the target.
X = df.loc[:, df.columns != target]

# Encode the target labels as integer class ids.
le = LabelEncoder()
y = le.fit_transform(df[target])

## Encoding Category data

In [4]:
# One-hot encode X with pandas' default get_dummies settings.
X = pd.get_dummies(X)

In [5]:
# Display the encoded feature frame as the cell output.
X

Unnamed: 0,post code,loan amount,year,month,day,hour,weekday,marketing code_1657,marketing code_1824,marketing code_2048,...,"loan reason_Refinance,Other Loan Reason","loan reason_Refinance,Refinance With Cash Out","loan reason_Refinance,Refinance With Cash Out,Loan Topup",property use_Investment,"property use_Investment,Other",property use_Other,property use_Residence,"property use_Residence,Investment","property use_Residence,Investment,Other","property use_Residence,Other"
0,2008,200000.0,2011,1,5,14,2,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3350,198000.0,2011,1,10,11,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2009,568000.0,2011,1,12,10,2,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,6107,240000.0,2011,1,12,16,2,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4160,380000.0,2011,1,14,16,4,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,5074,339000.0,2011,1,15,10,5,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,6011,1350000.0,2011,1,18,17,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,5032,534000.0,2011,1,20,10,3,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,3818,250000.0,2011,1,20,14,3,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,6240,50000.0,2011,1,21,15,4,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Handling Highly Correlated Features

In [6]:
# Absolute pairwise correlations between the numeric/bool columns of df.
# Selecting those columns explicitly keeps this working on pandas versions
# where DataFrame.corr() no longer drops non-numeric columns on its own.
corr_matrix = df.select_dtypes(include=[np.number, 'bool']).corr().abs()

# Keep only the strict upper triangle so each pair is considered once.
# The builtin bool is used because the np.bool alias no longer exists in recent NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose correlation with any earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [None]:
# Inspect the columns flagged as highly correlated (this cell only displays the list).
to_drop

[]

## Reducing features by maximizing class separability

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit an LDA on X / y, leaving the number of components at the estimator's default
# (n_components=None).
# NOTE(review): scikit-learn's fit() conventionally returns the estimator itself, so
# features_lda presumably holds the fitted model here, not transformed features — confirm.
lda = LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(X, y)

# Explained variance ratio of each fitted component
lda_var_ratios = lda.explained_variance_ratio_

# Create function
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count how many leading components are needed to reach ``goal_var``.

    Walks ``var_ratio`` in order, accumulating the explained variance, and
    stops at the first component where the running total is at least
    ``goal_var``.

    Args:
        var_ratio: Iterable of explained-variance ratios, one per component.
        goal_var: Cumulative explained variance to reach.

    Returns:
        The number of components consumed when the goal was reached, or the
        total number of components if it never was (0 for an empty input).
    """
    cumulative = 0.0
    count = 0
    for count, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        if cumulative >= goal_var:
            return count
    return count

# Number of LDA components needed for the cumulative explained variance to reach 0.95
select_n_components(lda_var_ratios, 0.95)

In [None]:
# Fit an LDA that keeps a single component and project the same feature
# frame it was fitted on (X) onto that component.
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(X, y).transform(X)

# Print the number of features before and after the projection
print("Original number of features:", X.shape[1])
print("Reduced number of features:", features_lda.shape[1])

## Recursively eliminating features

In [None]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# The selector is fitted on the notebook's own X / y. No synthetic regression
# data is generated here: it was never passed to RFECV, and unpacking it into
# `features, target` would overwrite the `target` column name that other cells
# use to split df into features and label.

# Create a linear regression
ols = linear_model.LinearRegression()

# Recursively eliminate features, scoring each subset by negative MSE
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(X, y)
rfecv.transform(X)

## Removing irrelevant features for classification

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Keep the two features with the highest chi-squared statistics:
# fit the selector first, then reduce X with it.
chi2_selector = SelectKBest(chi2, k=2)
features_kbest = chi2_selector.fit(X, y).transform(X)

# Report the feature count before and after selection
print("Original number of features:", X.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

In [None]:
# NOTE(review): neither MultiColumnLabelEncoder nor category_column is defined in the
# visible cells, so this presumably relies on definitions from elsewhere — confirm
# before running on a fresh kernel.
X = MultiColumnLabelEncoder(columns=category_column.columns).fit_transform(X)

In [None]:
from sklearn.covariance import EllipticEnvelope

# Create detector, assuming 10% of the rows are outliers
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit detector
outlier_detector.fit(X)

# Predict outliers: predict() labels inliers +1 and outliers -1
result = outlier_detector.predict(X)

# Print the positional index of every row flagged as an outlier.
# The comparison must be `== -1`; `< -1` can never match a +1/-1 label.
for i in np.flatnonzero(result == -1):
    print(i)

In [None]:
# Take the second column of X (positional index 1) as the feature to inspect
feature = X.iloc[:,1]

# Create a function to return index of outliers
def indicies_of_outliers(x):
    """Locate values lying more than 1.5 * IQR outside the quartiles.

    Args:
        x: Array-like of numbers supporting element-wise comparison
            (e.g. a NumPy array or pandas Series).

    Returns:
        The result of ``np.where`` on the outlier mask: a tuple of index
        arrays marking values above ``q3 + 1.5 * iqr`` or below
        ``q1 - 1.5 * iqr``.
    """
    first_quartile, third_quartile = np.percentile(x, [25, 75])
    whisker = (third_quartile - first_quartile) * 1.5
    too_low = x < first_quartile - whisker
    too_high = x > third_quartile + whisker
    return np.where(too_high | too_low)

# Show the positions of the IQR outliers in the selected feature
indicies_of_outliers(feature)

In [None]:
from sklearn.cluster import KMeans

# # Create DataFrame
# dataframe = pd.DataFrame(X, columns=["feature_1", "feature_2"])

# Make k-means clusterer with 3 clusters and a fixed seed
clusterer = KMeans(3, random_state=0)

# Cluster on the feature columns only. Excluding any existing "group" column
# keeps the cell idempotent: re-running it must not feed the previous cluster
# labels back in as a feature.
cluster_features = X.loc[:, X.columns != "group"]

# Fit clusterer
clusterer.fit(cluster_features)

# Store the predicted cluster id of every row alongside the features
X["group"] = clusterer.predict(cluster_features)

In [None]:
# Number of rows assigned to each k-means group
X["group"].value_counts()

In [None]:
# X[X["group"] == 1]

In [None]:
# sklearn.externals.joblib is gone from newer scikit-learn releases; prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
# NOTE(review): this reads model_columns.pkl, whereas the last cell of the notebook
# writes rf_model_columns.pkl — confirm this is the intended file.
model_columns = joblib.load('app/models/model_columns.pkl')

In [None]:
model_columns

In [None]:
# mce = MultiColumnLabelEncoder(columns = encoded_columns);
# df = mce.fit_transform(df)

# Rebuild the feature frame straight from df: every column except the target.
X = df.loc[:, df.columns != target]
# NOTE(review): y is the raw target column here, not the LabelEncoder output
# computed earlier in the notebook — confirm the downstream metrics expect that.
y = df[target]

In [None]:
# List the feature column names that will be fed to the models
print("Keys of enquiries_dataset: \n{}".format(X.keys()))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set, stratified on y, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# learning_curve is imported from sklearn.model_selection (the module this notebook
# already uses for train_test_split); the old sklearn.learning_curve module no
# longer exists in current scikit-learn.
from sklearn.model_selection import learning_curve
from sklearn.pipeline import make_pipeline

# Encode categoricals -> standardise -> project onto 2 principal components ->
# logistic regression. Despite the name, pipe_svm ends in LogisticRegression;
# the name is kept because other cells refer to it.
# NOTE(review): MultiColumnLabelEncoder and encoded_columns are not defined in the
# visible cells — confirm where they come from.
pipe_svm = make_pipeline(MultiColumnLabelEncoder(columns = encoded_columns),StandardScaler(),
                         PCA(n_components=2),LogisticRegression(random_state=1))


# Fit on the training split and predict both splits for later evaluation
pipe_svm.fit(X_train, y_train)
y_svm_pred = pipe_svm.predict(X_test)
y_svm_train_pred = pipe_svm.predict(X_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Standardise -> project onto 2 principal components -> 10-tree random forest.
# NOTE(review): unlike pipe_svm, this pipeline has no MultiColumnLabelEncoder step —
# confirm X_train is already fully numeric when it reaches StandardScaler.
pipe_rfc = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    RandomForestClassifier(n_estimators=10, random_state=1),
)

# Fit on the training split and predict both splits for later evaluation
pipe_rfc.fit(X_train, y_train)
y_rfc_train_pred = pipe_rfc.predict(X_train)
y_rfc_pred = pipe_rfc.predict(X_test)

In [None]:
# cross_val_score is imported from sklearn.model_selection (already used by this
# notebook for train_test_split); the old sklearn.cross_validation module no
# longer exists in current scikit-learn.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score


def _report_model(name, pipeline, train_pred, test_pred):
    """Print the evaluation summary for one fitted pipeline.

    Prints, in order: the model name, the mean 3-fold cross-validation score of
    ``pipeline`` on the notebook-level ``X`` / ``y``, the classification report
    of ``test_pred`` against ``y_test``, and the train/test R^2 values.

    Args:
        name: Heading printed before the metrics.
        pipeline: Estimator passed to ``cross_val_score``.
        train_pred: Predictions for ``X_train``.
        test_pred: Predictions for ``X_test``.

    Returns:
        The array of per-fold cross-validation scores.
    """
    print(name)
    fold_scores = cross_val_score(pipeline, X, y, cv=3)
    print("cross_validation: {:.2f}".format(np.mean(fold_scores, axis=0)))
    print(classification_report(y_test, test_pred))
    # NOTE(review): r2_score is a regression metric; on class labels it is hard to
    # interpret — consider reporting accuracy or F1 instead.
    print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, train_pred),r2_score(y_test, test_pred)))
    return fold_scores


scores = _report_model("LogisticRegression", pipe_svm, y_svm_train_pred, y_svm_pred)
scores = _report_model("RandomForest", pipe_rfc, y_rfc_train_pred, y_rfc_pred)

In [None]:
# sklearn.externals.joblib is gone from newer scikit-learn releases; prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted random-forest pipeline
joblib.dump(pipe_rfc, 'app/models/rfpipeline.pkl')

# Persist the feature column names, in order, next to the model
model_columns = list(X.columns)
joblib.dump(model_columns, 'app/models/rf_model_columns.pkl')