# Modelling Functions

In [1]:
from IPython.core.display import display, HTML

# Make the notebook use the full page width by injecting a CSS rule that
# forces the ".container" element to width 100%.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# This method tunes a model with a grid search and reports how the best estimator performs.
# Parameters 
# model (Model) : The model we want to use (handed to GridSearchCV).
# parameters (List) : List of parameters for the model (the grid handed to GridSearchCV).
# X_train_ (Array) : Training predictors used by the grid search and the cross validation.
# y_train_ (Array) : Training targets used by the grid search and the cross validation.
# X_valid_ (Array) : Validation predictors the best estimator predicts on.
# y_valid_ (Array) : Validation targets the predictions are compared with for the RMSE.
# score_type (String) : Score type used by both GridSearchCV and cross_val_score.
# verbose_ (Boolean) : Passed to GridSearchCV as its verbose setting.
# cv_ (Integer) : Number of folds for the final cross_val_score call.
# NOTE(review): cv_ is not passed to GridSearchCV, so the search itself runs
# with that class's default folds - confirm this is intended.
# Returns : The fitted grid search, the best estimator, the best grid search score,
# the RMSE on the validation data and the mean of the cross validation scores.
def model_results(model, parameters, X_train_, y_train_, X_valid_, y_valid_, score_type='r2', verbose_=False, cv_=10):
    grid = GridSearchCV(model, parameters, verbose=verbose_, scoring=score_type, n_jobs=-1)

    # With the default refit=True the search already retrains the best
    # estimator on all of X_train_ / y_train_, so no second fit is needed.
    grid.fit(X_train_, y_train_)
    best_model = grid.best_estimator_

    prediction = best_model.predict(X_valid_)

    RMSE = np.sqrt(mean_squared_error(y_valid_, prediction))

    validation_score = cross_val_score(best_model, X_train_, y_train_, cv=cv_, scoring=score_type)
    # Computed once and shared by the printed summary and the return value.
    mean_validation_score = np.mean(validation_score)

    print("Best Model: " + str(best_model) +
          "\n Best Score: " + str(grid.best_score_) +
          "\n RMSE Score: " + str(RMSE) +
          "\n Cross Validation Score: " + str(mean_validation_score)
         )

    return grid, best_model, grid.best_score_, RMSE, mean_validation_score

In [3]:
# This method builds a residuals plot: the visualizer is fitted on the train data,
# scored on the validation data and rendered with outpath "residual_plot".
# Parameters 
# model (Model) : The model we want to use (wrapped by ResidualsPlot).
# X_train_ (Array) : Train predictors the visualizer is fitted on.
# y_train_ (Array) : Train targets the visualizer is fitted on.
# X_valid_ (Array) : Validation predictors the visualizer is scored on.
# y_valid_ (Array) : Validation targets the visualizer is scored on.
# train_color_ (String) : The color for the train values.
# test_color_ (String) : The color for the test values.
# line_color_ (String) : The color for the line.
# Returns : Whatever the visualizer's poof call returns.
def plot_residuals(model, X_train_, y_train_, X_valid_, y_valid_, train_color_='#9999ff', test_color_='r', line_color_='#000000'):
    residuals_viz = ResidualsPlot(
        model,
        train_color=train_color_,
        test_color=test_color_,
        line_color=line_color_,
    )

    # Fit on the train split first, then score on the validation split.
    residuals_viz.fit(X_train_, y_train_)
    residuals_viz.score(X_valid_, y_valid_)

    return residuals_viz.poof(outpath="residual_plot")

In [4]:
# This method calculates the cumulative sum of the explained variance ratio of a PCA.
# Parameters
# df (Dataframe) : The data frame the PCA is fitted on.
# Returns : The cumulative sum of the PCA's explained variance ratio.
def caluclate_cumsum(df):
    # One fit is enough: only the explained variance ratio is needed, not the
    # projected data, so the PCA is no longer fitted a second time.
    pca = PCA().fit(df)

    return np.cumsum(pca.explained_variance_ratio_)

In [5]:
# This method runs a PCA on the data and scatter plots the first two projected columns.
# Parameters
# df (Dataframe) : The data frame we want to use.
# y (Array) : The target variable. Only its length is used, to pick that many
#             evenly spaced rainbow colors for the points.
# Returns : The result of plt.show().
def plot_pca(df, y):
    projected = PCA().fit_transform(df)

    # One color per entry of y, spread evenly over the rainbow colormap.
    color_positions = np.linspace(0, 1, len(y))
    point_colors = cm.rainbow(color_positions)

    first_component = projected[:, 0]
    second_component = projected[:, 1]
    plt.scatter(first_component, second_component, c=point_colors)

    return plt.show()

In [6]:
# This method plots the cumulative explained variance ratio of the PCA
# against the number of components, on a logarithmic x axis.
# Parameters
# df (Dataframe) : The data frame we want to use.
# Returns : The result of plt.show().
def plot_pca_variance(df):
    pca_dp = PCA().fit(df)

    cumulative_variance = np.cumsum(pca_dp.explained_variance_ratio_)

    # Count the components from 1. A logarithmic axis cannot show x=0, so
    # plotting against the default 0-based index would drop the first
    # component and label every other point one component too low.
    component_counts = np.arange(1, len(cumulative_variance) + 1)

    plt.semilogx(component_counts, cumulative_variance)

    plt.xlabel('Number of Components')

    plt.ylabel('Variance retained')

    return plt.show()

In [7]:
# Fits the model and prints the confusion matrix of its predictions on the same data.
# Parameters
# model (Model) : The model to fit.
# X (Dataframe) : The predictors, used both for fitting and for predicting.
# y (Array) : The prediction column.
# Returns : Nothing. The model type and the confusion matrix are printed.
def metrics_confusion(model, X, y):
    model.fit(X, y)
    fitted_labels = model.predict(X)

    model_type = type(model)
    print("Model: " + str(model_type))

    matrix = metrics.confusion_matrix(y, fitted_labels)
    print(matrix)

    # Blank separator after the matrix.
    print("\n")

In [8]:
# Fits the model on the train data and prints the classification report
# of its predictions on the valid data.
# Parameters
# model (Model) : The model type.
# X_train (Array) : X train data.
# y_train (Array) : y train data.
# X_valid (Array) : X valid data.
# y_valid (Array) : y valid data.
# Returns : Nothing. The report is printed, followed by two newlines.
def classic_report(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)

    valid_predictions = model.predict(X_valid)

    print(classification_report(y_valid, valid_predictions) + "\n\n")