# Model Performance Evaluation

In [1]:
# import packages
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import var
from numpy import std

In [2]:
# set seed
seed(1)

In [3]:
# generate data set of 100 numbers drawn randomly from a Gaussian distribution with mean 50 and st. dev. 5
data = 5 * randn(10000) + 50

In [4]:
# Calculate statistics
print('Mean: %.3f' % mean(data))
print('Variance: %.3f' % var(data))
print('Standard Deviation: %.3f' % std(data))

Mean: 50.049
Variance: 24.939
Standard Deviation: 4.994


# Correlations

In [6]:
# import packages
import pandas as pd
import numpy as np

# import Pima Indian Diabetes data set
data = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",header=None)

# display summary stats
data.describe()

# Replace 0 values with NaN
data[[1,2,3,4,5]] = data[[1,2,3,4,5]].replace(0, np.NaN)

# impute based on column mean
data.fillna(data.mean(), inplace=True)

In [7]:
# display head of Pima Indians Diabetes data set
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [8]:
# Create the matrix of correlations
scoreTable = data.corr(method='pearson')

In [10]:
# import pyplot
import matplotlib.pyplot as plt

# Visulaize the matrix
data.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.13,0.21,0.083,0.056,0.022,-0.034,0.54,0.22
1,0.13,1.0,0.22,0.19,0.42,0.23,0.14,0.27,0.49
2,0.21,0.22,1.0,0.19,0.073,0.28,-0.0028,0.32,0.17
3,0.083,0.19,0.19,1.0,0.16,0.54,0.1,0.13,0.22
4,0.056,0.42,0.073,0.16,1.0,0.17,0.099,0.14,0.21
5,0.022,0.23,0.28,0.54,0.17,1.0,0.15,0.026,0.31
6,-0.034,0.14,-0.0028,0.1,0.099,0.15,1.0,0.034,0.17
7,0.54,0.27,0.32,0.13,0.14,0.026,0.034,1.0,0.24
8,0.22,0.49,0.17,0.22,0.21,0.31,0.17,0.24,1.0


# Non-parametric Statistics

In [11]:
# import dependencies
from scipy.stats import mannwhitneyu
from numpy.random import rand
from numpy.random import seed

# set seed for number generation
seed(1)

In [12]:
# generate two independent samples
data1 = 50 + (rand(100) * 10)
data2 = 51 + (rand(100) * 10)

In [16]:
# compare samples
stat, p = mannwhitneyu(data1, data2)
print("Statistic = %.3f, p = %.3f" %(stat, p))

# interpret output
alpha = 0.05
if p > alpha:
    print("Same distribution (Fail to Reject H0)")
else:
    print("Different distribution (Reject H0)")

Statistic = 4077.000, p = 0.012
Different distribution (Reject H0)
