# Python samples for chapter 4 - classification and design matrix

# Estimator Example

In [None]:
import numpy as np

# Estimator example: take `ndraws` independent draws of `nsamples` values
# from a normal distribution (mean 100, std 20), rounded to whole numbers,
# then compare the pooled statistics with the statistics of the means.
np.random.seed(200)  # fixed seed so the printed numbers are reproducible
nsamples = 100
ndraws = 100

# One row per draw. Each draw is still generated by its own call so the
# random stream is consumed exactly as before.
separate_100 = np.empty((ndraws, nsamples))
for idx in range(ndraws):
    values = np.round(np.random.normal(100, 20, nsamples))
    separate_100[idx, :] = values

# Pooled 1-D sequence of every value, in draw order. Flattening the filled
# matrix once replaces the per-iteration np.concatenate, which re-copied the
# whole growing array on every pass (quadratic in the number of draws).
sequences_100 = separate_100.flatten()

# Mean and standard deviation over all pooled values.
print(np.mean(sequences_100))
print(np.std(sequences_100))

# axis=0 averages across the draws, giving one mean per sample position
# (`nsamples` means, each computed from `ndraws` values).
draw_means = np.mean(separate_100, axis=0)
print(np.mean(draw_means))
print(np.std(draw_means))

# Central Limit Theorem Illustration

In [None]:
import numpy as np
import seaborn as sns

# Central limit theorem set-up: 30 draws of 1000 rounded values each from a
# normal distribution with mean 100 and standard deviation 20.
np.random.seed(200)
nsamples = 1000
ndraws = 30

# Generate the draws one call at a time (same order as the random stream),
# then build both layouts from the collected rows.
draws = [np.round(np.random.normal(100, 20, nsamples)) for _ in range(ndraws)]
sequences_1000 = np.concatenate(draws)   # all values pooled into one 1-D array
separate_1000 = np.vstack(draws)         # one row per draw: (ndraws, nsamples)

# NOTE(review): distplot is deprecated in seaborn >= 0.11 and receives the
# 2-D matrix here rather than the pooled sequence -- confirm this is the
# intended plot before migrating to histplot/displot.
sns.distplot(separate_1000)


In [None]:
# Mean and standard deviation of the pooled values.
print(np.mean(sequences_1000))
print(np.std(sequences_1000))

# Mean and standard deviation of the per-position means; axis=0 averages
# across the draws (rows) of separate_1000.
column_means = np.mean(separate_1000, axis=0)
print(np.mean(column_means))
print(np.std(column_means))

99.96503333333334
19.90130675773718
99.96503333333332
3.6885734368301373


# Plot distribution of means

In [None]:
import numpy as np
import seaborn as sns
# plt.savefig is called below; without this import the cell fails with
# NameError because pyplot is not imported in this cell.
import matplotlib.pyplot as plt

# Same data as the central limit theorem cell: 30 draws of 1000 rounded
# values from a normal distribution with mean 100 and standard deviation 20.
np.random.seed(200)
nsamples = 1000
ndraws = 30
sequences_1000 = np.array([])

separate_1000 = np.empty((ndraws, nsamples))
for idx in range(ndraws):
    values = np.round(np.random.normal(100, 20, nsamples))
    sequences_1000 = np.concatenate((sequences_1000, values))
    separate_1000[idx, :] = values

# Plot the distribution of the means taken across the draws (axis=0 gives
# one mean per sample position) and save the current figure to disk.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 -- confirm the
# installed version before migrating to histplot/displot.
sns.distplot(np.mean(separate_1000, axis=0))
plt.savefig("clt_hist.png", dpi=300)

# Pooled statistics, followed by the statistics of the means.
print(np.mean(sequences_1000))
print(np.std(sequences_1000))
print(np.mean(np.mean(separate_1000, axis=0)))
print(np.std(np.mean(separate_1000, axis=0)))



# Pair plot for Iris species dataset

In [None]:
from sklearn import datasets
import seaborn as sns
import pandas as pd

# Load the iris dataset and put the feature matrix in a DataFrame (feature
# columns keep their default integer labels), with the class target added
# as column 'y'.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data).assign(y=iris.target)

# Scatter-plot matrix of every feature pair, coloured by the target class.
sns.pairplot(df, hue='y')

# Explore collinearity in dataset

In [None]:
import numpy as np
import seaborn as sns
from sklearn.metrics import pairwise_distances

# 10 x 10 matrix of uniform random numbers in [0, 1).
X = np.random.random(size=(10, 10))

# Cosine distance between every pair of rows of X, shown as a heatmap.
dist_mat = pairwise_distances(X, metric='cosine')
sns.heatmap(dist_mat)

# Logistic Regression Illustration

In [None]:
import sklearn.linear_model
# `preprocessing` was used below without ever being imported, which raised
# NameError on the StandardScaler line.
from sklearn import preprocessing

# NOTE(review): `X` and `y` are not assigned in this cell. `X` is last
# assigned in the collinearity cell; no assignment of `y` is visible before
# this point -- confirm where the labels are meant to come from.

# Standardise the features with a scaler fitted on X itself.
scaler = preprocessing.StandardScaler().fit(X)
X_train = scaler.transform(X)

estimator = sklearn.linear_model.LogisticRegression(
    fit_intercept=True, n_jobs=2)

# Fit on the scaled data and predict on that same data (training-set
# predictions, not a held-out evaluation).
classifier = estimator.fit(X_train, y)
y_hat = classifier.predict(X_train)

# Show the first ten (actual, predicted) label pairs.
for idx in range(10):
    print(y[idx], y_hat[idx])

# Sum of absolute differences between actual and predicted labels.
print(np.sum(np.abs(y - y_hat)))


# Decision Tree illustration

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Build the dataset: four Gaussian clusters of 100 two-dimensional points.
# The first cluster is labelled 1, the other three are labelled 0.
np.random.seed(10)

mean1 = np.array([35, 35])
cov1 = np.array(([10, 1], [1, 10]))
data1 = np.random.multivariate_normal(mean1, cov1, 100)
df1 = pd.DataFrame(data1, columns=['dim1', 'dim2']).assign(label=1)

# cov1 is rebound here and shared by the three label-0 clusters below.
cov1 = np.array(([20, 0.5], [0.5, 20]))

mean2 = np.array([25, 25])
data2 = np.random.multivariate_normal(mean2, cov1, 100)
df2 = pd.DataFrame(data2, columns=['dim1', 'dim2']).assign(label=0)

mean3 = np.array([25, 40])
data3 = np.random.multivariate_normal(mean3, cov1, 100)
df3 = pd.DataFrame(data3, columns=['dim1', 'dim2']).assign(label=0)

mean4 = np.array([40, 25])
data4 = np.random.multivariate_normal(mean4, cov1, 100)
df4 = pd.DataFrame(data4, columns=['dim1', 'dim2']).assign(label=0)

# Plot the combined dataset as a scatter plot coloured by label.
df = pd.concat([df1, df2, df3, df4])
# NOTE(review): lmplot is a figure-level function that builds its own
# figure, so this explicit figure looks unused -- confirm before removing.
plt.figure(figsize=(10, 10), dpi=300)
sns.set_style('darkgrid')
print("df shape",df.shape)
p = sns.lmplot(
    x='dim1', y='dim2', hue='label', data=df,
    fit_reg=False, scatter_kws={'alpha': 0.3},
)
p.axes[0, 0].set(xlim=(0, 100), ylim=(0, 100))
plt.savefig('decision-tree.png', dpi=300)


In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

np.random.seed(10)
cov1 = np.array(([10,1],[1,10]))
mean1 = np.array([35,35])
data1 = np.random.multivariate_normal(mean1, cov1,100)
df1 = pd.DataFrame(data1, columns=['dim1','dim2'])
df1['label'] = 1

mean2 = np.array([25,25])
cov1 = np.array(([20,0.5],[0.5,20]))
data2 = np.random.multivariate_normal(mean2, cov1,100)
df2 = pd.DataFrame(data2, columns=['dim1','dim2'])
df2['label'] = 0

mean3 = np.array([25,40])
data3 = np.random.multivariate_normal(mean3, cov1,100)
df3 = pd.DataFrame(data3, columns=['dim1','dim2'])
df3['label'] = 0

mean4 = np.array([40,25])
data4 = np.random.multivariate_normal(mean4, cov1,100)
df4 = pd.DataFrame(data4, columns=['dim1','dim2'])
df4['label'] = 0

df = pd.concat((df1,df2,df3,df4))
plt.figure(figsize=(10,10),dpi=300)
sns.set_style('darkgrid')
print("df shape",df.shape)
p = sns.lmplot(data=df, x='dim1', y='dim2', fit_reg=False, hue='label', scatter_kws={'alpha':0.3})
p.axes[0,0].set_xlim(0,100)
p.axes[0,0].set_ylim(0,100)
plt.vlines([30.334,36.58],ymin=5,ymax=65, alpha=0.3)
plt.hlines([31.857],xmin=5,xmax=65, alpha=0.3)
plt.savefig('decision-tree-lines.png',dpi=300)

In [None]:
from sklearn import tree

# Fit a decision tree limited to depth 2, using the first two columns of
# df as features and the 'label' column as the target.
features = df.iloc[:, :2]
target = df['label']
estimator = tree.DecisionTreeClassifier(max_depth=2)
classifier = estimator.fit(features, target)

In [None]:
import graphviz

# Export the fitted tree as DOT source text (out_file=None returns the
# string instead of writing a file), then render it to a PNG file named
# after "non-linear".
dot_data = tree.export_graphviz(classifier, out_file=None)
graph = graphviz.Source(dot_data, format='png')
graph.render("non-linear")

# Support Vector Machine Illustration

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as lines

# Two Gaussian clusters of 100 two-dimensional points: label 1 centred at
# (35, 35) and label 0 centred at (25, 25).
np.random.seed(10)

mean1 = np.array([35, 35])
cov1 = np.array(([10, 1], [1, 10]))
data1 = np.random.multivariate_normal(mean1, cov1, 100)
df1 = pd.DataFrame(data1, columns=['dim1', 'dim2']).assign(label=1)

mean2 = np.array([25, 25])
cov1 = np.array(([15, 0.5], [0.5, 15]))
data2 = np.random.multivariate_normal(mean2, cov1, 100)
df2 = pd.DataFrame(data2, columns=['dim1', 'dim2']).assign(label=0)

df = pd.concat([df1, df2])
# NOTE(review): lmplot is a figure-level function that builds its own
# figure, so this explicit figure looks unused -- confirm before removing.
plt.figure(figsize=(10, 10), dpi=300)
sns.set_style('darkgrid')
print("df shape",df.shape)
p = sns.lmplot(
    x='dim1', y='dim2', hue='label', data=df,
    fit_reg=False, scatter_kws={'alpha': 0.3},
)

# Draw three straight lines with fixed endpoints over the scatter plot
# (presumably candidate separating lines -- confirm against the text).
for xs, ys in (
    ([20, 40], [40, 20]),
    ([25, 35], [40, 20]),
    ([20, 40], [35, 25]),
):
    p.axes[0, 0].add_line(
        lines.Line2D(xs, ys, lw=1, color='black', alpha=0.5))
plt.savefig('svm-lines.png', dpi=300)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as lines
from sklearn import svm

# Two Gaussian clusters of 100 two-dimensional points with diagonal
# variance 5: label 1 centred at (35, 35), label 0 centred at (25, 25).
np.random.seed(10)

mean1 = np.array([35, 35])
cov1 = np.array(([5, 1], [1, 5]))
data1 = np.random.multivariate_normal(mean1, cov1, 100)
df1 = pd.DataFrame(data1, columns=['dim1', 'dim2']).assign(label=1)

mean2 = np.array([25, 25])
cov1 = np.array(([5, 0.5], [0.5, 5]))
data2 = np.random.multivariate_normal(mean2, cov1, 100)
df2 = pd.DataFrame(data2, columns=['dim1', 'dim2']).assign(label=0)

df = pd.concat([df1, df2])

# NOTE(review): lmplot is a figure-level function that builds its own
# figure, so this explicit figure looks unused -- confirm before removing.
plt.figure(figsize=(10, 10), dpi=300)
sns.set_style('darkgrid')
print("df shape",df.shape)
p = sns.lmplot(
    x='dim1', y='dim2', hue='label', data=df,
    fit_reg=False, scatter_kws={'alpha': 0.3},
)

# Draw three straight lines with fixed endpoints over the scatter plot
# (presumably candidate separating lines -- confirm against the text).
for xs, ys in (
    ([20, 40], [40, 20]),
    ([20, 38], [37, 20]),
    ([20, 40], [35, 25]),
):
    p.axes[0, 0].add_line(
        lines.Line2D(xs, ys, lw=1, color='black', alpha=0.5))
plt.savefig('svm-lines2.png', dpi=300)

df shape (200, 3)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as lines
from sklearn import svm

# Two Gaussian clusters of 100 two-dimensional points with diagonal
# variance 5: label 1 centred at (35, 35), label 0 centred at (25, 25).
np.random.seed(10)

mean1 = np.array([35, 35])
cov1 = np.array(([5, 1], [1, 5]))
data1 = np.random.multivariate_normal(mean1, cov1, 100)
df1 = pd.DataFrame(data1, columns=['dim1', 'dim2']).assign(label=1)

mean2 = np.array([25, 25])
cov1 = np.array(([5, 0.5], [0.5, 5]))
data2 = np.random.multivariate_normal(mean2, cov1, 100)
df2 = pd.DataFrame(data2, columns=['dim1', 'dim2']).assign(label=0)

df = pd.concat([df1, df2])

# Fit a linear-kernel SVM on the two coordinate columns.
estimator = svm.SVC(kernel='linear')
classifier = estimator.fit(df.iloc[:, :2], df['label'])

# Evaluate the decision function on a 50 x 50 grid covering [10, 50] in
# both dimensions. With indexing='ij', XX varies along rows and YY along
# columns, the same layout as meshgrid(yy, xx) with the outputs swapped.
xx = np.linspace(10, 50, 50)
yy = np.linspace(10, 50, 50)
XX, YY = np.meshgrid(xx, yy, indexing='ij')
xy = np.column_stack((XX.ravel(), YY.ravel()))
Z = classifier.decision_function(xy).reshape(XX.shape)

# NOTE(review): lmplot is a figure-level function that builds its own
# figure, so this explicit figure looks unused -- confirm before removing.
plt.figure(figsize=(10, 10), dpi=300)
sns.set_style('darkgrid')
p = sns.lmplot(
    x='dim1', y='dim2', hue='label', data=df,
    fit_reg=False, scatter_kws={'alpha': 0.3},
)

# Contours of the decision function: level 0 (solid) is the decision
# boundary, levels -1 and +1 (dashed) are drawn on either side of it.
p.axes[0, 0].contour(
    XX, YY, Z,
    colors='b', levels=[-1, 0, 1], alpha=0.3,
    linestyles=['--', '-', '--'],
)

# Circle the support vectors of the fitted classifier in red.
support = classifier.support_vectors_
p.axes[0, 0].scatter(
    support[:, 0], support[:, 1],
    s=100, linewidth=1, facecolors='none', edgecolors='red',
)
plt.savefig('svm-lines3.png', dpi=300)

In [None]:
from sklearn.metrics.pairwise import rbf_kernel

# Two sets of four 2-D points; the kernel is evaluated between every row
# of x and every row of y, giving a 4 x 4 matrix.
x = np.array([
    [1, 2],
    [1, 2],
    [9, 10],
    [10, 10],
])
y = np.array([
    [1, 2],
    [2, 1],
    [1, 2],
    [1, 2],
])
rbf_kernel(x, y, gamma=1)