In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.pipeline import make_pipeline

In [3]:
# Figure size (width, height) in inches, per matplotlib's ``figsize`` convention.
# Only referenced by the plotting code that is currently commented out below.
FIG_SIZE = (10, 7)

In [4]:
# Read the Excel workbook into a DataFrame. The path is relative, so the
# notebook must be run from the directory that contains ``dataset/``.
data = pd.read_excel(
    'dataset/Fasle_Bahar_94_Pasargad.xls',
)

In [5]:
# Target: the 'Sum' column. It is later passed as ``y`` to both classifiers.
Y = data['Sum']

# Predictors: six columns selected by name. The names must match the workbook
# headers exactly (including the spelling 'Sesion').
X = data[[
    'Sesion',
    'Month',
    'DayOfNumber',
    'DayOfWeek',
    'OnOffDay',
    'Hour',
]]

In [6]:
# Hold out 30% of the rows as a test set. The fixed ``random_state`` makes the
# shuffle, and therefore the split, repeatable across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Baseline pipeline: PCA down to 2 components applied directly to the raw
# (unscaled) features, followed by a Gaussian naive Bayes classifier.
unscaled_clf = make_pipeline(
    PCA(n_components=2),
    GaussianNB(),
)

In [8]:
# Fit the unscaled pipeline on the training split. ``fit`` returns the
# pipeline itself, which the notebook displays as this cell's output.
unscaled_clf.fit(X=Xtrain, y=Ytrain)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('gaussiannb', GaussianNB(priors=None))])

In [9]:
# Predictions of the unscaled pipeline on the held-out test features.
pred_test = unscaled_clf.predict(Xtest)

In [10]:
# Same model as the baseline, but with a StandardScaler step in front of PCA
# so the features are standardized before the components are computed.
std_clf = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    GaussianNB(),
)

In [11]:
# Fit the standardized pipeline on the training split. ``fit`` returns the
# pipeline itself, which the notebook displays as this cell's output.
std_clf.fit(X=Xtrain, y=Ytrain)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('gaussiannb', GaussianNB(priors=None))])

In [12]:
# Predictions of the standardized pipeline on the held-out test features.
pred_test_std = std_clf.predict(Xtest)

In [13]:
# Report test-set accuracy for both pipelines, unscaled first and then
# standardized, using the same heading/value layout for each.
#
# NOTE(review): the outputs recorded for this cell are 3.43% and 2.84%.
# Presumably 'Sum' takes many distinct values, which would make exact-match
# accuracy (and a classifier) a poor fit -- confirm whether this should be
# treated as a regression problem instead.
reports = (
    ('\nPrediction accuracy for the normal test dataset with PCA', pred_test),
    ('\nPrediction accuracy for the standardized test dataset with PCA', pred_test_std),
)
for heading, predictions in reports:
    print(heading)
    print('{:.2%}\n'.format(metrics.accuracy_score(Ytest, predictions)))


Prediction accuracy for the normal test dataset with PCA
3.43%


Prediction accuracy for the standardized test dataset with PCA
2.84%



In [14]:
# Pull the fitted PCA step out of each pipeline by its auto-generated step name.
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']

# Show the first principal component of each fit (one weight per input feature).
for caption, fitted_pca in (
    ('\nPC 1 without scaling:\n', pca),
    ('\nPC 1 with scaling:\n', pca_std),
):
    print(caption, fitted_pca.components_[0])

# Standardize the training features with the fitted scaler, then project them
# onto the PCA axes of the standardized pipeline. The result is only consumed
# by the (currently commented-out) visualization code.
scaler = std_clf.named_steps['standardscaler']
Xtrain_scaled = scaler.transform(Xtrain)
Xtrain_std = pca_std.transform(Xtrain_scaled)

# visualize standardized vs. untouched dataset with PCA performed
# fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)


PC 1 without scaling:
 [  0.00000000e+00  -1.97945739e-05   9.99743191e-01   1.85070812e-03
   1.84872875e-03   2.25102135e-02]

PC 1 with scaling:
 [ 0.          0.00076596 -0.10426755  0.69513985 -0.70551607  0.09030699]


In [20]:
# NOTE(review): this whole cell is disabled plotting code and would not run
# if simply uncommented:
#   * ax1/ax2 are never created -- the plt.subplots(...) call that would
#     define them is itself commented out in the previous cell.
#   * Xtrain is a pandas DataFrame, so `Xtrain[Ytrain == l, 0]` is not valid
#     indexing; the second loop's Xtrain_std is a NumPy array but would need
#     a NumPy mask (e.g. `Ytrain.values == l`) -- TODO confirm intended fix.
#   * zip() stops at the shortest input, so only labels 0-2 would be drawn
#     even though range(0, 5) is given.
# Either repair it for this dataset or delete the cell.

# for l, c, m in zip(range(0, 5), ('blue', 'red', 'green'), ('^', 's', 'o')):
#     ax1.scatter(Xtrain[Ytrain == l, 0], Xtrain[Ytrain == l, 1],
#                 color=c,
#                 label='class %s' % l,
#                 alpha=0.5,
#                 marker=m
#                 )

# for l, c, m in zip(range(0, 5), ('blue', 'red', 'green'), ('^', 's', 'o')):
#     ax2.scatter(Xtrain_std[Ytrain == l, 0], Xtrain_std[Ytrain == l, 1],
#                 color=c,
#                 label='class %s' % l,
#                 alpha=0.5,
#                 marker=m
#                 )

# ax1.set_title('Training dataset after PCA')
# ax2.set_title('Standardized training dataset after PCA')

# for ax in (ax1, ax2):
#     ax.set_xlabel('1st principal component')
#     ax.set_ylabel('2nd principal component')
#     ax.legend(loc='upper right')
#     ax.grid()

# plt.tight_layout()

# plt.show()