In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, IncrementalPCA

In [2]:
# import source data
wine = "wine.csv"
df_wine = pd.read_csv(wine)

# Drop Null values
# df_wine.drop(['region_2','country'], axis = 1, inplace = True)
df_wine = df_wine.dropna().drop_duplicates().reset_index(drop=True)
df_wine.head()

Unnamed: 0,points,designation,price,province,region_1,region_2,variety,winery,country
0,96,Martha's Vineyard,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,US
1,96,Special Selected Late Harvest,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,US
2,96,Reserve,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,US
3,95,Silice,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,BergstrÌ¦m,US
4,95,Gap's Crown Vineyard,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm,US


In [3]:
df_wine_dummies=pd.get_dummies(df_wine, columns=['designation','province','region_1','region_2','variety','winery','country'],drop_first=True)
df_wine_dummies.head()

Unnamed: 0,points,price,designation_#50 Mon Chou,designation_'72,designation_'Nearly Naked',designation_'S',designation_*%#&@!,designation_**1 Liter**,designation_0 Degree Dry,designation_0 Degrees,...,winery_ecoVINO,winery_flipflop,winery_j.brix,winery_kukkula,winery_l'homme qui ris,winery_love & squalor,winery_un4seen,winery_ÌÊMaurice,winery_Ìäcluse,winery_ÌälevÌ©e Winegrowers
0,96,235.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,96,90.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,96,65.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,95,65.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,95,60.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
y = df_wine_dummies["points"]
y=y.as_matrix()
y

array([96, 96, 96, ..., 84, 84, 84], dtype=int64)

In [5]:
X = df_wine_dummies
X.drop(["points"], axis = 1, inplace = True)

In [6]:
column_count=len(X.columns.tolist())
column_count

14680

In [7]:
X=X.as_matrix()
X

array([[ 235.,    0.,    0., ...,    0.,    0.,    0.],
       [  90.,    0.,    0., ...,    0.,    0.,    0.],
       [  65.,    0.,    0., ...,    0.,    0.,    0.],
       ..., 
       [  25.,    0.,    0., ...,    0.,    0.,    0.],
       [  27.,    0.,    0., ...,    0.,    0.,    0.],
       [  15.,    0.,    0., ...,    0.,    0.,    0.]])

In [8]:
n_components = 2
ipca = IncrementalPCA(n_components=n_components, batch_size=10)
X_ipca = ipca.fit_transform(X)

pca = PCA(n_components=n_components)
#pca = PCA()
X_pca = pca.fit_transform(X)

In [9]:
print("mean = "+str(pca.mean_))
print("components = "+str(pca.components_))
print("explained variance = "+str(pca.explained_variance_))
print("explained variance ratio = "+str(pca.explained_variance_ratio_))
print("singular values = "+str(pca.singular_values_))
print("n components = "+str(pca.n_components_))
print("noise variance = "+str(pca.noise_variance_))

mean = [  3.82117590e+01   4.05202804e-05   4.05202804e-05 ...,   4.05202804e-04
   2.02601402e-04   8.10405608e-05]
components = [[  9.99974882e-01   8.60029477e-07  -1.03502773e-06 ...,   2.04070756e-06
    5.60012145e-07   6.95423932e-07]
 [  1.63140334e-03   1.16499142e-05  -1.86992096e-05 ...,   1.40288914e-03
   -1.70378090e-04  -1.11301995e-04]]
explained variance = [  7.91307080e+02   3.50772529e-01]
explained variance ratio = [  9.93853031e-01   4.40557591e-04]
singular values = [ 4419.03565528    93.03958547]
n components = 2
noise variance = 0.000309541662639


In [15]:
colors = ['navy', 'turquoise', 'darkorange']

for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
    plt.figure(figsize=(8, 8))
    
    #for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
    for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
        plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
                    color=color, lw=2, label=target_name)

    if "Incremental" in title:
        err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
        plt.title(title + " of iris dataset\nMean absolute unsigned error "
                  "%.6f" % err)
    else:
        plt.title(title + " of iris dataset")
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.axis([-4, 4, -1.5, 1.5])

plt.show()

NameError: name 'iris' is not defined