In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style to every figure drawn from here on.
sns.set_style(style="whitegrid")

In [None]:
!pip install feature-engine


In [None]:
from sklearn.decomposition import PCA

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load scikit-learn's breast cancer dataset into a DataFrame: one column per
# feature, plus the target stored under 'diagnostic'.
data = load_breast_cancer()
df_clf = pd.DataFrame(data.data, columns=data.feature_names)
df_clf['diagnostic'] = pd.Series(data.target)

# Work on a reproducible 60% random sample of the rows.
df_clf = df_clf.sample(frac=0.6, random_state=101)

# Add some missing data: blank out the first 10 sampled rows of the column at
# position 4. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df_clf.iloc[:10, 4] = np.nan

print(df_clf.shape)
df_clf.head()

In [None]:
# Separate the label column (kept as a one-column DataFrame) from the predictors.
df_target = df_clf.loc[:, ['diagnostic']]
X = df_clf.drop(columns=['diagnostic'])
print(X.shape)
X.head(3)

In [None]:
from sklearn.pipeline import Pipeline
### Data Cleaning
from feature_engine.imputation import MeanMedianImputer
### Feat Scaling
from sklearn.preprocessing import StandardScaler


def PipelineDataCleaningFeatEngFeatScaling():
    """Build the preprocessing pipeline used before PCA.

    The pipeline has two steps, applied in order:
      1. 'MeanMedianImputer' - a MeanMedianImputer configured with
         imputation_method='median'.
      2. 'feature_scaling'   - a StandardScaler with default settings.

    Returns:
        A new, unfitted sklearn Pipeline containing those two steps.
    """
    steps = [
        ('MeanMedianImputer', MeanMedianImputer(imputation_method='median')),
        ('feature_scaling', StandardScaler()),
    ]
    return Pipeline(steps)

# Display the (unfitted) pipeline so its steps can be inspected.
PipelineDataCleaningFeatEngFeatScaling()

In [None]:
# Build a fresh preprocessing pipeline, fit it on the features and transform
# them in one pass; then report the shape and container type of the result.
pipeline_pca = PipelineDataCleaningFeatEngFeatScaling()
df_pca = pipeline_pca.fit_transform(X=X)
output_shape, output_type = df_pca.shape, type(df_pca)
print(output_shape, '\n', output_type)

In [None]:
# Inspect the output of the imputation + scaling pipeline.
df_pca

In [None]:
import numpy as np
from sklearn.decomposition import PCA # import PCA from sklearn

# Keep one component per input column so the full variance breakdown is visible.
# Derived from the data rather than hard-coded, so it stays correct if the
# feature set changes.
n_components = df_pca.shape[1]

pca = PCA(n_components=n_components).fit(df_pca)  # set PCA object and fit to the data
x_PCA = pca.transform(df_pca)  # array with transformed PCA

# The PCA object has an .explained_variance_ratio_ attribute, which tells
# how much information (variance) each component has.
# We store that in a DataFrame relating each component to its variance explanation.
ComponentsList = ["Component " + str(number) for number in range(n_components)]
dfExplVarRatio = pd.DataFrame(
    data=np.round(100 * pca.explained_variance_ratio_, 2),
    index=ComponentsList,
    columns=['Explained Variance Ratio (%)'])

# How much of the dataset these components explain (naturally 100% when every
# component is kept). Summed from the unrounded ratios: adding up values that
# were already rounded to 2 d.p. accumulates rounding error across components.
PercentageOfDataExplained = 100 * pca.explained_variance_ratio_.sum()

print(f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
print(dfExplVarRatio)

In [None]:
# same code just changed the number of components
# Number of principal components to keep for the reduced representation.
n_components = 7

pca = PCA(n_components=n_components).fit(df_pca)
x_PCA = pca.transform(df_pca)  # array with transformed PCA

# Per-component explained variance, as a percentage rounded for display.
ComponentsList = ["Component " + str(number) for number in range(n_components)]
dfExplVarRatio = pd.DataFrame(
    data=np.round(100 * pca.explained_variance_ratio_, 2),
    index=ComponentsList,
    columns=['Explained Variance Ratio (%)'])

# Total explained variance, summed from the unrounded ratios so the figure is
# not skewed by the per-component rounding applied to the display table.
PercentageOfDataExplained = 100 * pca.explained_variance_ratio_.sum()

print(f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
print(dfExplVarRatio)

In [None]:
# Shape of the PCA-projected data, followed by the projected values themselves.
print(x_PCA.shape)
x_PCA

## Visualise data after PCA transformation

In [None]:
# Scatter two of the original features against each other, coloured by
# diagnostic class.
var1, var2 = 'mean concavity', 'mean concave points'
ax = sns.scatterplot(x=X[var1], y=X[var2], hue=df_target['diagnostic'])
ax.set_xlabel(var1)
ax.set_ylabel(var2)
plt.show()

In [None]:
# Scatter the first two principal components, coloured by diagnostic class so
# the plot is comparable with the other scatter plots in this notebook.
# The labels are passed as a plain array so they pair up with the rows of
# x_PCA by position rather than by the (shuffled) DataFrame index.
ax = sns.scatterplot(
    x=x_PCA[:, 0],
    y=x_PCA[:, 1],
    hue=df_target['diagnostic'].to_numpy(),
)
# A plain array carries no name, so label the legend explicitly.
ax.get_legend().set_title('diagnostic')
plt.xlabel('Component 0')
plt.ylabel('Component 1')
plt.show()

In [None]:
import plotly.express as px
# Interactive 3D scatter of the first three principal components, coloured by
# diagnostic class.
pc0, pc1, pc2 = x_PCA[:, 0], x_PCA[:, 1], x_PCA[:, 2]
axis_labels = {'x': "Component 0", 'y': "Component 1", 'z': 'Component 2'}
fig = px.scatter_3d(
    x=pc0,
    y=pc1,
    z=pc2,
    color=df_target['diagnostic'],
    labels=axis_labels,
    color_continuous_scale='spectral',
    width=750,
    height=500,
)
fig.update_traces(marker_size=5)
fig.show()