# PCA for Data Visualization

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler,OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn import set_config
set_config(transform_output='pandas')
pd.set_option('display.max_columns',100)




In [None]:
# Load the dataset and discard the MouseID identifier column in one step
df = pd.read_csv("Data/modified-Data_Cortex_Nuclear.csv").drop(columns=['MouseID'])
# Inspect column dtypes / non-null counts, then preview the first rows
df.info()
df.head()



In [None]:
# Candidate label columns; any of these could serve as the target
possible_targets = ['Type of Mouse', "Treatment", 'Training', 'class']
# Show the value distribution of every candidate target
for col in possible_targets:
    print(f'\n- {col}:', df[col].value_counts(), sep='\n')



In [None]:
# Define X and y
# "Training" is used as the label; every candidate target column is removed
# from the features so none of the other labels remains in X
target = "Training"
X = df.drop(columns = possible_targets)
y = df[target]


In [None]:
import missingno as msno
# Visualize the pattern of missing values across the feature columns
msno.matrix(X)


In [None]:
# Fill in nulls with 0 and verify all nulls are addressed
# NOTE(review): 0 is filled in before standard scaling, so missing entries are
# treated as true zeros - confirm this is intended rather than mean/median imputation
X = X.fillna(0)
# Total count of remaining nulls across all columns (displayed as the cell output)
X.isna().sum().sum()



In [None]:
# Instantiate Standard Scaler
scaler = StandardScaler()
# Fit & transform data.
# set_config(transform_output='pandas') in the imports cell makes the scaler
# return a DataFrame, which is why .head() is available on the result
scaled_df = scaler.fit_transform(X)
scaled_df.head()


In [None]:
# Calculating the correlation matrix and plotting
corr = scaled_df.corr()
sns.heatmap(corr, cmap='coolwarm');



In [None]:
# Creating a scatter_matrix with pandas
pd.plotting.scatter_matrix(scaled_df, figsize=(40,40));



In [None]:
# Randomly select three distinct features to plot
np.random.seed(42)
# replace=False guarantees three different columns; the default samples with
# replacement, which can repeat a feature and produce a degenerate pairplot
random_features = np.random.choice(scaled_df.columns, 3, replace=False)
# Plot the three randomly selected features
sns.pairplot(scaled_df,  vars=random_features);



In [None]:
# Concatenate scaled features with target
plot_df = pd.concat([scaled_df, df[target]], axis=1)
# Plot with color coding based on target; reuse `target` so the hue column and
# the title stay in sync if a different target is chosen
g = sns.pairplot(data=plot_df, vars=random_features, hue=target)
g.fig.suptitle(f'Visualizing Raw Features - Colored by {target}', y=1.01);



In [None]:
# Instantiate PCA to make 3 principal components
pca = PCA(n_components=3)
# Create and define the principal components
principal_components = pca.fit_transform(scaled_df)
# Preview the results
principal_components.head()



In [None]:
# Variance explained by each PC
pca.explained_variance_ratio_



In [None]:
# Sum of variance explained by 3 principal components
pca.explained_variance_ratio_.sum()



In [None]:
# Concatenate principal components with target
plot_df_pca = pd.concat([principal_components, df[target]], axis=1)
# Plot with color coding based on target; reuse `target` so the hue column and
# the title stay in sync if a different target is chosen
g_pca = sns.pairplot(data=plot_df_pca, vars=principal_components.columns, hue=target)
g_pca.fig.suptitle(f'Visualizing First 3 PCs - Colored by {target}', y=1.01);



In [None]:
import plotly.express as px
import plotly.io as pio
# Make a 3d scatter plot with a PC on each axis and color by the target
# (uses `target` rather than repeating the column name)
fig = px.scatter_3d(
    plot_df_pca,
    x='pca0', y='pca1', z='pca2',
    color=target,
    width=800, height=600,
)
fig



In [None]:
def update_scatter3d(fig, marker_size=3, scroll_zoom=False):
    """Resize the markers of a figure and display it.

    Args:
        fig: Object exposing ``update_traces`` and ``show`` (here, the result
            of ``px.scatter_3d``).
        marker_size: Marker size applied through ``update_traces``.
        scroll_zoom: Value passed as the ``scrollZoom`` entry of the display
            config given to ``show``.

    Returns:
        None. The figure is displayed as a side effect.
    """
    fig.update_traces({'marker': {'size': marker_size}})
    fig.show(config={'scrollZoom': scroll_zoom})
update_scatter3d(fig)



In [None]:
# See available templates
pio.templates



In [None]:
# Make a 3d scatter plot with a PC on each axis and color by the target
# (uses `target` rather than repeating the column name)
# Change template style to plotly_dark
fig = px.scatter_3d(
    plot_df_pca,
    x='pca0', y='pca1', z='pca2',
    color=target,
    width=800, height=600,
    template='plotly_dark',
)
update_scatter3d(fig)


# PCA to Speed up Machine Learning Algorithms

## PCA for Supervised Machine Learning

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

pd.set_option('display.max_columns',200)
pd.set_option("display.max_info_rows", 800)
pd.set_option('display.max_info_columns',800)

from sklearn import set_config
set_config(transform_output='pandas')


In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def classification_metrics(y_true, y_pred, label='',
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False):
    """Print a classification report and plot two confusion matrices.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        label: Text appended to the printed header (e.g. the data split name).
        output_dict: When truthy, return the classification report as a dict.
        figsize: Size of the two-panel matplotlib figure.
        normalize: ``normalize`` option used for the right-hand confusion matrix.
        cmap: Colormap of the right-hand confusion matrix (the left-hand
            raw-count matrix always uses 'gist_gray').
        colorbar: Whether a colorbar is drawn next to each matrix.

    Returns:
        The report dictionary when ``output_dict`` is truthy, otherwise None.
    """
    # Text version of the report, printed under a banner header
    report = classification_report(y_true, y_pred)
    header = "-"*70
    print(header, f" Classification Metrics: {label}", header, sep='\n')
    print(report)

    # Two side-by-side confusion matrices for the labels that were passed in
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    # Left panel: raw counts
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred,
        normalize=None, cmap='gist_gray', colorbar=colorbar,
        ax=axes[0])
    axes[0].set_title("Raw Counts")
    # Right panel: normalized according to `normalize`
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred,
        normalize=normalize, cmap=cmap, colorbar=colorbar,
        ax=axes[1])
    axes[1].set_title("Normalized Confusion Matrix")
    # Adjust layout and show figure
    fig.tight_layout()
    plt.show()

    # The dict form is only computed when the caller asks for it
    if output_dict:
        return classification_report(y_true, y_pred, output_dict=True)
    return None


def evaluate_classification(model, X_train, y_train, X_test, y_test,
                            figsize=(6,4), normalize='true', output_dict=False,
                            cmap_train='Blues', cmap_test="Reds", colorbar=False):
    """Report classification metrics for a fitted model on train and test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        figsize: Figure size forwarded to ``classification_metrics``.
        normalize: Confusion-matrix normalization forwarded to
            ``classification_metrics``.
        output_dict: When truthy, return the collected report dictionaries.
        cmap_train: Colormap of the normalized training confusion matrix.
        cmap_test: Colormap of the normalized test confusion matrix.
        colorbar: Whether colorbars are drawn.

    Returns:
        ``{'train': <report dict>, 'test': <report dict>}`` when
        ``output_dict`` is truthy, otherwise None.
    """
    # Classification metrics for the training data
    y_train_pred = model.predict(X_train)
    results_train = classification_metrics(
        y_train, y_train_pred,
        output_dict=True, figsize=figsize,
        normalize=normalize,  # forward the caller's choice to the helper
        colorbar=colorbar, cmap=cmap_train,
        label='Training Data')
    print()
    # Classification metrics for the test data
    y_test_pred = model.predict(X_test)
    results_test = classification_metrics(
        y_test, y_test_pred,
        output_dict=True, figsize=figsize,
        normalize=normalize,
        colorbar=colorbar, cmap=cmap_test,
        label='Test Data')
    # Only hand the reports back when output_dict is requested
    if output_dict:
        return {'train': results_train,
                'test': results_test}
    return None



In [None]:
# Get Data
df = pd.read_csv('Data/pd_speech_features.csv')
df.head()


In [None]:
df.dtypes.value_counts()


In [None]:
df.isna().sum().sum()


In [None]:
# Name the label column and the column(s) to exclude from the features
target_col = 'class'
drop_cols = ['id']
# Features: everything except the label and the dropped columns
X = df.drop(columns=[target_col] + drop_cols).copy()
# Label: an independent copy of the target column
y = df[target_col].copy()



In [None]:
# Split into train and test sets; the fixed seed makes the split reproducible
# NOTE(review): the split is not stratified although the target's class balance
# is checked and resampled further down - consider stratify=y; confirm with the author
X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=321)
X_train.head()



In [None]:
# Instantiate Standard Scaler
scaler = StandardScaler()
# Fit & transform data.
X_train_tf = scaler.fit_transform(X_train)
X_test_tf = scaler.transform(X_test)



In [None]:
# Check for class balance of target
y_train.value_counts(1)



In [None]:
from imblearn.over_sampling import SMOTE,SMOTENC
# Resample the scaled training data only (the test set is left untouched).
# Seed SMOTE so the synthetic samples, and every downstream metric and timing
# comparison, are reproducible between runs.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_tf, y_train)
# Class counts after resampling
y_train_sm.value_counts()



In [None]:
# Import datetime library
import datetime as dt

# Record the start time
start = dt.datetime.now()

# Baseline model: random forest trained on all scaled, resampled features
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train_sm, y_train_sm)

# Record the end time and calc duration
end = dt.datetime.now()
dur_baseline = end-start

# Evaluation runs after `end` is recorded, so it is not part of the measured time
evaluate_classification(clf, X_train_sm,y_train_sm, X_test_tf, y_test)
print(f'Training time was: {dur_baseline}')



### PCA

In [None]:
# Instantiate & fit data using PCA
# Keep only 3 principal components
pca = PCA(n_components=3)
# Fit on the resampled training data, then apply the same fitted projection to the test data
X_train_pca = pca.fit_transform(X_train_sm)
X_test_pca = pca.transform(X_test_tf)
X_train_pca.head()



In [None]:
# Record the start time
start = dt.datetime.now()

# Same classifier settings as the baseline, trained on the 3 principal components
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_pca, y_train_sm)

# Record the end time and calc duration
end = dt.datetime.now()
dur_pca = end-start

# Evaluate on the PCA-transformed train and test sets (not part of the measured time)
evaluate_classification(clf, X_train_pca,y_train_sm, X_test_pca, y_test)
print(f'Training time was: {dur_pca}')


In [None]:
# Compare speeds before and after PCA
compare_speed = dur_baseline/dur_pca
print(f"Using PCs was {compare_speed:.2f} times faster!")



In [None]:
# Initialize and fit PCA without limiting the number of components
pca = PCA()
# PCA is unsupervised, so only the features are passed. y_train must not be
# supplied here: its length no longer matches the SMOTE-resampled X_train_sm.
pca.fit(X_train_sm)



In [None]:
# Determine how much variance is explained by each PC
explained = pd.Series(pca.explained_variance_ratio_, name='Explained Variance Ratio')
explained



In [None]:
# plot the cumulative sum of the percentage of explained variance for each component and those before it.
ax = explained.cumsum().plot(marker='.')
# add a line to mark .9 (or 90%) variance explained
ax.axhline(.9, color='k');



### Specifying the Explained Variance
Rather than specifying the number of components to return, an alternate method is to specify the minimum proportion of explained variance you are willing to accept. PCA will automatically reduce the number of components just enough to meet your specification.

To specify the proportion of variance, pass the n_components argument a float strictly between 0 and 1. PCA will then keep the smallest number of components whose cumulative explained variance reaches that proportion.

The code below is an example of how to ensure that enough components are returned to explain 85% of the variance. Instead of using an integer in the n_components argument to designate the number of components, we use a decimal value to indicate the amount of variance to be explained. PCA() will then automatically use enough principal components to meet this level of explanation.

In [None]:
# Define PCA to address 85% of the variance
pca85 = PCA(n_components=.85)
# fit and transform on training data
X_train_pca85 = pca85.fit_transform(X_train_sm)
# transform test data
X_test_pca85 = pca85.transform(X_test_tf)
# obtain the number of PCs used
pca85.n_components_



In [None]:
# Record the start time
start = dt.datetime.now()

# Same classifier settings as the baseline, trained on the components kept by pca85
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_pca85, y_train_sm)

# Record the end time and calc duration
end = dt.datetime.now()
dur_pca_85 = end-start

# Evaluate on the pca85-transformed train and test sets (not part of the measured time)
evaluate_classification(clf, X_train_pca85,y_train_sm, X_test_pca85, y_test)
print(f'Training time was: {dur_pca_85}')



In [None]:
compare_speed = dur_baseline/dur_pca_85
print(f"Using PC's with .85 was {compare_speed:.2f} times faster!")



# Feature Engineering: Overloaded Operators

In [None]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vReZBM5OC6GLYbacisp_ToNiu3CLWxqPXw7mWBsdRjnYOFLWNufdQ4qd8u5qTzUF2_sBUAMEi5cgy1U/pub?gid=1040198428&single=true&output=csv')
df.head()


In [None]:
# Summing Features: combine the SibSp and Parch columns into a single total
df['TotalFamily'] = df['SibSp'].add(df['Parch'])
# The two source columns are no longer needed once the total exists
df = df.drop(columns=['SibSp', 'Parch'])
df.head()


In [None]:
# Preparing for concatenation: round Age to the nearest 10 so the values
# collapse into a small number of groups before being combined with another column

df['Age'] = df['Age'].round(-1)
df.head()


In [None]:
# Concatenating Features: join Sex and Age into a single string label.
# Age is cast to the 'string' dtype first so it can be appended to the Sex text
# (presumably Sex holds strings - verify against the data)
df['GenderAge'] = df['Sex'] + df['Age'].astype('string')
# The combined column replaces the two originals
df.drop(columns=['Sex','Age'], inplace=True)
df.head()


In [None]:
# Squaring and Multiplying Features

df['NormedFare'] = df['Fare'] * df['Pclass']**2
df.drop(columns='Fare', inplace=True)
df.head()


# Feature Engineering: Strings

In [None]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vReZBM5OC6GLYbacisp_ToNiu3CLWxqPXw7mWBsdRjnYOFLWNufdQ4qd8u5qTzUF2_sBUAMEi5cgy1U/pub?gid=1040198428&single=true&output=csv')
df.head()


In [None]:
# Separating Features: Strings

# Create 2 new columns, LastName and FirstName, by splitting the Name column
# on the first comma only. n=1 caps the result at two parts, so a name that
# contains additional commas cannot break the two-column assignment.
df[['LastName','FirstName']] = df['Name'].str.split(',', n=1, expand=True)
# drop the 'Name' column
df.drop('Name', axis=1, inplace=True)
df.head()


In [None]:
# Clean-up

df.loc[0,'FirstName']


In [None]:
df['FirstName'] = df['FirstName'].str.strip()
df.loc[0, 'FirstName']


In [None]:
# Combining Strings

df['Name'] = df['FirstName'] + ' ' + df['LastName']
df.drop(columns=['LastName','FirstName'], inplace= True)
df.head()


# Feature Engineering: Datetime

In [None]:
import pandas as pd
# Load only the date, price, bedrooms and bathrooms columns of the sheet
df2 = pd.read_csv(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vSrgrUnz8mdosU-_k0aECouymqwds_mlaHpYlXzRtf7MBJ4N1r1inCfSDebaXwTVfLtH133EhwKf3mi/pub?gid=394699239&single=true&output=csv',
    usecols=['date','price','bedrooms','bathrooms'],
)
df2.head()



In [None]:
df2['date'] = pd.to_datetime(df2['date'])
df2.info()



In [None]:
df2['year'] = df2['date'].dt.year
df2['month (numeric)'] = df2['date'].dt.month
df2['month (name)'] = df2['date'].dt.month_name()
df2['day of month'] = df2['date'].dt.day
df2['day of week (numeric)'] = df2['date'].dt.weekday
df2['day of week (name)'] = df2['date'].dt.day_name()
df2.head()



# Feature Engineering: Functions

In [None]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vReZBM5OC6GLYbacisp_ToNiu3CLWxqPXw7mWBsdRjnYOFLWNufdQ4qd8u5qTzUF2_sBUAMEi5cgy1U/pub?gid=1040198428&single=true&output=csv')
df.head()


In [None]:
# find the median fare price
median_fare = df['Fare'].median()
# define a function that returns 'Expensive' or 'Cheap'
def bin_fare(fare, threshold=None):
    """Label a fare as 'Expensive' or 'Cheap' relative to a threshold.

    Args:
        fare: A single fare value.
        threshold: Cut-off to compare against. When omitted, the module-level
            ``median_fare`` is used.

    Returns:
        'Expensive' if ``fare`` is strictly greater than the threshold and
        'Cheap' otherwise. A missing fare (NaN/None) is returned unchanged
        rather than being silently labelled 'Cheap'.
    """
    # Propagate missing values instead of letting `NaN > x` fall through to 'Cheap'
    if pd.isna(fare):
        return fare
    if threshold is None:
        threshold = median_fare
    return 'Expensive' if fare > threshold else 'Cheap'



In [None]:
# apply bin_fare() function to each item in the 'Fare' column
df['Fare'] = df['Fare'].apply(bin_fare)
df.head()



In [None]:
# Bin Age into two groups. na_action='ignore' keeps missing ages as NaN
# instead of silently labelling them 'young' (NaN > 30 evaluates to False).
df['Age'] = df['Age'].map(lambda age: 'elderly' if age > 30 else 'young',
                          na_action='ignore')
df.head()
