In [0]:
import warnings
# Install a global "ignore" filter: every warning category is silenced from
# here on. NOTE(review): this also hides deprecation and numerical warnings
# raised by the libraries used in later cells.
warnings.filterwarnings("ignore")

### Feature Reduction - Use Prince package

#### PCA to reduce dimension for continuous data

In [0]:
import prince

def reduceFeaturePCA(df, df_un, method):
    """Reduce ``df`` with PCA, keeping enough components for >99% variance.

    A ``prince.PCA`` with as many components as ``df`` has columns is fitted,
    the cumulative explained-variance curve is plotted with a marker at each
    threshold in ``percentages``, and ``df`` is projected onto the leading
    components.

    Parameters
    ----------
    df
        Data to fit and transform (presumably a DataFrame of continuous
        columns; ``len(list(df))`` is used as the number of components).
    df_un, method
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    The first ``k`` columns of ``pca.transform(df)``, where ``k`` is the
    smallest number of components whose cumulative explained variance
    exceeds 99%.
    """
    print("start PCA ... ")

    pca = prince.PCA(
        n_components=len(list(df)),
        n_iter=3,
        rescale_with_mean=True,
        rescale_with_std=True,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    pca = pca.fit(df)
    e_vals = pca.eigenvalues_

    # Share of variance per component (largest first), in percent, and its
    # running total. The sum is computed once instead of once per element.
    total = sum(e_vals)
    exp_val = [(val / total) * 100 for val in sorted(e_vals, reverse=True)]
    cs_exp_val = np.cumsum(exp_val)

    percentages = [10, 35, 55, 80, 95, 99]
    # 0-based index of the first component at which the cumulative share
    # exceeds each threshold.
    picked_components = [int(np.argmax(cs_exp_val > p)) for p in percentages]
    print(picked_components)

    # Plot the cumulative percentage. Indices start at 0, so the x axis is
    # shifted by one to show the number of components.
    x = np.arange(1, len(e_vals) + 1)
    _, ax = plt.subplots(figsize=(20, 10))
    ax.plot(x, np.real(cs_exp_val))
    ax.scatter(x[picked_components], cs_exp_val[picked_components], s=50, marker='o')
    for a, b in zip(x[picked_components], cs_exp_val[picked_components]):
        ax.text(a - 0.1, b + 0.1, '%.0f' % a,
                verticalalignment='bottom', horizontalalignment='right', fontsize=16)
    ax.set_title('Remained variance by different principal components')
    ax.set_ylabel('Remained variance in percent')
    ax.set_xlabel('Picked principle conponents')
    plt.show()

    X = pca.transform(df)
    # The picked value is a 0-based index; the component count is index + 1.
    # Slicing with the bare index dropped the component that crosses 99% and
    # returned an empty frame when the first component alone exceeded 99%.
    n_keep = picked_components[percentages.index(99)] + 1
    X_pca = X.iloc[:, 0:n_keep]

    return X_pca

#### MCA to reduce dimension for categorical data

In [0]:
import prince

def reduceFeatureMCA(df, df_un, method):
    """Reduce ``df`` with MCA, keeping enough components for >99% of the eigenvalue mass.

    A ``prince.MCA`` with as many components as ``df`` has columns is fitted
    and ``df`` is projected onto the leading components.

    Parameters
    ----------
    df
        Data to fit and transform (presumably a DataFrame of categorical
        columns; ``len(list(df))`` is used as the number of components).
    df_un, method
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    The first ``k`` columns of ``mca.transform(df)``, where ``k`` is the
    smallest number of components whose cumulative eigenvalue share
    exceeds 99%.
    """
    print("start MCA ... ")

    mca = prince.MCA(
        n_components=len(list(df)),
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    mca = mca.fit(df)
    e_vals = mca.eigenvalues_

    # Share per component (largest first), in percent, and its running total.
    # The sum is computed once instead of once per element.
    total = sum(e_vals)
    exp_val = [(val / total) * 100 for val in sorted(e_vals, reverse=True)]
    cs_exp_val = np.cumsum(exp_val)

    percentages = [10, 35, 55, 80, 95, 99]
    # 0-based index of the first component at which the cumulative share
    # exceeds each threshold.
    picked_components = [int(np.argmax(cs_exp_val > p)) for p in percentages]
    print(picked_components)

    X = mca.transform(df)
    # The picked value is a 0-based index; the component count is index + 1.
    # Slicing with the bare index dropped the component that crosses 99% and
    # returned an empty frame when the first component alone exceeded 99%.
    n_keep = picked_components[percentages.index(99)] + 1
    X_mca = X.iloc[:, 0:n_keep]

    return X_mca

### Feature Reduction - Manually calculate PCA and MCA

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

import matplotlib.dates as mdates
from matplotlib.dates import MONDAY, DateFormatter, DayLocator, WeekdayLocator
from datetime import timedelta
from mpl_finance import candlestick2_ohlc, candlestick_ohlc
import matplotlib.ticker as ticker

import plotly
import plotly.plotly as py
import plotly.graph_objs as go

def plot_zz_cp(df_price,title,frequency,line1,line2,line3,line4):
  """Draw a candlestick chart of ``df_price`` with zig-zag markers and overlay lines.

  Parameters
  ----------
  df_price
      Frame whose index supports ``to_pydatetime()`` and which provides the
      columns ``open``, ``high``, ``low``, ``close`` and ``zzValue84``
      (plus ``zz84`` when ``line1`` is empty).
  title
      Text used as the axes title.
  frequency
      If it contains ``'d'`` or ``'D'`` the x axis is laid out for daily bars
      (Monday major ticks, every day as minor ticks); otherwise it gets
      one major tick per day and a minor tick every 4 hours.
  line1, line2, line3, line4
      Column names of extra series to draw; pass ``''`` to skip one. When
      ``line1`` is ``''`` the ``zz84`` column is drawn as a dotted line instead.

  The figure is shown with ``plt.show()``; nothing is returned.
  """
  fig, ax = plt.subplots()
  fig.subplots_adjust(bottom=0.5)

  x_axis = ax.xaxis
  daily_bars = any(flag in frequency for flag in ('d', 'D'))
  if daily_bars:
    # Daily data: label Mondays, tick every day.
    x_axis.set_major_locator(WeekdayLocator(MONDAY))
    x_axis.set_minor_locator(DayLocator())
    x_axis.set_major_formatter(DateFormatter('%b %d'))
    candle_width = 0.6
  else:
    # Other data: dates as major labels, clock times every 4 hours as minor labels.
    x_axis.set_major_locator(mdates.DayLocator(interval=1))
    x_axis.set_major_formatter(mdates.DateFormatter('%d/%b'))
    x_axis.set_minor_locator(mdates.HourLocator(interval=4))
    x_axis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
    x_axis.set_tick_params(which='major', pad=15)
    candle_width = 0.006

  bar_times = mdates.date2num(df_price.index.to_pydatetime())
  quotes = zip(bar_times,
               df_price['open'], df_price['high'],
               df_price['low'], df_price['close'])
  candlestick_ohlc(ax, quotes, width=candle_width,
                   colordown='pink', colorup='lightgreen')

  ax.xaxis_date()
  ax.autoscale_view()
  fig.set_size_inches(25,20)
  plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
  plt.xticks(rotation=45)
  ax.set_title(title)
  # The current y-limits are read here but not used further in this function.
  y_low, y_high = ax.get_ylim()

  # Zig-zag points as stand-alone markers.
  ax.plot(df_price['zzValue84'], color='olive', linestyle='None', marker='o', markersize=12)

  # Primary line: the caller's column if given, otherwise the dotted zig-zag line.
  if line1 == '':
    ax.plot(df_price['zz84'], color='olive', label='zz', linestyle='dotted', linewidth=2)
  else:
    ax.plot(df_price[line1], color='black', label=line1, linewidth=2.5)

  # Remaining optional lines share the same dashed style and differ only in colour.
  for column, colour in ((line2, 'cyan'), (line3, 'purple'), (line4, 'red')):
    if column != '':
      ax.plot(df_price[column], color=colour, label=column, linestyle='dashed', linewidth=2)

  ax.legend(loc='best')

  plt.show()