In [None]:
import pandas as pd
pd.Series
import math
import csv
from datetime import datetime
import numpy as np
import scipy as sc
import statsmodels
import sklearn
from sklearn import preprocessing
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 30, 10
rcParams.update({'font.size': 22})
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from matplotlib import pyplot
import statsmodels.api as sm

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression

### Load file, print info and select columns

In [None]:
def load_file(filepath):
    """Read the tab-separated sales file into a float32 frame.

    The first column of the file becomes the (date-parsed) index and the rows
    are sorted by it. ``SALE_AMOUNT_AFTER_CANCELLATIONS`` is dropped and
    ``SALE_AMOUNT_BEFORE_CANCELLATIONS`` is renamed to ``SALE_AMOUNT``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the tab-separated input file.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with every column cast to ``float32``.
    """
    raw = pd.read_csv(filepath, sep='\t', index_col=0, parse_dates=True)
    # The original author notes the two sale-amount columns can be checked for
    # equality, so only one is kept:
    # any(df['SALE_AMOUNT_BEFORE_CANCELLATIONS'] != df['SALE_AMOUNT_AFTER_CANCELLATIONS'])
    sales = (
        raw.sort_index()
        .drop(columns=['SALE_AMOUNT_AFTER_CANCELLATIONS'])
        .rename(columns={'SALE_AMOUNT_BEFORE_CANCELLATIONS': 'SALE_AMOUNT'})
    )
    #...
    return sales.astype('float32')

def create_small_df(df, columns):
    """Return an independent copy of ``df`` restricted to ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    columns : label or list of labels
        Column selection passed to ``df[...]``.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The selected data, copied so later edits do not touch ``df``.
    """
    # Select first, then copy: only the requested columns are duplicated
    # instead of deep-copying the whole frame and discarding most of it.
    return df[columns].copy()

def print_info_df(df, print_columns = False):
    """Print a short summary of a frame: period, shape, NaN check, columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. The period line subtracts the first index entry
        from the last and reads ``.days`` on the result, so the index is
        expected to hold datetime-like values.
    print_columns : bool, default False
        When true, print the list of column names; otherwise print only the
        number of columns.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if len(df.index) > 0:
        first = df.index[0]
        last = df.index[-1]
        span = last - first
        print('Number of days is ' + str(span.days) + ' from ' + str(first) + ' to '+ str(last))
    else:
        # An empty frame has no first/last entry; report that instead of
        # failing with IndexError on df.index[0].
        print('Number of days is 0 (the frame is empty)')
    print('The shape of the data: %d*%d' %(df.shape[0],df.shape[1]))
    print('Check for Nan values: %s'%(df.isnull().values.any()))
    if print_columns:
        print(list(df.columns))
    else:
        print('Number of columns: %d'%(df.shape[1]))

# Choose data! 
## Choose feature! 

In [None]:
# Load the data set. NOTE(review): '..' looks like a placeholder path —
# point it at the tab-separated source file before running.
df = load_file('..')
# Print period, shape and NaN check; False prints the column count instead
# of the full column list.
print_info_df(df, False)

# Column whose correlations are inspected in the correlation cell below.
feature = 'SALE_AMOUNT'

## Normalize the data to the range [-1, 1]

In [None]:
def minmax_scaler(df, feature_range=(-1, 1)):
    """Min-max scale every column of ``df`` and return the fitted scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it is not modified.
    feature_range : tuple (min, max), default (-1, 1)
        Target range handed to ``MinMaxScaler``. The default keeps the
        range this function has always used.

    Returns
    -------
    new_df : pandas.DataFrame
        Copy of ``df`` whose columns hold the scaled values.
    scaler : sklearn.preprocessing.MinMaxScaler
        The scaler fitted on ``df``; pass it to ``minmax_unscaler`` to map
        values back to the original units.
    """
    scaler = preprocessing.MinMaxScaler(feature_range=feature_range)
    new_df = df.copy()
    new_df[new_df.columns] = scaler.fit_transform(new_df[new_df.columns])
    return new_df, scaler

def minmax_unscaler(df, scaler):
    """Undo a scaling: apply ``scaler.inverse_transform`` to all columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Scaled frame; it is not modified.
    scaler : object
        Anything exposing ``inverse_transform``, e.g. the scaler returned
        by ``minmax_scaler``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the inverse-transformed values.
    """
    restored = df.copy()
    all_columns = restored.columns
    restored[all_columns] = scaler.inverse_transform(restored[all_columns])
    return restored

In [None]:
# Scale every column of df; keep the fitted scaler so values can later be
# mapped back to the original units with minmax_unscaler.
scaled_df, scaler = minmax_scaler(df)

## Rename scaled_df -> df 

In [None]:
# From here on `df` refers to the scaled data.
# NOTE(review): after this rebinding, re-running the scaling cell refits
# `scaler` on already-scaled values, so it can no longer map back to the
# original units — restart and run all instead of re-running single cells.
df = scaled_df

### Ranking methods

In [None]:
# Create an empty results table: a 'features' column plus one column per
# ranking score; it is filled in after the scores have been computed.
cols = ['features', 'f_regression', 'mutual_f_regression']
score_df = pd.DataFrame([], columns=cols)
score_df.head() 

In [None]:
def _score_features(df, score_func, select_n):
    """Score columns 1.. of ``df`` against column 0 using ``score_func``.

    Shared implementation behind ``calc_f_regression`` and
    ``calc_mutual_f_regression``. Returns ``(feature_names, scores)`` where
    ``feature_names`` is ``df.columns[1:]`` and ``scores`` is the fitted
    selector's ``scores_`` array.
    """
    values = df.values
    X = values[:, 1:]
    Y = values[:, 0]
    # Only scores_ is read below, and it holds one score per feature
    # regardless of k. Capping k at the number of available features avoids
    # the ValueError some scikit-learn versions raise for k > n_features.
    if isinstance(select_n, str):
        k = select_n  # e.g. 'all', passed through untouched
    else:
        k = min(select_n, X.shape[1])
    selector = SelectKBest(score_func=score_func, k=k).fit(X, Y)
    # Kept from the original code: this changes numpy's *global* print
    # precision, which affects how arrays are displayed in later cells.
    np.set_printoptions(precision=3)
    return df.columns[1:], selector.scores_


def calc_f_regression(df, select_n=10):
    """Score every feature with the univariate ``f_regression`` test.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; column 0 is the target, the remaining columns are
        the candidate features.
    select_n : int or 'all', default 10
        Forwarded to ``SelectKBest`` as ``k`` (capped at the number of
        features). It does not change the returned scores.

    Returns
    -------
    list_features : pandas.Index
        Names of the scored feature columns (``df.columns[1:]``).
    list_scores : numpy.ndarray
        One ``f_regression`` score per feature, in the same order.
    """
    return _score_features(df, f_regression, select_n)


def calc_mutual_f_regression(df, select_n=10, random_state=None):
    """Score every feature with ``mutual_info_regression``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; column 0 is the target, the remaining columns are
        the candidate features.
    select_n : int or 'all', default 10
        Forwarded to ``SelectKBest`` as ``k`` (capped at the number of
        features). It does not change the returned scores.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``mutual_info_regression`` when given, so the scores
        can be made reproducible. ``None`` keeps the previous behaviour of
        calling the scorer with its defaults.

    Returns
    -------
    list_features : pandas.Index
        Names of the scored feature columns (``df.columns[1:]``).
    list_scores : numpy.ndarray
        One mutual-information score per feature, in the same order.
    """
    if random_state is None:
        score_func = mutual_info_regression
    else:
        def score_func(X, y):
            return mutual_info_regression(X, y, random_state=random_state)
    return _score_features(df, score_func, select_n)

In [None]:
# Score every feature with both ranking methods (column 0 of df is the target).
list_features, scores_f_regression = calc_f_regression(df, 5)
# Only the scores are kept here; both functions return the same feature names.
scores_f_mutual = calc_mutual_f_regression(df, 5)[1]
# Fill the results table, rounding each score to two decimals.
score_df['features'] = list_features
score_df['f_regression'] = [round(a,2) for a in scores_f_regression]
score_df['mutual_f_regression'] = [round(a,2) for a in scores_f_mutual]

In [None]:
# Ten highest f_regression scores, best first.
score_df.sort_values('f_regression',ascending = False).head(10)[['features','f_regression']].reset_index(drop=True)

In [None]:
# Ten highest mutual-information scores, best first.
score_df.sort_values('mutual_f_regression',ascending = False).head(10)[['features','mutual_f_regression']].reset_index(drop=True)

### Correlation

In [None]:
# Correlation of every column with the chosen feature, sorted in ascending order.
df.corr()[feature].sort_values()

### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

def forest_regression(df, show_bar=True):
    """Rank features by random-forest importance.

    The first column of ``df`` is used as the regression target and every
    remaining column as an input feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; column 0 is the target, columns 1.. are the features.
    show_bar : bool, default True
        When true, draw a horizontal bar chart of the importances.

    Returns
    -------
    names : numpy.ndarray
        Names of the feature columns, aligned with ``importances``.
    importances : numpy.ndarray
        ``feature_importances_`` of the fitted forest, one per feature.
    """
    values = df.values
    # split into input and output
    X = values[:, 1:]
    y = values[:, 0]
    # fit random forest model (fixed seed keeps the ranking reproducible)
    model = RandomForestRegressor(n_estimators=500, random_state=1)
    model.fit(X, y)
    importances = model.feature_importances_
    # The model was trained on columns 1.., so the labels must skip the
    # target column. Slicing [0:-1] would shift every label by one and
    # attribute each importance to the wrong column.
    names = df.columns.values[1:]
    if show_bar:
        # NOTE: rcdefaults() resets matplotlib's global rcParams, including
        # any figure size / font size configured earlier in the notebook.
        plt.rcdefaults()
        fig, ax = plt.subplots()
        y_pos = np.arange(len(names))
        ax.barh(y_pos, importances)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(names)
        ax.invert_yaxis()  # first feature at the top
        ax.set_xlabel('Importance')
        ax.set_title('RandomForestRegressor')
        plt.show()
    return names, importances

In [None]:
def view_table_results(names, l, df):
    """Build a feature/score table sorted by score, highest first.

    Parameters
    ----------
    names : sequence
        Feature names; each is converted to ``str``.
    l : sequence
        Scores, indexed in parallel with ``names``.
    df : object
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'result'``, sorted by ``'result'`` in
        descending order with a fresh 0..n-1 index.
    """
    # Build all rows first and create the frame once: growing a frame with
    # .loc one row at a time reallocates on every insert and leaves the
    # 'result' column with object dtype.
    rows = [(str(names[i]), l[i]) for i in range(len(names))]
    results_df = pd.DataFrame(rows, columns=['feature', 'result'])
    return results_df.sort_values('result', ascending=False).reset_index(drop=True)

In [None]:
# Fit the forest on the scaled data (this also draws the importance chart),
# then tabulate the scores and show the five highest.
names, res = forest_regression(df)
table = view_table_results(names, res, df)
table[:5]

### RFE + Random Forest

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

def RFE_forest(diff_df, n_features_to_select=10):
    """Run recursive feature elimination with a random-forest estimator.

    The first column of ``diff_df`` is used as the regression target and
    every remaining column as a candidate feature. The selected feature
    names are printed and the RFE ranking is drawn as a horizontal bar chart.

    Parameters
    ----------
    diff_df : pandas.DataFrame
        Numeric frame; column 0 is the target, columns 1.. are the features.
    n_features_to_select : int, default 10
        Number of features RFE should keep (the value previously hard-coded).

    Returns
    -------
    names : numpy.ndarray
        Names of the feature columns, aligned with ``ranking``.
    ranking : numpy.ndarray
        ``ranking_`` of the fitted RFE; selected features have rank 1.
    """
    values = diff_df.values
    X = values[:, 1:]
    # The target is column 0. Using column 1 here would make y identical to
    # the first column of X, i.e. the model would be handed its own answer.
    y = values[:, 0]
    # perform feature selection; n_features_to_select is passed by keyword
    # because current scikit-learn no longer accepts it positionally.
    rfe = RFE(
        RandomForestRegressor(n_estimators=500, random_state=1),
        n_features_to_select=n_features_to_select,
    )
    fit = rfe.fit(X, y)
    # Feature labels must skip the target column so they line up with X.
    names = diff_df.columns.values[1:]
    # report selected features
    print('Selected Features:')
    for name, selected in zip(names, fit.support_):
        if selected:
            print(name)
    # plot feature rank
    # NOTE: rcdefaults() resets matplotlib's global rcParams, including any
    # figure size / font size configured earlier in the notebook.
    plt.rcdefaults()
    fig, ax = plt.subplots()
    y_pos = np.arange(len(names))
    ax.barh(y_pos, fit.ranking_)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(names)
    ax.invert_yaxis()  # first feature at the top
    # The bars show elimination ranks (lower is better), not importances.
    ax.set_xlabel('RFE rank (1 = selected)')
    ax.set_title('RFE + RandomForestRegressor')
    plt.show()
    return names, fit.ranking_

In [None]:
# Run recursive feature elimination on the scaled data; note this rebinds
# `names`, which the random-forest cell above also assigns.
names, ranking = RFE_forest(df) 