In [1]:
%%html
<!-- Notebook UI tweak: colour the CodeMirror matching-bracket highlight LimeGreen under the .cm-s-ipython theme class. -->
<style>
.cm-s-ipython .CodeMirror-matchingbracket { color: LimeGreen !important;}
</style>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import pprint
from scipy import stats
import random

In [3]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

In [4]:
# Make IPython display the value of every top-level expression statement in a
# cell, not just the last one; later cells rely on this to show several
# results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas
# SettingWithCopyWarning / FutureWarning messages raised by the helpers
# below -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Item name used further down as the key into `df_per_item_name` when a
# single item's frame is pulled out for inspection.
bisco= "Bisco's Collar Gold Amulet"

# Functions

In [7]:
def pre_process_df(df):
    """Clean a raw listings frame read from one of the ``*_values.csv`` files.

    Steps, in order:
      1. drop bookkeeping columns that are not used by the analysis;
      2. drop every remaining column that contains nothing but zeros;
      3. add a ``count`` column holding the number of rows per ``item_name``
         (computed before invalid prices are removed);
      4. convert ``price_amount`` to numeric, turning unparsable values
         into NaN;
      5. drop rows whose ``price_amount`` is NaN.

    Input:
        df: pandas DataFrame containing at least the dropped columns listed
            below plus ``item_name`` and ``price_amount``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.

    Raises:
        KeyError: if one of the columns to drop, ``item_name`` or
            ``price_amount`` is missing.
    """
    unwanted_columns = ['Unnamed: 0',
                        'stash_id', 'stash_feed',
                        '_id',
                        'date_month', 'date_year', 'date',
                        'league', 'rarity',
                        'price_currency', 'price_raw',
                        'time_minutes', 'time']
    df_dropped = df.drop(columns=unwanted_columns)

    # Keep only columns that hold at least one non-zero value. The mask is
    # built from the already-trimmed frame so it lines up with it exactly, and
    # the explicit copy makes the column assignments below act on an
    # independent frame instead of on a slice.
    has_non_zero = (df_dropped != 0).any(axis=0)
    proc_df = df_dropped.loc[:, has_non_zero].copy()

    # Number of listings sharing each item_name.
    proc_df['count'] = proc_df.groupby('item_name')['item_name'].transform('count')

    # Prices are read from CSV as objects; coerce them to floats (bad values -> NaN).
    proc_df['price_amount'] = pd.to_numeric(proc_df['price_amount'], errors='coerce')

    # Remove rows where price_amount could not be parsed.
    proc_df = proc_df[proc_df['price_amount'].notna()]

    return proc_df

def split_df_to_unique_item_names(df,days_to_check=[4,5,6,7,8,9,10]):
    """Split a listings frame into one sub-frame per ``item_name``.

    Input:
        df: pandas DataFrame with ``item_name``, ``date_day`` and
            ``price_amount`` columns.
        days_to_check: ``date_day`` values to keep; rows on other days are
            ignored.

    Returns:
        dict mapping item_name -> DataFrame, with keys inserted in the order
        given by ``value_counts`` (most frequent name first). Each frame holds
        only that item's rows on the selected days, without all-zero columns
        and without NaN prices, and has had ``reset_index()`` applied (so the
        previous index is kept as an ``index`` column). Items with no rows on
        the selected days are left out.
    """
    frames_by_name = {}
    names_by_frequency = df['item_name'].value_counts(ascending=False).index

    for name in names_by_frequency:
        on_selected_days = df['date_day'].isin(days_to_check)
        rows = df.loc[(df['item_name'] == name) & on_selected_days]
        if rows.empty:
            continue

        # Drop columns that are zero for every row of this particular item.
        non_zero_columns = (rows != 0).any(axis=0)
        rows = rows.loc[:, non_zero_columns]

        # Discard listings without a usable price.
        rows = rows[rows['price_amount'].notna()]

        frames_by_name[name] = rows.reset_index()

    return frames_by_name

def compute_corr(df,method='spearman',filename=''):
    """Compute the pairwise correlation matrix of the numeric columns of ``df``.

    Zeros in the columns matched by the feature regex below are treated as
    "value absent" and replaced by NaN before correlating, so they do not take
    part in the pairwise computation.

    Input:
        df: pandas DataFrame; it is NOT modified (the work is done on a copy).
        method: correlation method passed to ``DataFrame.corr``
            (default = 'spearman').
        filename: unused; kept so existing callers keep working.

    Returns:
        The correlation DataFrame with all-NaN columns and all-NaN rows
        removed.
    """
    # Work on a copy: the zero -> NaN replacement must not leak into the
    # caller's frame.
    work = df.copy()

    # Require at least 10% of the rows (rounded up) to be present for a pair
    # of columns. Integer arithmetic keeps min_periods an int.
    min_periods = (len(work) + 9) // 10

    cols = list(work.filter(regex='(Attacks per Second|Energy Shield|Elemental Damage|Critical Strike Chance|Physical Damage|influence|Armour|sockets_number|linked_sockets|Evasion Rating)|(?=^co_|ex_|im_|en_$)(^.*$)').columns)
    if cols:
        work[cols] = work[cols].replace(0, np.nan)

    # Correlate numeric (and boolean) columns only, so text columns such as
    # item_name never reach DataFrame.corr.
    numeric = work.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr(method=method, min_periods=min_periods)

    # Axis is passed by keyword; newer pandas no longer accepts it positionally.
    corr = corr.dropna(axis='columns', how='all')
    corr = corr.dropna(axis='index', how='all')

    return corr

def remove_outliers_IQR(item_dataframe,column_label = 'price_amount',high_quantile=0.75):
    '''Drop rows whose value in one column lies outside the 1.5 * IQR fences.

    Input:
        item_dataframe: pandas DataFrame
        column_label: column checked for outliers (default = 'price_amount')
        high_quantile: upper quantile of the box; the lower quantile is
                       1 - high_quantile (default = 0.75)

    Returns:
        The rows of item_dataframe that are not below the lower fence and not
        above the upper fence. Rows with NaN in the column are kept, because
        NaN compares as neither below nor above a fence.'''

    values = item_dataframe[column_label]

    lower_quantile = values.quantile(1-high_quantile)
    upper_quantile = values.quantile(high_quantile)
    spread = upper_quantile - lower_quantile

    lower_fence = lower_quantile - 1.5 * spread
    upper_fence = upper_quantile + 1.5 * spread

    too_low = values < lower_fence
    too_high = values > upper_fence

    return item_dataframe[~(too_low | too_high)]

def remove_outliers_zscore(item_dataframe,column_labels = ['price_amount'],threshold=3,show_results=False):
    '''Drop rows that are z-score outliers in any of the given columns.

    A row is removed when the absolute z-score of at least one checked column
    is greater than or equal to ``threshold``. Each row appears at most once
    in the result, whatever the number of checked columns. Where the z-score
    is undefined (NaN, e.g. a column whose values are all identical) the value
    is not counted as an outlier, so such frames are returned unchanged.

    Input:
        item_dataframe: pandas DataFrame
        column_labels: columns to check for outliers, as a list of labels or a
                       single label (default = ['price_amount'])
        threshold: absolute z-score at or above which a value is an outlier
                   (default = 3)
        show_results: print the removed rows (position and value of the first
                      checked column) and how many rows were removed
                      (default = False); needs an 'item_name' column

    Returns:
        DataFrame with the outlier rows removed.'''

    # Accept a bare label as well as a list of labels.
    labels = [column_labels] if isinstance(column_labels, str) else list(column_labels)

    # Hand scipy a plain 2-D float array (rows x checked columns).
    values = item_dataframe[labels].to_numpy(dtype=float)

    with np.errstate(divide='ignore', invalid='ignore'):
        z_score = np.abs(stats.zscore(values, axis=0))
        # NaN >= threshold is False, so undefined z-scores never flag a row.
        exceeds = z_score >= threshold

    # Collapse to one flag per row so indexing cannot repeat rows.
    row_is_outlier = exceeds.any(axis=1)
    new_df = item_dataframe[~row_is_outlier]

    if show_results:
        # Positional access: the frame's index is not guaranteed to contain 0.
        item_label = item_dataframe['item_name'].iloc[0] if len(item_dataframe) else ''
        print('Data outliers for "{}":'.format(item_label))
        for position in np.flatnonzero(row_is_outlier):
            print('index: {:<10d}{}: {:<10f}'.format(int(position),labels[0],values[position, 0]))
        print('Removed {} rows'.format(item_dataframe.shape[0]-new_df.shape[0]))
    return new_df


def produce_decision_dataframe(item_df,correlations_df=None,incl_outliers=True,method='z-score',threshold=2,quantile=0.8):
    """Append one row per price-correlated feature of an item to a summary frame.

    The Kendall correlation between ``price_amount`` and every other column of
    the item's frame is computed with ``compute_corr``; the features whose
    name matches the feature regex below and whose correlation is not NaN
    each contribute one row.

    Input:
        item_df: pandas DataFrame with the listings of a single item; needs
            ``item_name`` and ``price_amount`` columns.
        correlations_df: summary frame to extend. ``None``, an empty frame or
            anything that is not a DataFrame starts a fresh summary.
        incl_outliers: when False, price outliers are removed first.
        method: outlier removal mode, 'z-score' or 'IQR' (only used when
            ``incl_outliers`` is False).
        threshold: z-score threshold for the 'z-score' mode.
        quantile: upper quantile for the 'IQR' mode.

    Returns:
        DataFrame with columns item_name, feature, corr_value, no_features,
        transactions, st_div, variance: the previous summary rows followed by
        the rows for this item. If no feature qualifies, the summary is
        returned without new rows.

    Raises:
        ValueError: if ``incl_outliers`` is False and ``method`` is unknown.
    """
    columns = ['item_name','feature','corr_value','no_features','transactions','st_div','variance']

    d_df = item_df

    if not incl_outliers:
        if method == 'z-score':
            d_df = remove_outliers_zscore(d_df,threshold=threshold)
        elif method == 'IQR':
            d_df = remove_outliers_IQR(d_df,high_quantile=quantile)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('\t\tWrong outlier mode. Valid options mode = [z-score | IQR]')

    if not isinstance(correlations_df, pd.DataFrame) or correlations_df.empty:
        correlations_df = pd.DataFrame(columns=columns)

    corr = compute_corr(d_df,method='kendall')

    # price_amount can vanish from the matrix when all its correlations are NaN.
    if 'price_amount' not in corr.columns:
        return correlations_df

    corr_filtered = corr['price_amount'].filter(regex='(date_day|item_category|corrupted|Attacks per Second|Energy Shield|Elemental Damage|Critical Strike Chance|Physical Damage|influence|Armour|sockets_number|linked_sockets|Quality|Evasion Rating)|(?=^co_|ex_|im_|en_$)(^.*$)').dropna()
    if corr_filtered.empty:
        return correlations_df

    # Per-item values are identical for every feature row: compute them once.
    item_name = d_df['item_name'].unique()[0]
    transactions = d_df.groupby('item_name')['item_name'].count().values[0]
    st_div = d_df['price_amount'].std()
    # Scalar variance (a one-element Series per cell is not usable downstream).
    variance = d_df['price_amount'].var()
    no_features = len(corr_filtered)

    records = [{'item_name': item_name,
                'feature': feature,
                'corr_value': corr_value,
                'no_features': no_features,
                'transactions': transactions,
                'st_div': st_div,
                'variance': variance}
               for feature, corr_value in corr_filtered.items()]

    # Build all rows in one go instead of growing the frame row by row.
    new_rows = pd.DataFrame(records, columns=columns)
    if correlations_df.empty:
        return new_rows

    return pd.concat([correlations_df, new_rows], ignore_index=True, sort=False)

def produce_corr_based_df(df_per_item_name,method='z-score',threshold=2,quantile=0.8):
    """Build the correlation summary over every item frame in a mapping.

    Input:
        df_per_item_name: mapping of item_name -> that item's DataFrame.
        method, threshold, quantile: outlier-removal settings forwarded to
            ``produce_decision_dataframe`` (always called with
            ``incl_outliers=False``).

    Returns:
        The summary DataFrame accumulated over all items; an empty frame with
        the summary columns when the mapping is empty. A progress line is
        printed after every 200 items.
    """
    summary = pd.DataFrame(columns=['item_name','feature','corr_value','no_features','transactions','st_div','variance'])

    for processed, name in enumerate(df_per_item_name, start=1):
        if processed % 200 == 0:
            print("Processed {} item_names".format(processed))
        summary = produce_decision_dataframe(df_per_item_name[name],
                                             summary,
                                             incl_outliers=False,
                                             method=method,
                                             threshold=threshold,
                                             quantile=quantile)

    return summary

def filter_decision_df(df, days=7, min_corr=0.1, min_no_features=2, min_std=5.0):
    """Keep only the summary rows worth acting on.

    A row survives when its absolute ``corr_value`` is at least ``min_corr``,
    its ``transactions`` exceed ``days * 24`` and its ``st_div`` exceeds
    ``min_std``. ``no_features`` is then recomputed as the number of surviving
    rows per ``item_name``, and items with fewer than ``min_no_features``
    surviving rows are dropped.

    Input:
        df: summary DataFrame with item_name, corr_value, transactions and
            st_div columns; it is not modified.
        days: number of days covered; the transaction floor is days * 24.
        min_corr: minimum absolute correlation.
        min_no_features: minimum number of surviving features per item.
        min_std: minimum price standard deviation.

    Returns:
        A new, filtered DataFrame.
    """
    min_trx = days*24

    keep = ((abs(df['corr_value']) >= min_corr)
            & (df['transactions'] > min_trx)
            & (df['st_div'] > min_std))

    # Explicit copy: the column written below must land on an independent
    # frame, not on a slice of the caller's data.
    df_filtered = df[keep].copy()

    df_filtered['no_features'] = df_filtered.groupby('item_name')['item_name'].transform('count')
    df_filtered = df_filtered[df_filtered['no_features'] >= min_no_features]

    return df_filtered

def feature_selection(df, method="decision_tree",verbose=0,importance_threshold=0.15,no_of_features = 5, max_rfe_rank=20):
    """Pick the columns of ``df`` that matter most for predicting ``price_amount``.

    The target is ``price_amount``; the candidate features are all other
    columns except time, date, socket_colors, time_hours and item_category
    (each dropped only if present), one-hot encoded with ``pd.get_dummies``.

    Input:
        df: pandas DataFrame with a ``price_amount`` column.
        method: 'decision_tree' fits a RandomForestRegressor and keeps, among
            the ``no_of_features`` most important features, those whose
            importance exceeds ``importance_threshold``. 'rfe' runs recursive
            feature elimination around a DecisionTreeRegressor and keeps the
            features ranked ``max_rfe_rank`` or better.
        verbose: truthy to print each considered feature with its score.
        importance_threshold: minimum importance ('decision_tree' only).
        no_of_features: number of top features considered ('decision_tree')
            or selected by RFE ('rfe').
        max_rfe_rank: worst RFE rank still kept ('rfe' only, default = 20).

    Returns:
        list of selected feature names, always ending with 'price_amount'.

    Raises:
        ValueError: if ``method`` is neither 'decision_tree' nor 'rfe'.
    """
    target = df['price_amount']
    candidates = df.drop(['price_amount','time','date','socket_colors','time_hours','item_category'],axis=1,errors='ignore')
    # Encode text columns for both methods so the estimators get numeric input.
    candidates = pd.get_dummies(candidates)
    feature_names = candidates.columns

    important_features = []

    if method=='decision_tree':
        model = RandomForestRegressor(random_state=20,max_depth=300)
        model.fit(candidates,target)
        importances = model.feature_importances_
        # Positions of the `no_of_features` highest importances, ascending.
        top_indices = np.argsort(importances)[-no_of_features:]
        if verbose:
            for i in top_indices:
                print("Feature : {:40} --->importance [{}]".format(feature_names[i][:40],importances[i].round(3)))

        for i in top_indices:
            if importances[i] > importance_threshold:
                important_features.append(feature_names[i])

    elif method=='rfe':
        # n_features_to_select is passed by keyword; newer scikit-learn
        # releases no longer accept it positionally.
        rfe = RFE(DecisionTreeRegressor(), n_features_to_select=no_of_features)
        rfe = rfe.fit(candidates, target)
        for name, rank in zip(feature_names, rfe.ranking_):
            if verbose:
                print("Feature : {:40} has weight [{}]".format(name[:35],rank))
            if rank <= max_rfe_rank:
                important_features.append(name)

    else:
        raise ValueError("Unknown feature selection method {!r}. Valid options: [decision_tree | rfe]".format(method))

    important_features.append('price_amount')
    return important_features

# Start Here

In [9]:
directory = "C:/Users/Digi/Desktop/poe_price_predictor/record_creator/record_creator/csv/unique_7days/"
# Sorted so the files are processed in the same order on every run (glob's
# own order is not guaranteed); on a name clash the later file wins.
files_to_process = sorted(glob.glob(directory+'*_values.csv'))

# item_name -> cleaned per-item DataFrame, accumulated over all input files.
df_per_item_name = {}

for f in files_to_process:

    # Log label: the path relative to `directory`. Derived from the prefix
    # length rather than a hard-coded character offset, so it stays correct
    # if `directory` changes.
    file_label = f[len(directory):]

    print("Execution start : {}".format(file_label))

    df = pd.read_csv(f)

    proc_df = pre_process_df(df)

    # Break the complete df to subdfs according to the unique item_names and
    # drop the columns that have no value greater than zero
    grouped_by_name_df_dict = split_df_to_unique_item_names(proc_df)

    print("Execution end : {}".format(file_label))

    df_per_item_name.update(grouped_by_name_df_dict)


Execution start : 7days\amulet_values.csv
Execution end : 7days\amulet_values.csv


In [10]:
# Pull out the frame of the item named by `bisco` for ad-hoc inspection.
bisco_df = df_per_item_name[bisco]
# Let displayed frames show up to 1000 rows (applies to the whole session).
pd.set_option('display.max_rows',1000)

# df_by_zscore_2
# bisco_df_day_4 = remove_outliers_IQR(bisco_df.loc[bisco_df['date_day']==4])
# bisco_df_day_10 = remove_outliers_IQR(bisco_df.loc[bisco_df['date_day']==10])

# filtered_zscore_2_df['item_name'].nunique()
# filtered_zscore_3_df
# filtered_IQR_80_df['item_name'].nunique()
# filtered_IQR_90_df['item_name'].nunique()

#cols = list(bisco_df.filter(regex='(?=^co_|ex_|im_|en_$)(^.*$)').columns.values)
#bisco_df[['co_#% chance to block']] = bisco_df[['co_#% chance to block']].replace({0:np.NAN})
#cols
#bisco_df[['co_#% chance to block']]
# bisco_df_day_4[['price_amount']].var(axis=0)
# bisco_df_day_10[['price_amount']].var(axis=0)
# 'max = {}         min = {}'.format(bisco_df_day_10['price_amount'].max(),bisco_df_day_10['price_amount'].min())

# plt.hist(bisco_df_day_10['price_amount'],ec='black')
#bisco_df.filter(regex='(?!^co_|ex_|im_|en_$)(^.*$)').head(5)

In [20]:

# Build the decision frames this cell and the CSV export cell rely on, so the
# notebook runs top to bottom on a fresh kernel.
# NOTE(review): the cells that originally created the df_by_* frames are not
# part of the notebook; the outlier settings below are inferred from the
# variable names (z-score threshold 2 / 3, IQR quantile 0.8 / 0.9) -- confirm.
df_by_zscore_2 = produce_corr_based_df(df_per_item_name, method='z-score', threshold=2)
df_by_zscore_3 = produce_corr_based_df(df_per_item_name, method='z-score', threshold=3)
df_by_IQR_80 = produce_corr_based_df(df_per_item_name, method='IQR', quantile=0.8)
df_by_IQR_90 = produce_corr_based_df(df_per_item_name, method='IQR', quantile=0.9)

# NOTE(review): days=7 is taken from the one filter call that was present;
# it is assumed to apply to the other three frames as well -- confirm.
filtered_zscore_2_df = filter_decision_df(df_by_zscore_2,days=7)
filtered_zscore_3_df = filter_decision_df(df_by_zscore_3,days=7)
filtered_IQR_80_df = filter_decision_df(df_by_IQR_80,days=7)
filtered_IQR_90_df = filter_decision_df(df_by_IQR_90,days=7)

# Number of distinct items surviving each filter.
filtered_zscore_2_df['item_name'].nunique()
filtered_zscore_3_df['item_name'].nunique()
filtered_IQR_80_df['item_name'].nunique()
filtered_IQR_90_df['item_name'].nunique()

filtered_zscore_2_df
filtered_zscore_2_df.to_csv("C:/Users/Digi/Desktop/day7_filtered_uniques_zscore_2.csv")

129

Unnamed: 0,item_name,feature,corr_value,no_features,transactions,std
11,Sacrificial Heart Paua Amulet,corrupted,0.334759,3,5230,10.044717
15,Sacrificial Heart Paua Amulet,ex_conv_rate,-0.101873,3,5230,10.044717
16,Sacrificial Heart Paua Amulet,im_#% increased mana regeneration rate,-0.144048,3,5230,10.044717
18,Carnage Heart Onyx Amulet,ex_# to all attributes,0.310616,6,4259,9.344665
19,Carnage Heart Onyx Amulet,ex_#% increased damage while leeching,0.137536,6,4259,9.344665
20,Carnage Heart Onyx Amulet,ex_#% of physical attack damage leeched as life,0.157339,6,4259,9.344665
21,Carnage Heart Onyx Amulet,ex_#% to all elemental resistances,0.386724,6,4259,9.344665
22,Carnage Heart Onyx Amulet,ex_conv_rate,-0.197324,6,4259,9.344665
23,Carnage Heart Onyx Amulet,im_# to all attributes,0.129508,6,4259,9.344665
72,Astramentis Onyx Amulet,ex_# to all attributes,0.702225,3,1827,31.611286


In [31]:
# Display filtered_zscore_2_df again.
# NOTE(review): the recorded output of this cell shows different
# `transactions` values than the earlier display of the same name, so the
# frame appears to have been regenerated between the two runs -- confirm
# which version is the intended one.
filtered_zscore_2_df

Unnamed: 0,item_name,feature,corr_value,no_features,transactions,std
11,Sacrificial Heart Paua Amulet,corrupted,0.334759,3,182,10.044717
15,Sacrificial Heart Paua Amulet,ex_conv_rate,-0.101873,3,182,10.044717
16,Sacrificial Heart Paua Amulet,im_#% increased mana regeneration rate,-0.144048,3,182,10.044717
18,Carnage Heart Onyx Amulet,ex_# to all attributes,0.310616,6,310,9.344665
19,Carnage Heart Onyx Amulet,ex_#% increased damage while leeching,0.137536,6,310,9.344665
20,Carnage Heart Onyx Amulet,ex_#% of physical attack damage leeched as life,0.157339,6,310,9.344665
21,Carnage Heart Onyx Amulet,ex_#% to all elemental resistances,0.386724,6,310,9.344665
22,Carnage Heart Onyx Amulet,ex_conv_rate,-0.197324,6,310,9.344665
23,Carnage Heart Onyx Amulet,im_# to all attributes,0.129508,6,310,9.344665
72,Astramentis Onyx Amulet,ex_# to all attributes,0.702225,3,145,31.611286


In [12]:
# Write every decision frame to its CSV file: the four filtered frames first,
# then the four unfiltered ones, in the order listed.
_csv_exports = [
    (filtered_IQR_80_df, "C:/Users/Digi/Desktop/csv/filtered_uniques_IQR_80.csv"),
    (filtered_zscore_2_df, "C:/Users/Digi/Desktop/csv/filtered_uniques_zscore_2.csv"),
    (filtered_IQR_90_df, "C:/Users/Digi/Desktop/csv/filtered_uniques_IQR_90.csv"),
    (filtered_zscore_3_df, "C:/Users/Digi/Desktop/csv/filtered_uniques_zscore_3.csv"),
    (df_by_zscore_2, "C:/Users/Digi/Desktop/csv/no_filter_uniques_zscore_2.csv"),
    (df_by_IQR_80, "C:/Users/Digi/Desktop/csv/no_filter_uniques_IQR_80.csv"),
    (df_by_zscore_3, "C:/Users/Digi/Desktop/csv/no_filter_uniques_zscore_3.csv"),
    (df_by_IQR_90, "C:/Users/Digi/Desktop/csv/no_filter_uniques_IQR_90.csv"),
]

for _frame, _csv_path in _csv_exports:
    _frame.to_csv(_csv_path)