In [86]:
import os
import json
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Cleaning Data

In [87]:
dir = '../data/raw/20231212/funds/'
symbol_list = []
category = []
asset_class = []
inception_date = []
min_inv = []
exp_ratio = []
return_ytd = []
return_1_year = []
return_3_year = []
return_5_year = []
return_10_year = []
return_inception = []
return_bench_ytd = []
return_bench_1_year = []
return_bench_3_year = []
return_bench_5_year = []
return_bench_10_year = []
return_bench_inception = []
risk_level = []


for filename in os.listdir(dir):
    file_path = os.path.join(dir,filename)
    with open(file_path,'r') as f:
        fund_dict = json.load(f)
    pattern = r'\/([A-Z]+)\.json'
    match = re.search(pattern, file_path)
    symbol = match.group(1)
    symbol_list.append(symbol)
    
    category.append(fund_dict['key_fact_table']['Category'])
    asset_class.append(fund_dict['key_fact_table']['Asset class'])
    if 'Inception date' in fund_dict['key_fact_table']:
        inception_date.append(fund_dict['key_fact_table']['Inception date'])
    else:
        inception_date.append(None)

    if 'risk_level' in fund_dict:
        risk_level.append(fund_dict['risk_level'])
    else:
        risk_level.append(None)

    if 'perf_table' in fund_dict:
        perf_table = fund_dict['perf_table']

        if symbol in perf_table:
            symbol_key = symbol
        elif symbol+'1' in perf_table:
            symbol_key = symbol + '1'
        elif symbol+'2' in perf_table:
            symbol_key = symbol+'2'
        elif symbol+'4' in perf_table:
            symbol_key = symbol+'4'
        else:
            symbol_key = symbol+' (Market price)'
        
        if 'Benchmark1' in perf_table:
            benchmark_key = 'Benchmark1'
        elif 'Benchmark3' in perf_table:
            benchmark_key = 'Benchmark3'
        else:
            benchmark_key = 'Benchmark'

        for row in zip(perf_table['index'],perf_table[symbol_key],perf_table[benchmark_key]):
            if row[0]=="YTD":
                return_ytd.append(row[1])
                return_bench_ytd.append(row[2])
            if row[0]=="1-yr":
                return_1_year.append(row[1])
                return_bench_1_year.append(row[2])
            if row[0]=="3-yr":
                return_3_year.append(row[1])
                return_bench_3_year.append(row[2])
            if row[0]=="5-yr":
                return_5_year.append(row[1])
                return_bench_5_year.append(row[2])
            if row[0]=="10-yr":
                return_10_year.append(row[1])
                return_bench_10_year.append(row[2])
            if row[0]=="Since inception":
                return_inception.append(row[1]) 
                return_bench_inception.append(row[2])
    else:
        return_ytd.append(None)
        return_bench_ytd.append(None)
        return_1_year.append(None)
        return_bench_1_year.append(None)
        return_3_year.append(None)
        return_bench_3_year.append(None)
        return_5_year.append(None)
        return_bench_5_year.append(None)
        return_10_year.append(None)
        return_bench_10_year.append(None)
        return_inception.append(None)
        return_bench_inception.append(None)

    if 'min_investment' in fund_dict:
        min_inv.append(fund_dict['min_investment'])
    else:
        min_inv.append(None)
    
    if 'exp_ratio' in fund_dict:
        exp_ratio.append(fund_dict['exp_ratio'])
    else:
        exp_ratio.append(None)    

In [88]:
table_col = ['symbol_list','category','asset_class','inception_date','min_inv','exp_ratio','return_ytd','return_1_year','return_3_year','return_5_year','return_10_year','return_inception','return_bench_ytd','return_bench_1_year','return_bench_3_year','return_bench_5_year','return_bench_10_year','return_bench_inception','risk_level']

fund_df = pd.DataFrame(columns=table_col,data=list(zip(symbol_list,category,asset_class,inception_date,min_inv,exp_ratio,return_ytd,return_1_year,return_3_year,return_5_year,return_10_year,return_inception,return_bench_ytd,return_bench_1_year,return_bench_3_year,return_bench_5_year,return_bench_10_year,return_bench_inception,risk_level)))

In [89]:
fund_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   symbol_list             350 non-null    object
 1   category                350 non-null    object
 2   asset_class             350 non-null    object
 3   inception_date          350 non-null    object
 4   min_inv                 345 non-null    object
 5   exp_ratio               350 non-null    object
 6   return_ytd              350 non-null    object
 7   return_1_year           350 non-null    object
 8   return_3_year           350 non-null    object
 9   return_5_year           350 non-null    object
 10  return_10_year          350 non-null    object
 11  return_inception        350 non-null    object
 12  return_bench_ytd        350 non-null    object
 13  return_bench_1_year     350 non-null    object
 14  return_bench_3_year     350 non-null    object
 15  return

In [90]:
def perc_to_decimal(perc_string):
    if perc_string:
        return round(float((perc_string.strip().replace('%','')))/100,5)
    else:
        return perc_string

In [96]:
def clean_risk_level(risk_string):
    return int(risk_string.strip())

In [91]:
# Remove funds with null min_inv --> Closed not allowed to be bought by investor anymore
cleaned_df = fund_df.copy()
cleaned_df = cleaned_df[cleaned_df['min_inv'].notna()]

# replace unicode dash with Null value
cleaned_df = cleaned_df.replace('\u2014',None)

# clean risk level
cleaned_df['risk_level'] = cleaned_df['risk_level'].apply(clean_risk_level)

# convert percentages string to float
percentage_cols = ['exp_ratio','return_ytd','return_1_year','return_3_year','return_5_year','return_10_year','return_inception','return_bench_ytd','return_bench_1_year','return_bench_3_year','return_bench_5_year','return_bench_10_year','return_bench_inception']
for col in percentage_cols:
    print(col)
    cleaned_df[col] = cleaned_df[col].apply(perc_to_decimal)

cleaned_df['inception_date'] = cleaned_df['inception_date'].astype(pd.to_datetime)

exp_ratio
return_ytd
return_1_year
return_3_year
return_5_year
return_10_year
return_inception
return_bench_ytd
return_bench_1_year
return_bench_3_year
return_bench_5_year
return_bench_10_year
return_bench_inception


In [99]:
cleaned_df

Unnamed: 0,symbol_list,category,asset_class,inception_date,min_inv,exp_ratio,return_ytd,return_1_year,return_3_year,return_5_year,return_10_year,return_inception,return_bench_ytd,return_bench_1_year,return_bench_3_year,return_bench_5_year,return_bench_10_year,return_bench_inception,risk_level
0,VTIVX,Target-Date 2041-2045,Balanced,10/27/2003,"$1,000",0.0008,0.1385,0.0935,0.0453,0.0821,0.0759,0.0775,0.1404,0.1021,0.0483,0.0865,0.0792,0.0797,4
1,VTMSX,Small Blend,Domestic Stock - More Aggressive,03/25/1999,"$10,000",0.0009,0.0285,-0.0408,0.0584,0.0570,0.0750,0.0978,0.0289,-0.0402,0.0585,0.0563,0.0751,0.0972,5
2,VCORX,Intermediate-Term Bond,Intermediate-Term Bond,03/28/2016,"$3,000",0.0020,0.0198,0.0151,-0.0439,0.0135,,0.0105,0.0178,0.0132,-0.0446,0.0079,0.0140,0.0075,2
3,VBAIX,Moderate Allocation,Balanced,12/01/2000,"$5,000,000",0.0006,0.1231,0.0802,0.0308,0.0753,0.0739,0.0651,0.1240,0.0824,0.0332,0.0786,0.0761,0.0664,3
4,VWILX,Foreign Large Growth,International/Global Stock,08/13/2001,"$50,000",0.0034,0.0958,0.0417,-0.0711,0.0804,0.0684,0.0726,0.1009,0.0926,0.0167,0.0506,0.0341,0.0464,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,VTPSX,Foreign Large Blend,International/Global Stock,11/30/2010,"$100,000,000",0.0007,0.0995,0.0756,0.0204,0.0527,0.0375,0.0463,0.1009,0.0952,0.0201,0.0538,0.0378,0.0463,5
346,VBTLX,Intermediate-Term Bond,Intermediate-Term Bond,11/12/2001,"$3,000",0.0005,0.0193,0.0132,-0.0447,0.0074,0.0135,0.0314,0.0178,0.0132,-0.0446,0.0079,0.0140,0.0327,2
347,VBPIX,World Stock,International/Global Stock,12/14/2017,"$3,000",0.0059,0.0801,0.0279,-0.0499,0.1359,,0.1271,0.1660,0.1201,0.0569,0.0907,0.0760,0.0737,4
348,VITPX,Large Blend,Domestic Stock - General,05/31/2001,"$100,000,000",0.0002,0.1969,0.1266,0.0818,0.1172,0.1119,0.0829,0.1961,0.1259,0.0814,0.1169,0.1116,0.0823,4
