In [70]:
import pandas as pd
import html5lib
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import json
import scipy.stats as stats

In [71]:
def load_raw():
    dataframes=[]
    for k in tqdm(range(2001,2019)):
        for i in range(1,13):
            df = pd.read_html('./data/'+str(k)+'Y_'+str(i)+'M.xls',encoding='utf-8',header=0)
            dataframes.append(df[0])
    return dataframes

In [72]:
dataframes=load_raw()

100%|██████████| 18/18 [00:38<00:00,  4.38s/it]


In [73]:
df_nans=[]
for df in dataframes:
    df_nan=df[df.isna().any(1)]
    df_nan=df_nan.dropna(axis='columns')
    # 2007/12 2013/12 change
    df_nan.rename(columns={'類型代號':"基金名稱","基金統編":"基金規模","基金規模":"本月定時定額扣款筆數","單位淨值":"本月定時定額扣款人數","本月定時定額扣款筆數":"本月定時定額扣款金額"})
    df_nans.append(df_nan)

In [74]:
df_corrects=[]
df_filter = []
# for df in dataframes:
#     df_correct = df[~df.isna().any(1)]
#     df_corrects.append(df_correct)
for df in tqdm(dataframes):
    df.rename(columns={"基金規模(台幣)":"基金規模","單位淨值(台幣)":"單位淨值","本月定時定額扣款金額(台幣)":"本月定時定額扣款金額"}, inplace=True)
    df_correct = df[~df.isna().any(1)]
    df_filter = []
    for name in df_correct['基金名稱']:
        if ("中國" in name or "俄羅斯" in name or "巴西" in name or "印度" in name or "醫療" in name or "生技" in name) :
            if("中國信託" in name):
                continue
            filter1 = (df_correct['類型代號'] == "AA2")
            filter2 = (df_correct['類型代號'] == "2")
            filter3 = (df_correct['基金名稱'] == name)
            df_filter.append(df_correct[((filter1 | filter2) & filter3)])
    df_corrects.append(pd.concat(df_filter))
df_corrects[0]# 0=2001

100%|██████████| 216/216 [00:24<00:00,  1.80it/s]


Unnamed: 0,類型代號,基金統編,基金名稱,基金規模,單位淨值,本月定時定額扣款筆數,本月定時定額扣款人數,本月定時定額扣款金額
86,2,97990289,元大中國基金,436898521,6.54,8,8.0,63000.0


In [75]:
def plot_image(id):
    navs=[]
    money=[]
    num=[]
    people=[]
    scales=[]
    id = str(id)
    id = id.zfill(8)
    for i in range(0,2*12):
        try:
            navs.append(df_corrects[i][df_corrects[i]['基金統編']==id]['單位淨值'].values[0])
            money.append(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款金額'].values[0])
            num.append(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款筆數'].values[0])
            people.append(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款筆數'].values[0])
            scales.append(df_corrects[i][df_corrects[i]['基金統編']==id]['基金規模'].values[0])
        except:
            pass
    plt.plot(navs)
    plt.title('Unit net value')
    plt.ylabel('Value')
    plt.xlabel('Months')
    plt.show()
    plt.title('SIP value')
    plt.plot(money)
    plt.ylabel('Value')
    plt.xlabel('Months')
    plt.show()
    plt.title('SIP Number of records')
    plt.plot(num)
    plt.ylabel('Value')
    plt.xlabel('Months')
    plt.show()
    plt.title('SIP Number of people')
    plt.plot(people)
    plt.ylabel('Value')
    plt.xlabel('Months')
    plt.show()
    plt.title('SIP Scale')
    plt.plot(scales)
    plt.ylabel('Value')
    plt.xlabel('Months')
    plt.show()
    

## Function of calculating rising or falling

In [76]:
def monthFilter(month, length, data):
    subData = np.array(data[(month - (month % 6)) : (month + (5-(month % 6)) + 1)])
    value=(data[month] - np.mean(subData)) / np.std(subData)
    if(value> 1 or value < -1):
        return value
    else:
        return 0

## This function will get various people,aliveList,variations

In [77]:
def preprocessing():
    id_list = []
    id_temp = []
    aliveList=[]
    uid=[]
    upeople=[]
    peopleChinaList = []
    peopleIndiaList = []
    peopleBrazilList = []
    peopleRussiaList = []
    peopleMedicalList = []
    markMonthChinaList =[]
    markMonthIndiaList = []
    markMonthBrazilList = []
    markMonthRussiaList = []
    markMonthlMedicalList = []
    MonthChinaValue=[]
    MonthIndiaValue=[]
    MonthRussiaValue=[]
    MonthBrazilValue=[]
    for df in df_corrects:
        for filter_id in df['基金統編']:
#             id = str(filter_id)
#             id = id.zfill(8)
            id_temp.append(filter_id)
    id_list = list(set(id_temp))
    for id in id_list:
            isAlive = True
            navs=[]
            money=[]
            num=[]
            people=[]
            scales=[]
            name = ''
            isSaveList = []
            for i in range((17-3)*12 + 4,17*12 + 4):
                try:
                    name = df_corrects[i][df_corrects[i]['基金統編']==id]['基金名稱'].values[0]
                    navs.append(df_corrects[i][df_corrects[i]['基金統編']==id]['單位淨值'].values[0])
                    money.append(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款金額'].values[0])
                    num.append(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款筆數'].values[0])
                    people.append(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款人數'].values[0])
                    scales.append(df_corrects[i][df_corrects[i]['基金統編']==id]['基金規模'].values[0])
                    if(df_corrects[i][df_corrects[i]['基金統編']==id]['本月定時定額扣款人數'].values[0]) == 0 :
                        isAlive = False
                    if(i == 17*12 + 3 and isAlive):
                        aliveList.append(df_corrects[i][df_corrects[i]['基金統編']==id]['基金名稱'].values[0])
                        uid.append(df_corrects[i][df_corrects[i]['基金統編']==id]['基金統編'].values[0])
                
                except:
                    isAlive = False
                    navs.append(0)
                    money.append(0)
                    num.append(0)
                    people.append(0)
                    scales.append(0)
                    break
            funding={}
            funding
            if(isAlive):
                for i in range(0, len(people)):
                    if("中國" in name):
                        try:
                            peopleChinaList[i] += people[i]
                        except:
                            peopleChinaList.append(people[i])
                    elif("印度" in name):
                        try:
                            peopleIndiaList[i] += people[i]
                        except:
                            peopleIndiaList.append(people[i])
                    elif("巴西" in name):
                        try:
                            peopleBrazilList[i] += people[i]
                        except:
                            peopleBrazilList.append(people[i])
                    elif("俄羅斯" in name):
                        try:
                            peopleRussiaList[i] += people[i]
                        except:
                            peopleRussiaList.append(people[i])
                    else:
                        try:
                            peopleMedicalList[i] += people[i] 
                        except:
                            peopleMedicalList.append(people[i])
#                 plt.plot(navs)
#                 plt.title(id + 10*' ' +'Unit net value')
#                 plt.ylabel('Value')
#                 plt.xlabel('Months')
#                 plt.show()
#                 plt.title(id + 10*' ' +'SIP value')
#                 plt.plot(money)
#                 plt.ylabel('Value')
#                 plt.xlabel('Months')
#                 plt.show()
#                 plt.title(id + 10*' ' +'SIP Number of records')
#                 plt.plot(num)
#                 plt.ylabel('Value')
#                 plt.xlabel('Months')
#                 plt.show()
#                 plt.title(id + 10*' ' +'SIP Number of people')
#                 plt.plot(people)
#                 plt.ylabel('Value')
#                 plt.xlabel('Months')
#                 plt.show()
#                 plt.title(id + 10*' ' +'SIP Scale')
#                 plt.plot(scales)
#                 plt.ylabel('Value')
#                 plt.xlabel('Months')
#                 plt.show()

    # Calculate people and value
    
    china={}
    for i in range(0, len(peopleChinaList)):
        value=monthFilter(i, 6, peopleChinaList)
        if value!=0:
            markMonthChinaList.append(i)
            MonthChinaValue.append(value)
    china={
        'Month_Index':markMonthChinaList,
        'Value':MonthChinaValue
    }
    brazil={}
    for i in range(0, len(peopleBrazilList)):
        value=monthFilter(i, 6, peopleBrazilList)
        if value!=0:
            markMonthBrazilList.append(i)
            MonthBrazilValue.append(value)
    brazil={
        'Month_Index':markMonthBrazilList,
        'Value':MonthBrazilValue
    }
    for i in range(0, len(peopleIndiaList)):
        value=monthFilter(i, 6, peopleIndiaList)
        if value!=0:
            markMonthIndiaList.append(i)
            MonthIndiaValue.append(value)
    india={
        'Month_Index':markMonthBrazilList,
        'Value':MonthBrazilValue
    }
    for i in range(0, len(peopleRussiaList)):
        value=monthFilter(i, 6, peopleIndiaList)
        if value!=0:
            markMonthIndiaList.append(i)
            MonthIndiaValue.append(value)
    russia={
        'Month_Index':markMonthRussiaList,
        'Value':MonthRussiaValue
    }
    variation={
        'China':pd.DataFrame(china),
        'Brazil':pd.DataFrame(brazil),
        'India':pd.DataFrame(india),
        'Russia':pd.DataFrame(russia)
    }
    
#     for i in range(0, len(peopleMedicalList)):
#         if(monthFilter(i, 6, peopleMedicalList)):
#             markMonthlMedicalList.append(i)
#     plt.title('SIP Number of people in Medical')
#     plt.plot(peopleMedicalList)
#     plt.ylabel('Value')
#     plt.xlabel('Months')
#     plt.show()
    china_df=pd.DataFrame(china)
    dict={}
    start_date = "2016/04" 
    stop_date="2019/03"
    start=datetime.strptime(start_date,"%Y/%m")
    stop=datetime.strptime(stop_date,"%Y/%m")
    arr_smonth=[]
    while start<=stop:
        arr_smonth.append(start.strftime('%Y/%m')) # Return a string representing the date and time
        start = start +  relativedelta(months=1)
    percentage_c=[]
    percentage_b=[]
    percentage_r=[]
    percentage_i=[]
    for i in range(len(peopleChinaList)):
        total=peopleChinaList[i]+peopleIndiaList[i]+peopleRussiaList[i]+peopleBrazilList[i]
        percentage_c.append((peopleChinaList[i]/total)*100)
        percentage_b.append((peopleBrazilList[i]/total)*100)
        percentage_r.append((peopleRussiaList[i]/total)*100)
        percentage_i.append((peopleIndiaList[i]/total)*100)
        
        
    
    dict={
        'Date':arr_smonth,
        'China':peopleChinaList,
        'India':peopleIndiaList,
        'Russia':peopleRussiaList,
        'Brazil':peopleBrazilList,
        'China%':percentage_c,
        'India%':percentage_i,
        'Russia%':percentage_r,
        'Brazil%':percentage_b
    }
    alive={
        "name":aliveList,
        "uid":uid
    }
    
 
    peopleDF=pd.DataFrame(dict)
    aliveDF=pd.DataFrame(alive)
    return aliveList,aliveDF,variation,peopleDF



In [78]:
aliveList,aliveDF,varation ,people= preprocessing()
varation['China']

Unnamed: 0,Month_Index,Value
0,0,-1.856616
1,9,-2.054153
2,12,1.92339
3,17,-1.15112
4,20,-1.889533
5,22,1.122276
6,24,-1.470169
7,29,1.457625
8,33,-1.806295


In [79]:
people

Unnamed: 0,Date,China,India,Russia,Brazil,China%,India%,Russia%,Brazil%
0,2016/04,19451.0,2267.0,2104.0,3244.0,71.865071,8.375822,7.77359,11.985517
1,2016/05,22223.0,2247.0,2058.0,3210.0,74.729303,7.555989,6.920438,10.79427
2,2016/06,22482.0,2231.0,2040.0,3153.0,75.17555,7.460041,6.821374,10.543035
3,2016/07,22306.0,2442.0,1965.0,2971.0,75.144859,8.226654,6.619728,10.008759
4,2016/08,20813.0,2413.0,1881.0,2754.0,74.70299,8.660852,6.751373,9.884785
5,2016/09,21304.0,2360.0,1811.0,2597.0,75.890567,8.406954,6.451268,9.251211
6,2016/10,21493.0,2390.0,1738.0,2490.0,76.457614,8.50201,6.182633,8.857743
7,2016/11,21451.0,2314.0,1710.0,2375.0,77.023339,8.308797,6.140036,8.527828
8,2016/12,20909.0,2231.0,1663.0,2222.0,77.369103,8.255319,6.153562,8.222017
9,2017/01,17905.0,2046.0,1535.0,2062.0,76.036181,8.688636,6.5186,8.756582


In [80]:
newAliveList = []
for name in aliveList:
    if("新臺幣" in name):
        name = name.replace("新臺幣", "台幣")
    if("新台幣" in name):
        name = name.replace("新台幣", "台幣")
    if("台幣類型" in name):
        name = name.replace("台幣類型", "台幣")
    if("N" in name):
        name = name.replace("N", "N類型")
    if("-N類型" in name):
        name = name.replace("-N類型", "")
        name = name.replace("基金", "基金N類型")
    if("級別" in name):
        if("台幣" in name):
            name = name.replace("台幣級別", "(台幣)")
        if("人民幣" in name):
            name = name.replace("人民幣級別", "(人民幣)")
        if("美元" in name):
            name = name.replace("美元級別", "(美元)")
    if("-" in name):
        name = name.replace("-", "(", 1) + ")"
    if("人民幣類型" in name):
        name = name.replace("類型", "")
    if("兆豐國際中國A股基金(美金)" in name):
        name = name.replace("美金", "美元")
    if("安聯中國策略基金" in name):
        name = name + "(台幣)"
    if("台新中國精選中小基金" in name):
        name = name + "(台幣)"
    if("摩根中國A股基金" in name):
        if("摩根中國A股基金(美元)" in name):
            pass
        else:
            name = name + "(台幣)"
    if("瀚亞中國" in name):
        name = name.replace("中國", "中國A股")
    if("野村新興傘型基金之大俄羅斯基金" in name):
        name = name.replace("野村新興傘型基金之大俄羅斯基金", "野村大俄羅斯基金")
    if("野村雙印傘型基金之印度潛力基金" in name):
        name = name.replace("野村雙印傘型基金之印度潛力基金", "野村印度潛力基金")
    newAliveList.append(name)
# newAliveList

In [81]:
newAliveList

['兆豐國際中國A股基金(台幣)',
 '安聯全球生技趨勢基金(台幣)',
 '台新印度基金',
 '保德信全球醫療生化基金(台幣)',
 '安聯中國東協基金',
 '群益全球關鍵生技基金(台幣)',
 '日盛中國內需動力基金',
 '元大印度基金',
 '野村印度潛力基金',
 '安聯中國策略基金(台幣)',
 '兆豐國際中國A股基金(美元)',
 '瀚亞巴西基金',
 '野村全球生技醫療基金',
 '國泰中國內需增長基金(台幣)',
 '摩根中國亮點基金',
 '復華中國新經濟A股基金(台幣)',
 '野村中國機會基金',
 '群益中國新機會基金(台幣)',
 '野村大俄羅斯基金',
 '台新中國精選中小基金(台幣)',
 '野村巴西基金',
 '第一金中國世紀基金(台幣)',
 '德信中國精選成長基金']

In [82]:
# df_corrects[172][df_corrects[172]['基金名稱'] =='瀚亞巴西基金']['本月定時定額扣款人數']

In [83]:
aliveDF

Unnamed: 0,name,uid
0,兆豐國際中國A股基金(台幣),38592074A
1,安聯全球生技趨勢基金-新臺幣,14693593A
2,台新印度基金,26322646
3,保德信全球醫療生化基金-新臺幣,14692638A
4,安聯中國東協基金,25680150
5,群益全球關鍵生技基金-新臺幣,38486708A
6,日盛中國內需動力基金,26323737
7,元大印度基金,25622390
8,野村雙印傘型基金之印度潛力基金,26278374
9,安聯中國策略基金,25589337


In [84]:
def country_classifier(newAliveList):
    country=dict()
    china_id=[]
    china_n=[]
    india_id=[]
    india_n=[]
    brazil_id=[]
    brazil_n=[]
    russia_id=[]
    russia_n=[]
    for funding,uid in zip(newAliveList['name'],newAliveList['uid']):
        if '中國' in funding:
            china_id.append(uid)
            china_n.append(funding)
        elif '巴西' in funding:
            brazil_id.append(uid)
            brazil_n.append(funding)
        elif '印度' in funding:
            india_id.append(uid)
            india_n.append(funding)
        elif '俄羅斯' in funding:
            russia_id.append(uid)
            russia_n.append(funding)
    country_id={
        'China':china_id,
        'Brazil':brazil_id,
        'India':india_id,
        'Russia':russia_id
    }
    country_n={
        'China':china_n,
        'Brazil':brazil_n,
        'India':india_n,
        'Russia':russia_n
        
    }
    return country_n,country_id

In [85]:
result_n,result_id=country_classifier(aliveDF)

In [86]:
def get_funding(dataframe):
    start_date = "2016/04" 
    stop_date="2019/03"
    start=datetime.strptime(start_date,"%Y/%m")
    stop=datetime.strptime(stop_date,"%Y/%m")
    arr_smonth=[]
    while start<=stop:
        arr_smonth.append(start.strftime('%Y/%m')) 
        start = start +  relativedelta(months=1)
    sip_people={}
    sip_people['Date']=arr_smonth
    for name,item in zip(dataframe['name'],dataframe['uid']):
        arr=[]
        for i in range(14*12+4,17*12+4):
            a=df_corrects[i][df_corrects[i]['基金統編']==item]['本月定時定額扣款人數'].values[0]
            arr.append(a)
        sip_people[item]=arr        
    return pd.DataFrame(sip_people)

In [87]:
result_n['China']

['兆豐國際中國A股基金(台幣)',
 '安聯中國東協基金',
 '日盛中國內需動力基金',
 '安聯中國策略基金',
 '兆豐國際中國A股基金(美金)',
 '國泰中國內需增長基金台幣級別',
 '摩根中國亮點基金',
 '復華中國新經濟A股基金-新臺幣',
 '野村中國機會基金',
 '群益中國新機會基金-新臺幣',
 '台新中國精選中小基金',
 '第一金中國世紀基金-新臺幣',
 '德信中國精選成長基金']

In [88]:
result_id['China']

['38592074A',
 '25680150',
 '26323737',
 '25589337',
 '38592074B',
 '26317735A',
 '25627402',
 '42303511A',
 '25530596',
 '38524463A',
 '25691213',
 '25620512A',
 '25689835']

In [89]:
def get_moneys(ids):
    sip_moneys={}
    for item in ids:
        sip_money=[]
        for i in range(14*12+4,17*12+4):
                    a=df_corrects[i][df_corrects[i]['基金統編']==item]['本月定時定額扣款金額'].values[0]
                    sip_money.append(a)
                    sip_moneys[item]=sip_money
    return pd.DataFrame(sip_moneys)

In [90]:
china_sip_money=get_moneys(result_id['China'])
for item in  result_id['China']:
    china_sip_money[item]= china_sip_money[item]/china_sip_money[item].max()
china_sip_money.columns = result_n['China']

In [91]:
brazil_sip_money=get_moneys(result_id['Brazil'])
for item in  result_id['Brazil']:
    brazil_sip_money[item]= brazil_sip_money[item]/brazil_sip_money[item].max()
brazil_sip_money.columns = result_n['Brazil']

In [92]:
russia_sip_money=get_moneys(result_id['Russia'])
for item in  result_id['Russia']:
    russia_sip_money[item]= russia_sip_money[item]/russia_sip_money[item].max()
russia_sip_money.columns = result_n['Russia']
# russia_sip_money

In [93]:
india_sip_money=get_moneys(result_id['India'])
for item in  result_id['India']:
    india_sip_money[item]= india_sip_money[item]/india_sip_money[item].max()
india_sip_money.columns = result_n['India']

# 計算相關係數--人數、價格、消息面

In [94]:

def get_country_emo():
    countrys=['china','brazil','russia','india']
    d_dict={}

    for country in countrys:
        with open('../emotion/'+country+'_emo.json', 'r') as read_file:
            arr=[]
            dict_data = json.load(read_file)
            arr= [values for key,values in  dict_data.items()]
            d_dict[country]=arr
    return pd.DataFrame(d_dict)
emo_df=get_country_emo()
emo_df=emo_df.fillna(value=0)

In [95]:

def get_money_people(id_data,name_data,people_dataframe):
    dfs=get_funding(people_dataframe)
    sip_money=get_moneys(id_data)
    a={}
    b={}
    df=dfs[id_data]
    for item in id_data:
        a[item]=sip_money[item]
        b[item]=df[item]
    correlation = {}
    for item in id_data:
        correlation[item]=stats.pearsonr(a[item],b[item])[0]
    cor_dataframe=pd.DataFrame(correlation,index=[0])
    cor_dataframe.columns=name_data
    return cor_dataframe

In [128]:
def get_money_shift_people(id_data,name_data,people_dataframe):
    dfs=get_funding(people_dataframe)
    sip_money=get_moneys(id_data)
    
    a={}
    b={}
    df=dfs[id_data]
    for item in id_data:
        a[item]=sip_money[item][1:] # header index
        b[item]=df[item][:-1] # object index
#     import ipdb;ipdb.set_trace()
    correlation = {}
    for item in id_data:
        correlation[item]=stats.pearsonr(a[item],b[item])[0]
    cor_dataframe=pd.DataFrame(correlation,index=[0])
    cor_dataframe.columns=name_data
    return cor_dataframe

In [None]:
df_money_people=get_money_shift_people(result_id['China'],result_n['China'],aliveDF)

> [0;32m<ipython-input-128-e8332b3dc2bf>[0m(12)[0;36mget_money_shift_people[0;34m()[0m
[0;32m     11 [0;31m    [0;32mimport[0m [0mipdb[0m[0;34m;[0m[0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 12 [0;31m    [0mcorrelation[0m [0;34m=[0m [0;34m{[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m    [0;32mfor[0m [0mitem[0m [0;32min[0m [0mid_data[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> a
id_data = ['38592074A', '25680150', '26323737', '25589337', '38592074B', '26317735A', '25627402', '42303511A', '25530596', '38524463A', '25691213', '25620512A', '25689835']
name_data = ['兆豐國際中國A股基金(台幣)', '安聯中國東協基金', '日盛中國內需動力基金', '安聯中國策略基金', '兆豐國際中國A股基金(美金)', '國泰中國內需增長基金台幣級別', '摩根中國亮點基金', '復華中國新經濟A股基金-新臺幣', '野村中國機會基金', '群益中國新機會基金-新臺幣', '台新中國精選中小基金', '第一金中國世紀基金-新臺幣', '德信中國精選成長基金']
people_dataframe =                name        uid
0    兆豐國際中國A股基金(台幣)  38592074A
1    安聯全球生技趨勢基金-新臺幣  14693593A


ipdb> print(b)
{'38592074A': 0      971.0
1      998.0
2     1016.0
3     1006.0
4      995.0
5      991.0
6     1000.0
7      999.0
8      978.0
9      971.0
10     966.0
11     949.0
12     938.0
13     940.0
14     933.0
15     937.0
16     936.0
17     925.0
18     935.0
19     941.0
20     921.0
21     935.0
22     937.0
23     927.0
24     929.0
25     953.0
26     989.0
27    1016.0
28    1065.0
29    1116.0
30    1178.0
31    1222.0
32    1252.0
33    1315.0
34    1353.0
Name: 38592074A, dtype: float64, '25680150': 0     1295.0
1     1293.0
2     1258.0
3     1215.0
4      964.0
5     1233.0
6     1245.0
7     1216.0
8     1192.0
9     1162.0
10    1155.0
11    1132.0
12    1110.0
13    1099.0
14    1091.0
15    1096.0
16    1107.0
17    1108.0
18    1098.0
19    1093.0
20    1075.0
21    1070.0
22    1083.0
23    1070.0
24    1065.0
25    1063.0
26    1096.0
27    1111.0
28    1112.0
29    1127.0
30    1134.0
31    1139.0
32    1157.0
33    1194.0
34    1251.0
Name: 25680150, 

In [96]:
def get_news_people(emo_df,id_data,name_data,people_dataframe):
    percentage=emo_df
    dfs=get_funding(people_dataframe)
    b={}
    df=dfs[id_data]
    correlation={}
    for item in id_data:
        b[item]=df[item]
    for item in id_data:
        correlation[item]=stats.pearsonr(percentage,b[item])[0]
    cor_dataframe=pd.DataFrame(correlation,index=[0])
    cor_dataframe.columns=name_data
    return cor_dataframe

In [97]:
def get_news_money(emo_df,id_data,name_data):
    sip_money=get_moneys(id_data)
    percentage=emo_df
    a={}
    for item in id_data:
        a[item]=sip_money[item]
    correlation = {}
    for item in id_data:
        correlation[item]=stats.pearsonr(percentage,a[item])[0]
    
    cor_dataframe=pd.DataFrame(correlation,index=[0])
    
    cor_dataframe.columns=name_data
    return cor_dataframe

In [98]:
def merge(df_money_people,df_news_people,df_news_money):
    df_all=pd.concat([df_money_people,df_news_people,df_news_money],axis=0, ignore_index=True)
    df_all=df_all.rename(index={0:"money_people",1:"news_people",2:"news_money"})
    return df_all

In [101]:
df_money_people=get_money_people(result_id['China'],result_n['China'],aliveDF)
df_news_people=get_news_people(emo_df['china'],result_id['China'],result_n['China'],aliveDF)
df_news_money=get_news_money(emo_df['china'],result_id['China'],result_n['China'])
china=merge(df_money_people,df_news_people,df_news_money)

In [102]:
russia_money_people=get_money_people(result_id['Russia'],result_n['Russia'],aliveDF)
russia_news_money=get_news_money(emo_df['russia'],result_id['Russia'],result_n['Russia'])
russia_news_people=get_news_people(emo_df['russia'],result_id['Russia'],result_n['Russia'],aliveDF)
russia=merge(russia_money_people,russia_news_people,russia_news_money)

In [103]:
brazil_money_people=get_money_people(result_id['Brazil'],result_n['Brazil'],aliveDF)
brazil_news_money=get_news_money(emo_df['brazil'],result_id['Brazil'],result_n['Brazil'])
brazil_news_people=get_news_people(emo_df['brazil'],result_id['Brazil'],result_n['Brazil'],aliveDF)
brazil=merge(brazil_money_people,brazil_news_people,brazil_news_money)

In [104]:
india_money_people=get_money_people(result_id['India'],result_n['India'],aliveDF)
india_news_money=get_news_money(emo_df['india'],result_id['India'],result_n['India'])
india_news_people=get_news_people(emo_df['india'],result_id['India'],result_n['India'],aliveDF)
india=merge(india_money_people,india_news_money,india_news_people)

In [114]:
import os
os.makedirs("correlation", exist_ok=True)
f = open('./correlation/china_corr.json','w')
f.write(china.to_json(force_ascii=False))
f.close()

In [115]:
f = open('./correlation/russia_corr.json','w')
f.write(russia.to_json(force_ascii=False))
f.close()

In [116]:
f = open('./correlation/india_corr.json','w')
f.write(india.to_json(force_ascii=False))
f.close()

In [117]:
f = open('./correlation/brazil_corr.json','w')
f.write(india.to_json(force_ascii=False))
f.close()

In [36]:
df=get_funding(aliveDF)
df_china=df[result_id['China']]
df_china.columns = [result_n['China']]
df_china
print(df_china.to_json(force_ascii=False))
f = open('china.json','w')
f.write(df_china.to_json(force_ascii=False))
f.close()

{"["日盛中國內需動力基金"]":{"0":548,"1":567,"2":575,"3":568,"4":556,"5":545,"6":536,"7":533,"8":525,"9":494,"10":515,"11":511,"12":498,"13":488,"14":498,"15":493,"16":484,"17":487,"18":463,"19":437,"20":425,"21":415,"22":426,"23":420,"24":398,"25":404,"26":394,"27":396,"28":398,"29":408,"30":410,"31":410,"32":409,"33":392,"34":406,"35":395},"["野村中國機會基金"]":{"0":1540,"1":1558,"2":1503,"3":1505,"4":1436,"5":1406,"6":1386,"7":1365,"8":1365,"9":1290,"10":1285,"11":1259,"12":1243,"13":1231,"14":1233,"15":1244,"16":1235,"17":1233,"18":1219,"19":1220,"20":1216,"21":1215,"22":1223,"23":1199,"24":1201,"25":1222,"26":1259,"27":1288,"28":1343,"29":1423,"30":1557,"31":1657,"32":1685,"33":1746,"34":1794,"35":1811},"["第一金中國世紀基金-新臺幣"]":{"0":4285,"1":5539,"2":5556,"3":5523,"4":5377,"5":5200,"6":5233,"7":5188,"8":4886,"9":4139,"10":5181,"11":4698,"12":4622,"13":4598,"14":4557,"15":4614,"16":4635,"17":4633,"18":4521,"19":4423,"20":3845,"21":3896,"22":5240,"23":4301,"24":4269,"25":4285,"26":4155,"27":3982,"28":393

In [37]:
df_india=df[result_id['India']]
df_india.columns = [result_n['India']]
df_india
f = open('india.json','w')
f.write(df_india.to_json(force_ascii=False))
f.close()

In [39]:
df_russia=df[result_id['Russia']]
df_russia.columns = [result_n['Russia']]
df_russia
f = open('russia.json','w')
f.write(df_russia.to_json(force_ascii=False))
f.close()

In [38]:
df_brazil=df[result_id['Brazil']]
df_brazil.columns = [result_n['Brazil']]
f = open('brazil.json','w')
f.write(df_brazil.to_json(force_ascii=False))
f.close()

In [80]:
f = open('russia_percentage.json','w')
f.write(russia_sip_money.to_json(force_ascii=False))
f.close()

In [81]:
f = open('india_percentage.json','w')
f.write(india_sip_money.to_json(force_ascii=False))
f.close()

In [82]:
f = open('brazil_percentage.json','w')
f.write(brazil_sip_money.to_json(force_ascii=False))
f.close()

In [83]:
f = open('china_percentage.json','w')
f.write(china_sip_money.to_json(force_ascii=False))
f.close()

In [None]:
pd.set_option('display.max_rows', None)
df2 = pd.read_html("https://www.moneydj.com/funddj/yb/YP302000.djhtm?a=ET003001")
df3 = pd.read_html("https://www.moneydj.com/funddj/yb/yp302000.djhtm?a=ET003002")
df4 = pd.read_html("https://www.moneydj.com/funddj/yb/yp302000.djhtm?a=ET003003")
dfFinal = pd.concat([df2[4], df3[4], df4[4]], axis=0)
dfFinal

In [None]:
for name in newAliveList:
    filterName = (dfFinal["基金名稱"]["基金名稱"] == name)
    try:
        print(dfFinal[filterName]["報酬率(%)"]["六個月"].values[0])
    except:
        print(name)