Import the libraries needed

In [1]:
import pandas as pd
import numpy as np
import os
import pyodbc
# NOTE(review): the return value of this call is discarded (it is not the
# cell's last expression), so nothing is displayed for it.
pyodbc.drivers()

from datetime import datetime
import psutil

# For the Yahoo Finance api
# NOTE(review): pdr / yf are not referenced by any of the cells visible in
# this notebook section — confirm whether these imports are still needed.
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override() # <== that's all it takes :-)



### All the data used has been provisioned in Get_The_Data.ipynb

In [2]:
# Show every row when a DataFrame is displayed.
# The full option key is used on purpose: an abbreviated pattern such as
# 'max_row' can match more than one pandas option and raise an OptionError.
pd.set_option('display.max_rows', None)

# Set a value for the home folder.
home_folder = "."

# Set values for the various paths.
# os.path.join avoids the invalid "\d" escape sequence and uses the
# separator of the platform the notebook runs on.
input_path = os.path.join(home_folder, "data")

# Print a start-of-run banner: current timestamp and system memory usage.
print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

2024-07-19 18:41:18.592498
svmem(total=16756752384, available=4477337600, percent=73.3, used=12279414784, free=4477337600)


# 1. Get all iSuite policies that can have switchable funds.

In [3]:
# Load the full policy population extract.
# os.path.join avoids the invalid "\F" escape sequence and the hard-coded
# Windows separator of the original string concatenation.
filename = os.path.join(input_path, 'Full_Population.csv')
df_Full_Population = pd.read_csv(filename, delimiter=',', low_memory=False, header=0)

# Row count and source file, followed by a preview of the first rows.
print(len(df_Full_Population.index))

print(filename)
df_Full_Population.head(5)

8411354
.\data\Full_Population.csv


Unnamed: 0,ContractNumber,ProdCat,ProdCode,PolicyStatus,CoverEndDt,SnapshotDt,SnapshotYrMth
0,6074160W,XRPP,XRPP_2005V1,I,2028-12-01,2016-07-31,201607
1,6074937L,XRPP,XRPP_2005V1,I,2033-12-01,2016-07-31,201607
2,6074962M,XRPP,XRPP_2005V1,I,2042-12-01,2016-07-31,201607
3,6075850C,XRPP,XRPP_2005V1,I,2034-12-01,2016-07-31,201607
4,6100638P,XRPP,XRPP_2005V1,I,2046-04-01,2016-07-31,201607


## Clean the data.

In [4]:
# Drop rows where policies are not in force (keep PolicyStatus == 'I').
df_Count = df_Full_Population[df_Full_Population['PolicyStatus'] == 'I']

# Drop the columns that are not needed for the monthly count.
df_Count = df_Count.drop(
    columns=['ContractNumber', 'ProdCat', 'ProdCode', 'PolicyStatus', 'CoverEndDt']
)

# Count the remaining rows per snapshot month; reset_index turns the result
# into a two-column frame (SnapshotYrMth, counts).
df_Full_Population_Count = df_Count[['SnapshotYrMth']].value_counts().reset_index(name='counts')

# Sort by snapshot month.
df_Full_Population_Count = df_Full_Population_Count.sort_values(by=['SnapshotYrMth'])

# Filter out old dates: keep snapshot months after 201812.
df_Full_Population_Count = df_Full_Population_Count[df_Full_Population_Count.SnapshotYrMth > 201812]

# Row count and column types of the monthly count table.
print(len(df_Full_Population_Count.index))
print(df_Full_Population_Count.dtypes)

# Write out to a comma separated values file.
# os.path.join avoids the invalid "\F" escape sequence in the original path.
filename = os.path.join(input_path, 'Full_Population_Count.csv')
df_Full_Population_Count.to_csv(filename, encoding='utf-8', index=False)

print(filename)

66
SnapshotYrMth    int64
counts           int64
dtype: object
.\data\Full_Population_Count.csv


# 2. Get all fund switch records.

In [5]:
# Build the path to the fund switch extract.
# os.path.join avoids the invalid "\A" escape sequence and the hard-coded
# Windows separator of the original string concatenation.
All_Switches = os.path.join(input_path, 'All_Switches.csv')

df_All_Switches = pd.read_csv(All_Switches, delimiter=',', low_memory=False, header=0)

# Filter out old dates.
# NOTE: PROCESSING_DATE is not parsed on load, so this is a string
# (lexicographic) comparison against '20181231', not a date comparison.
# It keeps 2019 onwards only because the separator after the year sorts
# before the digit '1'; values with leading whitespace would be dropped.
df_All_Switches = df_All_Switches[df_All_Switches.PROCESSING_DATE > '20181231']

# print stats.
print(len(df_All_Switches.index))
df_All_Switches.head(5)

201805


Unnamed: 0,index,POLICY_NUMBER,LEGACY_FUND_CODE,PRODUCT,PROCESSING_DATE,EFFECTIVE_DATE,UNIT_TYPE,CHARGE_TYPE,CASH_VALUE,RISK_COMMENCEMENT_DATE,Servicing_Broker_Code,Issue_Date,Cover_End_Date,Anniversary_Date,Source,SOURCE
0,0,6535239K,XMLEM,GNRP_2012V1,2019-01-02,2018-12-28,A,B,2763.74,2013-12-19,A28E,2013-12-19,1900-01-01,1900-01-01,,P
1,1,6535239K,XSK10,GNRP_2012V1,2019-01-02,2018-12-28,A,B,-5527.48,2013-12-19,A28E,2013-12-19,1900-01-01,1900-01-01,,P
2,2,6535239K,XSKN3,GNRP_2012V1,2019-01-02,2018-12-28,A,B,2763.74,2013-12-19,A28E,2013-12-19,1900-01-01,1900-01-01,,P
3,3,6562520M,XSDIV,GNRP_2012V1,2019-01-02,2018-12-27,A,B,10864.3901,2014-12-25,A947,2014-12-30,1900-01-01,1900-01-01,,P
4,4,6562520M,XSEDD,GNRP_2012V1,2019-01-02,2018-12-27,A,B,-10864.3901,2014-12-25,A947,2014-12-30,1900-01-01,1900-01-01,,P


## Create the Fund Switch Count dataframe.

In [6]:
# Parse the processing date and derive a 'YYYYMM' month key from it.
# The loaded values are displayed as YYYY-MM-DD, while the original format
# string was '%Y/%m/%d'; strict format checking (pandas 2.x) rejects that
# mismatch. Any '/' separators are normalised to '-' so both spellings parse
# with one explicit format.
df_All_Switches['PROCESSING_DATE'] = pd.to_datetime(
    df_All_Switches['PROCESSING_DATE'].str.strip().str.replace('/', '-', regex=False),
    format='%Y-%m-%d',
)
df_All_Switches['ProcessingtDt'] = df_All_Switches['PROCESSING_DATE'].dt.strftime('%Y%m')

# Drop obsolete columns.
df_Switches_Count = df_All_Switches.drop(
    columns=[
        'LEGACY_FUND_CODE', 'PRODUCT', 'EFFECTIVE_DATE', 'UNIT_TYPE',
        'CHARGE_TYPE', 'CASH_VALUE', 'RISK_COMMENCEMENT_DATE', 'Cover_End_Date',
        'index', 'Servicing_Broker_Code', 'Issue_Date', 'Anniversary_Date',
        'SOURCE', 'PROCESSING_DATE',
    ]
)

# Stats
print(len(df_Switches_Count.index))
df_Switches_Count.head(5)

201805


Unnamed: 0,POLICY_NUMBER,Source,ProcessingtDt
0,6535239K,,201901
1,6535239K,,201901
2,6535239K,,201901
3,6562520M,,201901
4,6562520M,,201901


In [7]:
# Keep a single row per policy and processing month (first occurrence wins),
# then discard the policy number, which is no longer needed.
df_Switches_Count = (
    df_Switches_Count
    .drop_duplicates(subset=['POLICY_NUMBER', 'ProcessingtDt'], keep='first')
    .drop(columns=['POLICY_NUMBER'])
)

# Row count, followed by a preview of the first rows.
print(df_Switches_Count.shape[0])
df_Switches_Count.head(5)

7820


Unnamed: 0,Source,ProcessingtDt
0,,201901
3,,201901
5,,201901
9,,201901
144,,201901


In [8]:
# Count the rows per processing month, turn the result into a two-column
# frame (ProcessingtDt, counts) and order it by month.
df_Switches_Count = (
    df_Switches_Count[['ProcessingtDt']]
    .value_counts()
    .reset_index(name='counts')
    .sort_values(by=['ProcessingtDt'])
)

# Number of months, followed by a preview of the first rows.
print(df_Switches_Count.shape[0])
df_Switches_Count.head(5)

67


Unnamed: 0,ProcessingtDt,counts
29,201901,75
63,201902,32
21,201903,91
48,201904,50
51,201905,48


## Merge the full population counts with the switch counts

In [9]:
# Rename the count columns so they stay distinguishable after the merge.
# Reassignment is used instead of inplace=True; the resulting frames are the same.
df_Full_Population_Count = df_Full_Population_Count.rename(columns={'counts': 'PopCount'})
df_Switches_Count = df_Switches_Count.rename(columns={'counts': 'SwitchCount'})

# Convert the 'YYYYMM' text key to int64 so it can be matched against
# SnapshotYrMth; values that cannot be parsed become 0.
df_Switches_Count['ProcessingtDt'] = (
    pd.to_numeric(df_Switches_Count['ProcessingtDt'], errors="coerce")
    .fillna(0)
    .astype('int64')
)

# Left merge: keep every population month, attach the switch count where one exists.
df_Horizon_Switch_Counts = pd.merge(
    df_Full_Population_Count,
    df_Switches_Count,
    left_on=['SnapshotYrMth'],
    right_on=['ProcessingtDt'],
    how='left',
)

# Drop obsolete columns.
df_Horizon_Switch_Counts = df_Horizon_Switch_Counts.drop(columns=['ProcessingtDt'])

# Replace NULLs: a month without switches comes out of the left merge as NaN,
# which also turns the column into float. Fill with 0 and cast back to int64
# so SwitchCount is always an integer count.
df_Horizon_Switch_Counts['SwitchCount'] = (
    df_Horizon_Switch_Counts['SwitchCount'].fillna(0).astype('int64')
)

# Types and row count
print(len(df_Full_Population_Count.index))
print(df_Full_Population_Count.dtypes)
print(len(df_Switches_Count.index))
print(df_Switches_Count.dtypes)
print(len(df_Horizon_Switch_Counts.index))
print(df_Horizon_Switch_Counts.dtypes)

df_Horizon_Switch_Counts.head(5)

66
SnapshotYrMth    int64
PopCount         int64
dtype: object
67
ProcessingtDt    int64
SwitchCount      int64
dtype: object
66
SnapshotYrMth    int64
PopCount         int64
SwitchCount      int64
dtype: object


Unnamed: 0,SnapshotYrMth,PopCount,SwitchCount
0,201901,62801,75
1,201902,63720,32
2,201903,64775,91
3,201904,65713,50
4,201905,66774,48


## Save the data to file.

In [10]:
# Write out to a comma separated values file.
# os.path.join avoids the invalid "\H" escape sequence and the hard-coded
# Windows separator of the original string concatenation.
filename = os.path.join(input_path, 'Horizon_Switch_Counts.csv')
df_Horizon_Switch_Counts.to_csv(filename, encoding='utf-8', index=False)

print(filename)

# Row count, followed by a preview of the first rows.
print(len(df_Horizon_Switch_Counts.index))
df_Horizon_Switch_Counts.head(5)

.\data\Horizon_Switch_Counts.csv
66


Unnamed: 0,SnapshotYrMth,PopCount,SwitchCount
0,201901,62801,75
1,201902,63720,32
2,201903,64775,91
3,201904,65713,50
4,201905,66774,48


In [11]:
# List the names of the DataFrame variables currently held in the notebook namespace.
%who_ls DataFrame 

['df_All_Switches',
 'df_Count',
 'df_Full_Population',
 'df_Full_Population_Count',
 'df_Horizon_Switch_Counts',
 'df_Switches_Count']

In [12]:
# Remove the DataFrame variables created by this notebook from the namespace.
del (
    df_All_Switches,
    df_Count,
    df_Full_Population,
    df_Full_Population_Count,
    df_Horizon_Switch_Counts,
    df_Switches_Count,
)

# Print an end-of-run banner: current timestamp and system memory usage.
now = datetime.now()
print(
    '==========================',
    now,
    '==========================',
    psutil.virtual_memory(),
    '==========================',
    sep='\n',
)

2024-07-19 18:42:33.001749
svmem(total=16756752384, available=4439175168, percent=73.5, used=12317577216, free=4439175168)
