Import the libraries needed

In [1]:
# Data-handling libraries.
import pandas as pd
import numpy as np
import os
import pyodbc
# NOTE(review): presumably lists the installed ODBC drivers. The return value is
# discarded here because this call is not the last expression of the cell.
pyodbc.drivers()

# Timestamp and memory reporting used by the progress print-outs in later cells.
from datetime import datetime
import psutil

# For the Yahoo Finance API
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override() # NOTE(review): presumably routes pandas_datareader calls through yfinance - confirm it is still needed



### All the data used has been provisioned in Get_The_Data.ipynb

In [2]:
# Show every row when a DataFrame is displayed.
# The option is addressed by its full name: a partial pattern such as 'max_row'
# is resolved by regex and fails as soon as it matches more than one option.
pd.set_option('display.max_rows', None)

# Set a value for the home folder.
home_folder = "."

# Set values for the various paths.
# os.path.join is used instead of the literal "\data", whose backslash starts
# an invalid escape sequence in a normal string literal.
input_path = os.path.join(home_folder, "data")

# Progress report: current timestamp and memory usage.
print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

2024-08-05 22:10:37.078179
svmem(total=16756752384, available=8607416320, percent=48.6, used=8149336064, free=8607416320)


# 1. Get all iSuite policies that can have switchable funds.

In [3]:
# Load the full policy population from the data folder.
# os.path.join replaces the hand-built '\Full_Population.csv' suffix, whose
# backslash starts an invalid escape sequence in a normal string literal.
filename = os.path.join(input_path, 'Full_Population.csv')
df_Full_Population = pd.read_csv(filename, delimiter=',', low_memory=False, header=0)

# Row count and the file that was read.
print(len(df_Full_Population.index))

print(filename)

# Rows are deliberately not displayed here: the data contains PII.

8411354
.\data\Full_Population.csv


## Clean the data.

In [4]:
# Keep only the rows where the policy is in force (PolicyStatus == 'I').
df_Count = df_Full_Population[df_Full_Population['PolicyStatus'] == 'I']

# Drop the columns that are not needed for the count (one call instead of five).
df_Count = df_Count.drop(
    columns=['ContractNumber', 'ProdCat', 'ProdCode', 'PolicyStatus', 'CoverEndDt']
)

# Count the remaining rows per snapshot month; the tally is stored in 'counts'.
df_Full_Population_Count = df_Count[['SnapshotYrMth']].value_counts().reset_index(name='counts')

# Sort by snapshot date.
df_Full_Population_Count = df_Full_Population_Count.sort_values(by=['SnapshotYrMth'])

# Filter out old dates: keep snapshots after 201812.
df_Full_Population_Count = df_Full_Population_Count[df_Full_Population_Count.SnapshotYrMth > 201812]

# Row count and column types.
# (A bare .head(5) in the middle of a cell displays nothing, so it was removed.)
print(len(df_Full_Population_Count.index))
print(df_Full_Population_Count.dtypes)

# Write out to a comma separated values file.
# os.path.join avoids the invalid '\F' escape of the hand-built path suffix.
filename = os.path.join(input_path, 'Full_Population_Count.csv')
df_Full_Population_Count.to_csv(filename, encoding='utf-8', index=False)

print(filename)

66
SnapshotYrMth    int64
counts           int64
dtype: object
.\data\Full_Population_Count.csv


# 2. Get all fund switch records.

In [5]:
# Path of the fund switch extract inside the data folder.
# os.path.join avoids the invalid '\A' escape of the hand-built path suffix.
All_Switches = os.path.join(input_path, 'All_Switches.csv')

df_All_Switches = pd.read_csv(All_Switches, delimiter=',', low_memory=False, header=0)

# Filter out old dates.
# PROCESSING_DATE is still text at this point, so the comparison is lexicographic.
# The cut-off is written in the same 'YYYY/MM/DD' layout the column is later parsed
# with, and the values are stripped first so that padded dates are not dropped.
# .copy() yields an independent frame, so later column assignments do not act on a slice.
df_All_Switches = df_All_Switches[
    df_All_Switches['PROCESSING_DATE'].str.strip() > '2018/12/31'
].copy()

# print stats.
print(len(df_All_Switches.index))

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

# Rows are deliberately not displayed here: the data contains PII.

201805
2024-08-05 22:11:26.311259
svmem(total=16756752384, available=7925149696, percent=52.7, used=8831602688, free=7925149696)


## Create the Fund Switch Count dataframe.

In [6]:
# Parse PROCESSING_DATE from 'YYYY/MM/DD' text (surrounding whitespace removed) to datetime.
# The dtype guard keeps this cell re-runnable: once the column has been converted,
# it no longer supports the .str accessor and a second run would otherwise fail.
if not pd.api.types.is_datetime64_any_dtype(df_All_Switches['PROCESSING_DATE']):
    df_All_Switches['PROCESSING_DATE'] = pd.to_datetime(
        df_All_Switches['PROCESSING_DATE'].str.strip(), format='%Y/%m/%d'
    )

# Year-month key as 'YYYYMM' text.
df_All_Switches['ProcessingtDt'] = df_All_Switches['PROCESSING_DATE'].dt.strftime('%Y%m')

# Drop obsolete columns.
df_Switches_Count = df_All_Switches.drop(
    columns=[
        'LEGACY_FUND_CODE', 'PRODUCT', 'EFFECTIVE_DATE', 'UNIT_TYPE',
        'CHARGE_TYPE', 'CASH_VALUE', 'RISK_COMMENCEMENT_DATE', 'Cover_End_Date',
        'index', 'Servicing_Broker_Code', 'Issue_Date', 'Anniversary_Date',
        'SOURCE', 'PROCESSING_DATE',
    ]
)

# Stats
print(len(df_Switches_Count.index))

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

# Rows are deliberately not displayed here: the data contains PII.

201805
2024-08-05 22:11:33.171509
svmem(total=16756752384, available=7880830976, percent=53.0, used=8875921408, free=7880830976)


In [7]:
# Keep the first row for each (policy, processing month) pair, then discard
# the policy number, which is no longer needed once duplicates are gone.
df_Switches_Count = (
    df_Switches_Count
    .drop_duplicates(subset=['POLICY_NUMBER', 'ProcessingtDt'], keep='first')
    .drop(columns=['POLICY_NUMBER'])
)

# Remaining row count.
print(df_Switches_Count.shape[0])

# Progress report: timestamp and memory usage between separator lines.
now = datetime.now()
print(
    '==========================',
    now,
    '==========================',
    psutil.virtual_memory(),
    '==========================',
    sep='\n',
)

# Displays stay switched off to protect PII.
#df_Switches_Count.head(5)

7820
2024-08-05 22:11:36.983044
svmem(total=16756752384, available=7882641408, percent=53.0, used=8874110976, free=7882641408)


In [8]:
# Count the rows per processing month, store the tally in 'counts',
# and order the result by processing month.
df_Switches_Count = (
    df_Switches_Count[['ProcessingtDt']]
    .value_counts()
    .reset_index(name='counts')
    .sort_values(by=['ProcessingtDt'])
)

# Number of distinct months.
print(df_Switches_Count.shape[0])

# Progress report: timestamp and memory usage between separator lines.
now = datetime.now()
print(
    '==========================',
    now,
    '==========================',
    psutil.virtual_memory(),
    '==========================',
    sep='\n',
)

# Displays stay switched off to protect PII.
#df_Switches_Count.head(5)

67
2024-08-05 22:11:40.875632
svmem(total=16756752384, available=7888547840, percent=52.9, used=8868204544, free=7888547840)


## Merge the full population counts with the switch counts

In [9]:
# Rename the count columns so they stay distinguishable after the merge.
# Plain assignment is used instead of inplace=True.
df_Full_Population_Count = df_Full_Population_Count.rename(columns={'counts': 'PopCount'})
df_Switches_Count = df_Switches_Count.rename(columns={'counts': 'SwitchCount'})

# Convert the 'YYYYMM' text key to int64 so it can be matched against SnapshotYrMth;
# values that cannot be parsed as a number become 0.
df_Switches_Count['ProcessingtDt'] = (
    pd.to_numeric(df_Switches_Count['ProcessingtDt'], errors="coerce")
    .fillna(0)
    .astype('int64')
)

# Left join: every population month is kept, with its switch count when one exists.
df_Horizon_Switch_Counts = pd.merge(
    df_Full_Population_Count,
    df_Switches_Count,
    left_on=['SnapshotYrMth'],
    right_on=['ProcessingtDt'],
    how='left',
)

# Drop obsolete columns.
df_Horizon_Switch_Counts = df_Horizon_Switch_Counts.drop(columns=['ProcessingtDt'])

# Replace NULLs: months without any switch get a count of 0.
# A left join turns the column into float64 as soon as one month is unmatched,
# so it is cast back to int64 to keep the counts integral.
df_Horizon_Switch_Counts['SwitchCount'] = (
    df_Horizon_Switch_Counts['SwitchCount'].fillna(0).astype('int64')
)

# Types and row count
print(len(df_Full_Population_Count.index))
print(df_Full_Population_Count.dtypes)
print(len(df_Switches_Count.index))
print(df_Switches_Count.dtypes)
print(len(df_Horizon_Switch_Counts.index))
print(df_Horizon_Switch_Counts.dtypes)


print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

# Rows are deliberately not displayed here, to protect PII.

66
SnapshotYrMth    int64
PopCount         int64
dtype: object
67
ProcessingtDt    int64
SwitchCount      int64
dtype: object
66
SnapshotYrMth    int64
PopCount         int64
SwitchCount      int64
dtype: object
2024-08-05 22:11:45.617531
svmem(total=16756752384, available=7889879040, percent=52.9, used=8866873344, free=7889879040)


## Save the data to file.

In [10]:
# Write out to a comma separated values file.
# os.path.join avoids the invalid '\H' escape of the hand-built path suffix.
filename = os.path.join(input_path, 'Horizon_Switch_Counts.csv')
df_Horizon_Switch_Counts.to_csv(filename, encoding='utf-8', index=False)

print(filename)

# Row count
print(len(df_Horizon_Switch_Counts.index))

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

# Rows are deliberately not displayed here, to protect PII.

.\data\Horizon_Switch_Counts.csv
66
2024-08-05 22:11:50.454841
svmem(total=16756752384, available=7896674304, percent=52.9, used=8860078080, free=7896674304)


In [11]:
# List the names of all DataFrame variables currently held by the kernel.
%who_ls DataFrame 

['df_All_Switches',
 'df_Count',
 'df_Full_Population',
 'df_Full_Population_Count',
 'df_Horizon_Switch_Counts',
 'df_Switches_Count']

In [12]:
# Remove every DataFrame name created by this notebook from the namespace.
del (
    df_All_Switches,
    df_Count,
    df_Full_Population,
    df_Full_Population_Count,
    df_Horizon_Switch_Counts,
    df_Switches_Count,
)

# Final progress report: timestamp and memory usage between separator lines.
now = datetime.now()
print(
    '==========================',
    now,
    '==========================',
    psutil.virtual_memory(),
    '==========================',
    sep='\n',
)

2024-08-05 22:11:54.655664
svmem(total=16756752384, available=8662839296, percent=48.3, used=8093913088, free=8662839296)
