Import the libraries needed

In [1]:
import pandas as pd
#from pandas.compat import StringIO
from pandas.tseries.offsets import DateOffset

from datetime import datetime
import psutil


import numpy as np
import os
import pyodbc
pyodbc.drivers()

import csv

from sklearn.preprocessing import StandardScaler
import seaborn as sns

pd.options.display.max_rows = 1000

# For the Yahoo Finance api
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override() 




### Database connections and SQL to retrieve the various data needed.

In [2]:
SQL_server = 'WINPRDAI1572\CLESQLDW'
Reporting_database = 'CLE_Reporting'

pd.set_option('max_row', None)
#pd.options.display.max_rows = 1000

# Set a value for the home folder.
home_folder = "."

# Set values for the various paths.
input_path = home_folder + "\data"

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

2024-07-19 19:08:35.557796
svmem(total=16756752384, available=3310739456, percent=80.2, used=13446012928, free=3310739456)


# 1. Get supplementary policy features - Premium and Indexation.

  -  Identified in Lit Review.# 1. Read back the population with switches data.

In [3]:
conn = pyodbc.connect(
         'DRIVER={ODBC Driver 17 for SQL Server};'
         'SERVER='+SQL_server+';'
         'DATABASE='+Reporting_database+';'
         'Trusted_Connection=yes;')

premium_query = f"""SELECT pol.Policy_Number                AS Contract_Number
	                      ,pol.Premiums_Per_Year            AS PayFreq
	                      ,pol.Anniversary_Date             AS AnniversaryDt
	                      ,pol.Indexation_Percentage        AS IndexationPercent 
	                      ,pol.Indexation_Start_Date        AS IndexationStartDt 
	                      ,pol.Premium_Holiday_Start        AS PremHolStartDt
	                      ,pol.Premium_Holiday_End          AS PremHolEndDt
	                      ,pol.Regular_Premium              AS Premium
                          ,pol.Premium_Calc_Method          AS PremiumCalcMethod
                 FROM CLT_Policy pol
                 INNER JOIN CLE_Reporting.dbo.CLT_Product prd ON prd.Product_Id = pol.Product_Id
                 WHERE prd.Product_Code IN ('FKP_2006V1','FKP_2006V2','FKP_2008V1','FKP_2010V1','FKP_2012V1','FKP_2017V1',
                                            'GNPP_2016V1','GNPP_2017V1','GNPP_2022V1','GNPP_2023V1',
                                            'GNRP_2008V1','GNRP_2010V1','GNRP_2012V1','GNRP_2017V1','GNRP_2022V1',
                                            'XRPP_2005V1','XRPP_2008V1','XRPP_2010V1','XRPP_2012V1','XRPP_2015V1')
                 ORDER BY pol.Policy_Number;"""

df_premium = pd.read_sql(premium_query, conn)

# Types and row count
# print stats.
print(len(df_premium.index))
now = datetime.now()
print(now)
print('==========================')

print(df_premium.dtypes)
df_premium.head(5)

136827
2024-07-19 19:08:45.657423
Contract_Number       object
PayFreq                int64
AnniversaryDt         object
IndexationPercent    float64
IndexationStartDt     object
PremHolStartDt        object
PremHolEndDt          object
Premium              float64
PremiumCalcMethod     object
dtype: object


Unnamed: 0,Contract_Number,PayFreq,AnniversaryDt,IndexationPercent,IndexationStartDt,PremHolStartDt,PremHolEndDt,Premium,PremiumCalcMethod
0,6074160W,12,2007-12-01,0.07,2007-12-01,,,221.4,Beitragsvorgabe
1,6074937L,12,2006-12-01,0.07,2006-12-01,,,157.97,Beitragsvorgabe
2,6074962M,12,2006-12-01,0.07,2006-12-01,,,157.97,Beitragsvorgabe
3,6075850C,12,2006-12-01,0.07,2006-12-01,,,0.0,Beitragsvorgabe
4,6100638P,12,2007-04-01,0.07,2007-04-01,,,137.98,Beitragsvorgabe


## Drop obsolete or repeated features.

## Check for nulls

In [4]:
df_premium.dtypes

Contract_Number       object
PayFreq                int64
AnniversaryDt         object
IndexationPercent    float64
IndexationStartDt     object
PremHolStartDt        object
PremHolEndDt          object
Premium              float64
PremiumCalcMethod     object
dtype: object

In [5]:
df_premium.isna().any()

Contract_Number      False
PayFreq              False
AnniversaryDt         True
IndexationPercent     True
IndexationStartDt     True
PremHolStartDt        True
PremHolEndDt          True
Premium              False
PremiumCalcMethod    False
dtype: bool

## Replace any nulls before checking the distributions of values for each column.

In [6]:
df_premium['IndexationPercent'] = df_premium['IndexationPercent'].fillna(0).apply(np.int64)
df_premium['IndexationPercent'] = df_premium['IndexationPercent'].fillna(0).apply(np.int64)

## Check the distributions of values for each column.

In [7]:
print('IndexationPercent : ',df_premium['IndexationPercent'].value_counts())

IndexationPercent :  0    136827
Name: IndexationPercent, dtype: int64


## Count does not show it but IndexationPercent has a distribution so leave it in.

# write to Excel

In [8]:
# Write out to a comma separated values file.

# giving directory name
filename = input_path + '\Premium_data.csv'
df_premium.to_csv(filename, encoding='utf-8', index=False)        
df_premium.head(10)

Unnamed: 0,Contract_Number,PayFreq,AnniversaryDt,IndexationPercent,IndexationStartDt,PremHolStartDt,PremHolEndDt,Premium,PremiumCalcMethod
0,6074160W,12,2007-12-01,0,2007-12-01,,,221.4,Beitragsvorgabe
1,6074937L,12,2006-12-01,0,2006-12-01,,,157.97,Beitragsvorgabe
2,6074962M,12,2006-12-01,0,2006-12-01,,,157.97,Beitragsvorgabe
3,6075850C,12,2006-12-01,0,2006-12-01,,,0.0,Beitragsvorgabe
4,6100638P,12,2007-04-01,0,2007-04-01,,,137.98,Beitragsvorgabe
5,6100746E,12,2006-05-01,0,2006-05-01,,,165.54,Beitragsvorgabe
6,6214246Y,12,2005-10-01,0,2005-10-01,,,178.22,Beitragsvorgabe
7,6215630F,12,2005-12-01,0,2005-12-01,,,113.45,Beitragsvorgabe
8,6215740Y,12,2005-10-01,0,2005-10-01,,,109.15,Beitragsvorgabe
9,6215812A,12,2005-10-01,0,2005-10-01,,,120.34,Beitragsvorgabe


In [9]:
%who_ls DataFrame 

['df_premium']

In [10]:
del df_premium

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

2024-07-19 19:08:47.226697
svmem(total=16756752384, available=3185242112, percent=81.0, used=13571510272, free=3185242112)
