In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

In [5]:
df_data = pd.read_excel(r'data/topcover_pilot.xlsx')

df_data

Unnamed: 0,Pno,Amount,Gender,RG,Cover,SurrVal,Offer/APE,PercCover,CurrAge,Fee_claimed,Take-up ind,Mngt_act
0,1,6444,M,4,48024,0,1.5,0.134183,71.250000,0,0,BCD
1,2,9649,F,7,1000000,0,1.3,0.009649,60.916667,0,0,PIN
2,3,8688,M,8,800001,2250,1.1,0.010860,63.416667,0,0,PIN
3,4,7483,F,4,21713,0,4.7,0.344632,82.916667,0,0,BCD
4,5,8115,M,8,526919,0,1.3,0.015401,60.000000,0,0,BCD
...,...,...,...,...,...,...,...,...,...,...,...,...
1089,1090,21024,M,8,122000,0,7.3,0.172328,77.500000,0,1,PIN
1090,1091,116604,F,8,1603567,9398,6.7,0.072715,68.666667,0,1,
1091,1092,33361,M,8,650000,3390,15.5,0.051325,42.333333,1,1,PIN
1092,1093,41296,M,8,188118,0,15.0,0.219522,77.583333,0,1,PIN


In [15]:
df_data['Smoker ind'] = df_data['RG']%2

def calc_socio_econ_class(x : int) -> int:
    if x <= 2:
        return 1
    elif x <= 4:
        return 2
    elif x <= 6:
        return 3
    elif x <= 8:
        return 4
    
df_data['Socio Economic Class'] = df_data['RG'].apply(calc_socio_econ_class)



Unnamed: 0,Pno,Amount,Gender,RG,Cover,SurrVal,Offer/APE,PercCover,CurrAge,Fee_claimed,Take-up ind,Mngt_act,Smoker ind,Socio Economic Class
0,1,6444,M,4,48024,0,1.5,0.134183,71.250000,0,0,BCD,0,2
1,2,9649,F,7,1000000,0,1.3,0.009649,60.916667,0,0,PIN,1,4
2,3,8688,M,8,800001,2250,1.1,0.010860,63.416667,0,0,PIN,0,4
3,4,7483,F,4,21713,0,4.7,0.344632,82.916667,0,0,BCD,0,2
4,5,8115,M,8,526919,0,1.3,0.015401,60.000000,0,0,BCD,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,1090,21024,M,8,122000,0,7.3,0.172328,77.500000,0,1,PIN,0,4
1090,1091,116604,F,8,1603567,9398,6.7,0.072715,68.666667,0,1,,0,4
1091,1092,33361,M,8,650000,3390,15.5,0.051325,42.333333,1,1,PIN,0,4
1092,1093,41296,M,8,188118,0,15.0,0.219522,77.583333,0,1,PIN,0,4


In [6]:
# Basically everything except for policy number - atleast for now
features = ['Amount', 'Gender', 'RG', 'Cover', 'SurrVal', 'Offer/APE',
            'PercCover', 'CurrAge', 'Fee_claimed', 'Take-up ind', 'Mngt_act']

df_data_split= pd.get_dummies(df_data[features],columns=['Gender', 'RG', 'Mngt_act']).astype(int)

df_data_split


Unnamed: 0,Amount,Cover,SurrVal,Offer/APE,PercCover,CurrAge,Fee_claimed,Take-up ind,Gender_F,Gender_M,...,RG_5,RG_6,RG_7,RG_8,Mngt_act_,Mngt_act_ ACD,Mngt_act_ BCD,Mngt_act_ P5C,Mngt_act_ P5P,Mngt_act_ PIN
0,6444,48024,0,1,0,71,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,9649,1000000,0,1,0,60,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,8688,800001,2250,1,0,63,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,7483,21713,0,4,0,82,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,8115,526919,0,1,0,60,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,21024,122000,0,7,0,77,0,1,0,1,...,0,0,0,1,0,0,0,0,0,1
1090,116604,1603567,9398,6,0,68,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
1091,33361,650000,3390,15,0,42,1,1,0,1,...,0,0,0,1,0,0,0,0,0,1
1092,41296,188118,0,15,0,77,0,1,0,1,...,0,0,0,1,0,0,0,0,0,1


In [7]:
X = df_data_split.drop(columns=['Take-up ind'])
y = df_data_split['Take-up ind']

# Split the dataset using the train_test_split function
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=7)

print('X_train has %d rows \ny_train has %d rows \nX_val has %d rows \ny_val has %d' 
      %(X_train.shape[0], y_train.shape[0], X_val.shape[0],y_val.shape[0]) )

X_train has 875 rows 
y_train has 875 rows 
X_val has 219 rows 
y_val has 219


In [8]:
logreg = LogisticRegression(random_state=7, max_iter=500)

# Fit and save the logistic regression model using the training data
model = logreg.fit(X_train, y_train)

In [9]:
# Validate the model
train_pred = model.predict(X_train)
train_score = accuracy_score(y_train, train_pred)

test_pred = model.predict(X_val)
test_score = accuracy_score(y_val, test_pred)

print(f'Traing Data Score:\t{train_score}')
print(f'Testing Data Score:\t{test_score}')

Traing Data Score:	0.9142857142857143
Testing Data Score:	0.9360730593607306


In [10]:
predictions = model.predict(X_val)

cm = confusion_matrix(predictions, y_val)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,200,14
Actual 1,0,5
