In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
%matplotlib inline
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('claims_data.csv')

In [None]:
df.head()

In [None]:
def Risk_Weight(bmi):
    """Map a BMI value to a weight-risk category label.

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str or float
        'underweight' (bmi < 18.5), 'Normal Weight' (18.5 <= bmi < 25),
        'Overweight' (25 <= bmi < 30) or 'Obesity' (bmi >= 30).
        A missing BMI (NaN / None) yields NaN rather than a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through all branches and be labelled 'Obesity'.
    if pd.isna(bmi):
        return np.nan

    # Each test only needs the upper bound: the lower bound is already
    # guaranteed by the preceding branch having failed.
    if bmi < 18.5:
        return 'underweight'
    if bmi < 25:
        return 'Normal Weight'
    if bmi < 30:
        return 'Overweight'
    return 'Obesity'

In [None]:
df['weight_risk'] = df['bmi'].apply(Risk_Weight)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

# Modelling

Preprocessing

In [None]:
# Changing the categorical columns to numeric form so all features share the
# same (numeric) kind of representation.
from sklearn.preprocessing import LabelEncoder

# Label-encode sex, smoker and insurance_claim in place. The single encoder is
# re-fitted on each column's distinct values before transforming that column.
encoder = LabelEncoder()
for column in ('sex', 'smoker', 'insurance_claim'):
    encoder.fit(df[column].drop_duplicates())
    df[column] = encoder.transform(df[column])

# One-hot encode region and replace the original column with the
# region_* indicator columns.
df1 = pd.get_dummies(df['region'], prefix='region')
df = pd.concat([df, df1], axis=1).drop(columns=['region'])

In [None]:
# One-hot encode the BMI risk label and swap the weight_risk_* indicator
# columns in for the original weight_risk column.
df2 = pd.get_dummies(df['weight_risk'], prefix='weight_risk')
df = pd.concat([df, df2], axis=1).drop(columns=['weight_risk'])

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
# Binary target: 1 where insurance_claim == 1, else 0.
# It is kept as a standalone Series instead of being written onto `df` as a
# new column: a `claims` column on `df` is not removed by the drop below, nor
# by later feature frames built from `df`, so the model would be trained with
# its own target as an input feature (target leakage).
y = pd.Series(np.where(df['insurance_claim'] == 1, 1, 0), index=df.index, name='claims')
X = df.drop(['insurance_claim', 'claim_amount'], axis=1)

In [None]:
X.head()

In [None]:
# Explicit 0/1 indicator columns for both codes of the label-encoded
# sex and smoker columns.
df['sex_male'] = np.where(df['sex'] == 1, 1, 0)
df['sex_female'] = np.where(df['sex'] == 0, 1, 0)
df['smoker_yes'] = np.where(df['smoker'] == 1, 1, 0)
df['smoker_no'] = np.where(df['smoker'] == 0, 1, 0)

# Feature frame: remove the encoded originals plus the outcome columns.
X_new = df.drop(['sex', 'smoker', 'insurance_claim', 'claim_amount'], axis=1)
# If the target column `claims` is present on `df`, it must not reach the
# feature matrix, otherwise the model is fitted on its own target.
# errors='ignore' keeps this cell working when `df` has no such column.
X_new = X_new.drop(columns=['claims'], errors='ignore')

In [None]:
X_new.head()

Preprocessing

In [None]:
X_Scale=X_new[['age','bmi','steps']]

In [None]:
# Standardise the columns selected in X_Scale.
# NOTE(review): the scaler is fitted on every row before the train/test split
# happens, so held-out rows contribute to the fitted statistics - consider
# fitting on the training rows only.
s_scaler = preprocessing.StandardScaler()
s_scaler.fit(X_Scale)
Scaled_X = s_scaler.transform(X_Scale)

In [None]:
# Wrap the scaled array in a frame that carries the same row index as the
# frame it was computed from. pd.concat(axis=1) aligns on index, so a fresh
# 0..n-1 index would misalign (and introduce NaN rows) whenever the source
# frame's index is not the default range.
X_final = pd.DataFrame(Scaled_X, columns=['age_n','bmi_n','steps_n'], index=X_Scale.index)

In [None]:
X_final.head()

In [None]:
X_Combine=pd.concat([X_new,X_final], axis=1)

In [None]:
X=X_Combine.drop(['age','bmi','steps','sex_female','smoker_no'],axis=1)

In [None]:
X

In [None]:
# Expose the male / smoker indicators under the names `gender` and `smoking`:
# the copies are appended as new trailing columns and the originals dropped.
X = (
    X.assign(gender=X['sex_male'], smoking=X['smoker_yes'])
     .drop(columns=['sex_male', 'smoker_yes'])
)

In [None]:
X.head()

In [None]:
X.to_csv('Scaled_CLaim_Data.csv')

Training and Testing the Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [None]:
lm = LogisticRegression()

In [None]:
lm.fit(X_train,y_train)

In [None]:
lm.intercept_[0]

In [None]:
# One row per feature column of X, holding the fitted coefficient for it;
# the table is also written to disk.
coeff_df = pd.DataFrame(
    data=lm.coef_.T,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df.to_csv('coeff_Mthoko.csv')

In [None]:
coeff_df 

In [None]:
# Predict on the held-out rows and export the predictions.
# The exported frame is built from the raw prediction array, so its index is
# positional (0..n-1) rather than the index of X_test.
pred_lm = lm.predict(X_test)
predictions = pd.DataFrame(data=pred_lm)
predictions.to_csv('predictions_Mthoko.csv')

In [None]:
predictions

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_lm)

In [None]:
labels = ['No Claim', 'Claim']

pd.DataFrame(data=confusion_matrix(y_test, pred_lm), index=labels, columns=labels)

Classification Report

In [None]:
from sklearn.metrics import classification_report
print('Classification Report')
print(classification_report(y_test, pred_lm, target_names=['No claim', 'Claim']))

Other Models

In [None]:
X.columns

In [None]:
# Basic-info feature set: scaled age, gender and the region indicators.
X_info = X.loc[:, [
    'age_n',
    'gender',
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
]]
# Same test_size / random_state as the other splits in this notebook;
# y_train and y_test are reassigned here.
X_train_info, X_test_info, y_train, y_test = train_test_split(
    X_info, y, test_size=0.2, random_state=100
)

In [None]:
# Lifestyle feature set: scaled BMI and steps, number of children, smoking flag.
X_life = X.loc[:, ['bmi_n', 'steps_n', 'children', 'smoking']]
# Same test_size / random_state as the other splits in this notebook;
# y_train and y_test are reassigned here.
X_train_life, X_test_life, y_train, y_test = train_test_split(
    X_life, y, test_size=0.2, random_state=100
)

In [None]:
# Risk feature set: scaled age and BMI, smoking flag and all four
# weight_risk_* indicator columns.
X_risk = X.loc[:, [
    'age_n',
    'bmi_n',
    'smoking',
    'weight_risk_Obesity',
    'weight_risk_Overweight',
    'weight_risk_underweight',
    'weight_risk_Normal Weight',
]]
# Same test_size / random_state as the other splits in this notebook;
# y_train and y_test are reassigned here.
X_train_risk, X_test_risk, y_train, y_test = train_test_split(
    X_risk, y, test_size=0.2, random_state=100
)

In [None]:
# SM feature set: scaled age, BMI and steps, smoking flag, and the
# Overweight / Obesity indicators.
X_SM = X.loc[:, [
    'age_n',
    'bmi_n',
    'smoking',
    'steps_n',
    'weight_risk_Overweight',
    'weight_risk_Obesity',
]]
# Same test_size / random_state as the other splits in this notebook;
# y_train and y_test are reassigned here.
X_train_SM, X_test_SM, y_train, y_test = train_test_split(
    X_SM, y, test_size=0.2, random_state=100
)

In [None]:
# Four independent, identically configured logistic-regression estimators,
# one per feature subset.
info, life, risk, SM = (LogisticRegression(C=10) for _ in range(4))

In [None]:
info.fit(X_train_info, y_train)

In [None]:
life.fit(X_train_life, y_train)

In [None]:
risk.fit(X_train_risk, y_train)

In [None]:
SM.fit(X_train_SM, y_train)

In [None]:
pred_info = info.predict(X_test_info)
pred_life = life.predict(X_test_life)
pred_risk = risk.predict(X_test_risk)
pred_SM = SM.predict(X_test_SM)

In [None]:
info_pred=pd.DataFrame(pred_info)
life_pred=pd.DataFrame(pred_life)
risk_pred=pd.DataFrame(pred_risk)
SM_pred=pd.DataFrame(pred_SM)

In [None]:
info_pred.to_csv('info_Mthoko_pred.csv')
life_pred.to_csv('life_Mthoko_pred.csv')
risk_pred.to_csv('risk_Mthoko_pred.csv')
SM_pred.to_csv('SM_Mthoko_pred.csv')

In [None]:
print('Info Model')
print(classification_report(y_test, pred_info, target_names=['No claim', 'Claim']))

print()
labels = ['No claim', 'Claim']
pd.DataFrame(data=confusion_matrix(y_test, pred_info), index=labels, columns=labels)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, pred_info))

In [None]:
print('Lifestyle Model')
print(classification_report(y_test, pred_life, target_names=['No claim', 'Claim']))

print()
labels = ['No claim', 'Claim']
pd.DataFrame(data=confusion_matrix(y_test, pred_life), index=labels, columns=labels)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, pred_life))

In [None]:
print('Risk Model')
print(classification_report(y_test, pred_risk, target_names=['No claim', 'Claim']))

print()
labels = ['No claim', 'Claim']
pd.DataFrame(data=confusion_matrix(y_test, pred_risk), index=labels, columns=labels)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, pred_risk))

In [None]:
print('SM Model')
print(classification_report(y_test, pred_SM, target_names=['No claim', 'Claim']))

print()
labels = ['No claim', 'Claim']
pd.DataFrame(data=confusion_matrix(y_test, pred_SM), index=labels, columns=labels)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, pred_SM))

In [None]:
#For LifeStyle features
# plt.scatter requires x and y of equal size, so the multi-column X_life frame
# cannot be plotted against y in a single call (it raises "x and y must be the
# same size"). Draw one panel per lifestyle feature instead, sharing the y axis.
n_features = len(X_life.columns)
fig, axes = plt.subplots(1, n_features, figsize=(4 * n_features, 4), sharey=True, squeeze=False)
for ax, feature in zip(axes.ravel(), X_life.columns):
    ax.scatter(X_life[feature], y)
    ax.set_xlabel(feature)
axes[0, 0].set_ylabel("Claim")
fig.suptitle("Lifestyle Features")
fig.tight_layout();