In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("synthetic_insurance_dirty.csv")

# Quick look
print(df.shape)
print(df.head())
print(df.info())
print(df.isnull().sum())  # check missing values


(510, 6)
    Age  Gender Policy_Type Medical_History  Premium  Claim
0  56.0       M    Standard            NONE   6845.0    1.0
1  69.0       M         NaN             NaN   5246.0    0.0
2  46.0  Female         NaN            NONE   2940.0    1.0
3  32.0  FEMALE    Standard           Heart   5506.0    0.0
4  60.0  Female         NaN        Diabetes   4761.0    1.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              509 non-null    float64
 1   Gender           510 non-null    object 
 2   Policy_Type      407 non-null    object 
 3   Medical_History  348 non-null    object 
 4   Premium          508 non-null    float64
 5   Claim            505 non-null    float64
dtypes: float64(3), object(3)
memory usage: 24.0+ KB
None
Age                  1
Gender               0
Policy_Type        103
Medical_History    162
Premium  

In [4]:
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Premium'] = df['Premium'].fillna(df['Premium'].median())
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Policy_Type'] = df['Policy_Type'].fillna(df['Policy_Type'].mode()[0])
df['Medical_History'] = df['Medical_History'].fillna(df['Medical_History'].mode()[0])
df['Claim'] = df['Claim'].fillna(0)

In [5]:
# Normalize Gender
df['Gender'] = df['Gender'].str.upper().replace({
    'M': 'MALE', 'F': 'FEMALE', 'MALE': 'MALE', 'FEMALE': 'FEMALE'
})

# Normalize Policy Type
df['Policy_Type'] = df['Policy_Type'].str.title().replace({
    'Basics': 'Basic'
})

# Normalize Medical History
df['Medical_History'] = df['Medical_History'].str.title().replace({
    'None': 'None'
})


In [6]:
df = df[(df['Age'] >= 18) & (df['Age'] <= 100)]
df = df[(df['Premium'] > 0) & (df['Premium'] < 100000)]


In [7]:
df.drop_duplicates(inplace=True)


In [8]:
df['Gender'] = df['Gender'].map({'MALE': 0, 'FEMALE': 1})
df['Policy_Type'] = df['Policy_Type'].map({'Basic': 0, 'Standard': 1, 'Premium': 2})
df['Medical_History'] = df['Medical_History'].map({'None': 0, 'Diabetes': 1, 'Heart': 2, 'Chronic': 3})


In [9]:
print(df.info())
print(df.describe())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 493 entries, 0 to 499
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              493 non-null    float64
 1   Gender           493 non-null    int64  
 2   Policy_Type      493 non-null    int64  
 3   Medical_History  493 non-null    int64  
 4   Premium          493 non-null    float64
 5   Claim            493 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 27.0 KB
None
              Age      Gender  Policy_Type  Medical_History      Premium  \
count  493.000000  493.000000   493.000000       493.000000   493.000000   
mean    50.162272    0.501014     0.817444         1.985801  4927.991886   
std     18.453249    0.500507     0.742948         1.179862  1484.290741   
min     18.000000    0.000000     0.000000         0.000000  2201.000000   
25%     35.000000    0.000000     0.000000         1.000000  3924.000000   
50%     50.000000    

In [10]:
print(df.isnull().sum()) 

Age                0
Gender             0
Policy_Type        0
Medical_History    0
Premium            0
Claim              0
dtype: int64


In [15]:

# --- Step 2: Define Rule-Based Risk Scoring ---
def risk_score(row):
    score = 0
    if row['Age'] > 50:
        score += 1
    if row['Medical_History'] != "None":
        score += 2
    if row['Policy_Type'] == "Premium":
        score += 1
    if row['Premium'] > df['Premium'].median():
        score += 1
    return score

df['Risk_Score'] = df.apply(risk_score, axis=1)

# --- Step 3: Convert Risk Score → Risk Level ---
def risk_level(score):
    if score <= 1:
        return "Low"
    elif score == 2:
        return "Medium"
    else:
        return "High"

df['Predicted_Risk'] = df['Risk_Score'].apply(risk_level)

# --- Step 4: Save Results ---
df.to_csv("insurance_with_predictions.csv", index=False)

print(df[['Age', 'Policy_Type', 'Medical_History', 'Premium', 'Risk_Score', 'Predicted_Risk']].head(10))


    Age Policy_Type Medical_History    Premium  Risk_Score Predicted_Risk
0  56.0    Standard            NONE     6845.0           4           High
1  69.0    Standard            None     5246.0           2         Medium
2  46.0    Standard            NONE     2940.0           2         Medium
3  32.0    Standard           Heart     5506.0           3           High
4  60.0    Standard        Diabetes     4761.0           3           High
5  25.0     Premium           Heart     8046.0           4           High
6  78.0    Standard            None     5881.0           2         Medium
7  38.0    Standard         Chronic     4788.0           2         Medium
8  56.0    Standard            NONE     4400.0           3           High
9  75.0      basics            NONE  1000000.0           4           High


In [16]:
def predict_claim_risk(age, gender, policy_type, medical_history, premium):
    score = 0
    
    # Rules
    if age > 50:
        score += 1
    if medical_history != "None":
        score += 2
    if policy_type == "Premium":
        score += 1
    if premium > 600:
        score += 1
    
    # Map score → Risk Level
    if score <= 1:
        return "Low Risk"
    elif score == 2:
        return "Medium Risk"
    else:
        return "High Risk"


In [17]:
print(predict_claim_risk(55, "Male", "Premium", "Diabetes", 800))
# Output: High Risk

print(predict_claim_risk(30, "Female", "Basic", "None", 400))
# Output: Low Risk


High Risk
Low Risk
