# **1) Insurance Risk & Claim Dataset**

# **Description:** Contains historical insurance policyholder details, claims history, and fraudulent claims.

In [None]:
!pip install faker


Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.6.0


In [None]:
import pandas as pd
import random
from faker import Faker

# Faker instance that generate_data() uses for the synthetic Policy_ID UUIDs.
fake = Faker()
# Seed both random sources (Faker's generator and the stdlib `random` module)
# so that re-running this cell regenerates the same rows.
# NOTE(review): each dataset cell in this notebook re-seeds with 42, which is
# why the first UUID (bdd640fb-...) is identical in every dataset preview --
# confirm that ID overlap across datasets is acceptable.
Faker.seed(42)
random.seed(42)

def generate_data(n=1000):
    """Create a synthetic insurance risk & claim table.

    Each field is sampled independently (UUIDs from the module-level ``fake``
    Faker instance, everything else from the stdlib ``random`` module), so the
    columns carry no built-in relationship to one another.

    Args:
        n: Number of policy rows to produce.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns Policy_ID,
        Customer_Age, Gender, Policy_Type, Annual_Income,
        Vehicle_Age_Property_Age, Claim_History, Fraudulent_Claim,
        Premium_Amount, Claim_Amount and Risk_Score.
    """
    column_names = [
        'Policy_ID', 'Customer_Age', 'Gender', 'Policy_Type', 'Annual_Income',
        'Vehicle_Age_Property_Age', 'Claim_History', 'Fraudulent_Claim',
        'Premium_Amount', 'Claim_Amount', 'Risk_Score'
    ]
    policy_types = ['Health', 'Auto', 'Life', 'Property']
    risk_scores = ['Low', 'Medium', 'High']

    def _sample_policy():
        # Tuple items are evaluated left to right, i.e. in column order, which
        # keeps the sequence of random draws fixed for a given seed.
        return (
            fake.uuid4(),                                 # Policy_ID
            random.randint(18, 80),                       # Customer_Age
            random.choice(['Male', 'Female', 'Other']),   # Gender
            random.choice(policy_types),                  # Policy_Type
            round(random.uniform(20000, 200000), 2),      # Annual_Income
            random.randint(0, 30),                        # Vehicle_Age_Property_Age
            random.randint(0, 5),                         # Claim_History
            random.choice([0, 1]),                        # Fraudulent_Claim
            round(random.uniform(500, 5000), 2),          # Premium_Amount
            round(random.uniform(0, 50000), 2),           # Claim_Amount
            random.choices(risk_scores, weights=[0.5, 0.3, 0.2])[0],  # Risk_Score
        )

    policies = [_sample_policy() for _ in range(n)]
    return pd.DataFrame(policies, columns=column_names)

# Build the 1000-row synthetic risk & claim table and print its first rows
df = generate_data(n=1000)
print(df.head())

# Persist the table to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="insurance_fraud_data.csv", index=False)


                              Policy_ID  Customer_Age Gender Policy_Type  \
0  bdd640fb-0667-4ad1-9c80-317fa3b1799d            58   Male      Health   
1  23b8c1e9-3924-46de-beb1-3b9046685257            23  Other    Property   
2  bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9            59  Other    Property   
3  972a8469-1641-4f82-8b9d-2434e465e150            35   Male        Auto   
4  17fc695a-07a0-4a6e-8822-e8f36c031199            34   Male    Property   

   Annual_Income  Vehicle_Age_Property_Age  Claim_History  Fraudulent_Claim  \
0      153479.09                         7              1                 0   
1       25720.88                         2              1                 0   
2       59679.31                        18              2                 0   
3      192298.35                        10              0                 0   
4      116521.06                        29              3                 0   

   Premium_Amount  Claim_Amount Risk_Score  
0         3814.12      

# **2) Customer Feedback & Sentiment Dataset**

# **Description**: Contains customer feedback, complaints, and sentiments extracted from reviews and social media.

In [None]:
# Create columns for

# ● Review_ID (Unique Identifier)
# ● Customer_ID
# ● Review_Text (Unstructured Text)
# ● Sentiment_Label (Positive, Negative, Neutral)
# ● Rating (1-5 stars)
# ● Service_Type (Claim, Policy Purchase, Customer Support)

In [None]:
import pandas as pd
import random
from faker import Faker

# Faker instance that generate_data() uses for review/customer UUIDs and the
# filler review text.
fake = Faker()
# Re-seed Faker and the stdlib `random` module so this cell's output does not
# depend on how much randomness earlier cells consumed.
# NOTE(review): re-seeding with the same value as the other dataset cells
# restarts the UUID sequence, so IDs here overlap with IDs in those datasets
# (first Review_ID shown is again bdd640fb-...).
Faker.seed(42)
random.seed(42)

def generate_data(n=1000):
    """Create a synthetic customer feedback & sentiment table.

    Review text is Faker filler text, and the sentiment label, rating and
    service type are each drawn independently, so a row's label and rating
    are not tied to its text or to each other.

    Args:
        n: Number of review rows to produce.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns Review_ID,
        Customer_ID, Review_Text, Sentiment_Label, Rating and Service_Type.
    """
    column_names = [
        'Review_ID', 'Customer_ID', 'Review_Text', 'Sentiment_Label', 'Rating', 'Service_Type'
    ]
    service_types = ['Claim', 'Policy Purchase', 'Customer Support']
    sentiments = ['Positive', 'Negative', 'Neutral']

    def _sample_review():
        # Evaluated left to right (column order) so the draw sequence is
        # stable for a given seed.
        return (
            fake.uuid4(),                  # Review_ID
            fake.uuid4(),                  # Customer_ID
            fake.text(),                   # Review_Text
            random.choice(sentiments),     # Sentiment_Label
            random.randint(1, 5),          # Rating
            random.choice(service_types),  # Service_Type
        )

    reviews = [_sample_review() for _ in range(n)]
    return pd.DataFrame(reviews, columns=column_names)

# Build the 1000-row synthetic review table and print its first rows
df = generate_data(n=1000)
print(df.head())

# Persist the table to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="insurance_reviews_data.csv", index=False)


                              Review_ID                           Customer_ID  \
0  bdd640fb-0667-4ad1-9c80-317fa3b1799d  23b8c1e9-3924-46de-beb1-3b9046685257   
1  12476f57-a5e5-45ab-aefc-fad8efc89849  88bd6407-2bcf-4e01-a28d-efe39bf00273   
2  cac5b68c-28f4-4481-a0a0-4dc427209bdf  10435a10-98ae-4334-ac12-ace8ae340454   
3  913e4de2-e0c5-4cb8-bda9-c2a90ed42f1a  bb5e4bcf-15ed-4269-9429-6c07f26b4776   
4  dfde4fbf-3ff3-40bf-b66e-cb15474ebc19  ceda8bbb-7171-4434-934c-6c92ec5b227c   

                                         Review_Text Sentiment_Label  Rating  \
0  Beautiful instead ahead despite measure ago cu...         Neutral       1   
1  Left establish understand read. Range successf...         Neutral       3   
2  Other life edge network wall quite. Race Mr en...        Positive       2   
3  Within mouth call process. Close month parent ...        Positive       5   
4  Anything yourself structure why. Coach magazin...         Neutral       4   

       Service_Type  
0         

# **3) Fraudulent Insurance Claims Dataset**

# **Description**: Contains historical fraudulent claims with **anomaly detection labels.**

In [None]:
# create columns for

# ● Claim_ID
# ● Claim_Date
# ● Policyholder_ID
# ● Claim_Amount
# ● Claim_Type (Medical, Vehicle, Home Damage)
# ● Suspicious_Flags (Boolean)
# ● Fraud_Label (1 = Fraud, 0 = Genuine)

In [None]:
import pandas as pd
import random
from faker import Faker

# Faker instance that generate_data() uses for claim/policyholder UUIDs and
# claim dates.
fake = Faker()
# Re-seed Faker and the stdlib `random` module so the cell is repeatable on
# its own.
# NOTE(review): same seed as the other dataset cells, so the UUID sequence
# restarts and IDs overlap across datasets (first Claim_ID shown is again
# bdd640fb-...).
Faker.seed(42)
random.seed(42)

def generate_data(n=1000):
    """Create a synthetic fraudulent-claims table.

    All fields are drawn independently; in particular Suspicious_Flags and
    Fraud_Label are separate coin flips and are not derived from each other
    or from the claim amount.

    Claim dates are requested from Faker for the window '-5y' .. 'today', so
    they are relative to the day the cell is run.

    Args:
        n: Number of claim rows to produce.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns Claim_ID, Claim_Date,
        Policyholder_ID, Claim_Amount, Claim_Type, Suspicious_Flags and
        Fraud_Label.
    """
    column_names = [
        'Claim_ID', 'Claim_Date', 'Policyholder_ID', 'Claim_Amount', 'Claim_Type',
        'Suspicious_Flags', 'Fraud_Label'
    ]
    claim_types = ['Medical', 'Vehicle', 'Home Damage']

    def _sample_claim():
        # Evaluated left to right (column order) so the draw sequence is
        # stable for a given seed.
        return (
            fake.uuid4(),                                             # Claim_ID
            fake.date_between(start_date='-5y', end_date='today'),    # Claim_Date
            fake.uuid4(),                                             # Policyholder_ID
            round(random.uniform(500, 50000), 2),                     # Claim_Amount
            random.choice(claim_types),                               # Claim_Type
            random.choice([True, False]),                             # Suspicious_Flags
            random.choice([0, 1]),                                    # Fraud_Label
        )

    claims = [_sample_claim() for _ in range(n)]
    return pd.DataFrame(claims, columns=column_names)

# Build the 1000-row synthetic claims table and print its first rows
df = generate_data(n=1000)
print(df.head())

# Persist the table to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="insurance_claims_data.csv", index=False)


                               Claim_ID  Claim_Date  \
0  bdd640fb-0667-4ad1-9c80-317fa3b1799d  2022-01-19   
1  8b9d2434-e465-4150-bd9c-66b3ad3c2d6d  2021-02-09   
2  9a1de644-815e-46d1-bb8f-aa1837f8a88b  2020-10-22   
3  72ff5d2a-386e-4be0-ab65-a6a48b8148f6  2023-08-16   
4  6c307511-b2b9-437a-a8df-6ec4ce4a2bbd  2022-05-18   

                        Policyholder_ID  Claim_Amount   Claim_Type  \
0  1a3d1fa7-bc89-40a9-a3b8-c1e9392456de      32151.63      Medical   
1  17fc695a-07a0-4a6e-8822-e8f36c031199      11548.93  Home Damage   
2  b38a088c-a65e-4389-b74d-0fb132e70629      29729.38      Medical   
3  c241330b-01a9-471f-9e8a-774bcf36d58b      11322.58  Home Damage   
4  c37459ee-f50b-4a63-b71e-cd7b27cd8130      35942.97  Home Damage   

   Suspicious_Flags  Fraud_Label  
0             False            0  
1              True            0  
2              True            0  
3              True            0  
4             False            0  


# **4) Insurance Multilingual Policy Document Dataset**

# **Description**: Collection of insurance policies in multiple languages for NLP-based translation and summarization.

In [None]:
# create columns for

# ● Policy_ID
# ● Policy_Text (English, Spanish, French, Hindi, etc.)
# ● Summarized_Text (Generated using NLP models)

In [None]:
import pandas as pd
import random
from faker import Faker

# Faker instance (default locale) that generate_data() uses for policy UUIDs
# and the filler policy/summary text.
fake = Faker()
# Re-seed Faker and the stdlib `random` module so the cell is repeatable on
# its own.
# NOTE(review): same seed as the other dataset cells, so the UUID sequence
# restarts (first Policy_ID shown is again bdd640fb-...).
Faker.seed(42)
random.seed(42)

def generate_data(n=1000):
    """Create a synthetic policy-document table.

    Policy_Text is a 10-sentence Faker paragraph and Summarized_Text is an
    independent Faker sentence of roughly 20 words; the "summary" is not
    derived from the policy text.

    NOTE(review): the dataset is described as multilingual, but all text comes
    from the single module-level ``fake`` instance, so every row uses that
    instance's locale. A previously unused ``languages`` list was removed
    because it had no effect on the output.

    Args:
        n: Number of policy rows to produce.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns Policy_ID,
        Policy_Text and Summarized_Text.
    """
    data = []

    for _ in range(n):
        policy_id = fake.uuid4()
        policy_text = fake.paragraph(nb_sentences=10)
        summarized_text = fake.sentence(nb_words=20)

        data.append([
            policy_id, policy_text, summarized_text
        ])

    df = pd.DataFrame(data, columns=[
        'Policy_ID', 'Policy_Text', 'Summarized_Text'
    ])

    return df

# Build the 1000-row synthetic policy-document table and print its first rows
df = generate_data(n=1000)
print(df.head())

# Persist the table to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="insurance_policies_data.csv", index=False)


                              Policy_ID  \
0  bdd640fb-0667-4ad1-9c80-317fa3b1799d   
1  1c11f735-dc71-4d96-8c0f-d195c17af08a   
2  5496f63c-dc11-40c1-880a-adfbe7c99b26   
3  14822f53-8201-4c62-b5f5-9b220e8fa8e0   
4  5e84f058-d5a8-44eb-8939-23de8babce3b   

                                         Policy_Text  \
0  Development say quality throughout beautiful. ...   
1  Civil quite others his other life edge network...   
2  Draw protect Democrat car very number line. Sp...   
3  Organization push dog build. East organization...   
4  Challenge camera final together someone team t...   

                                     Summarized_Text  
0  Detail food shoulder argue start source husban...  
1  She campaign little near enter their instituti...  
2  Total clearly able hospital unit size expect r...  
3  Data plant enough major town suffer begin inte...  
4  Image street fight decision size parent focus ...  


# **5) Customer Segmentation Dataset (Unsupervised Learning - Clustering)**

# **Description**: Used for clustering customers into risk profiles.

In [None]:
# create columns for

# ● Customer_ID
# ● Demographic Data (Age, Income, Location, etc.)
# ● Number of Active Policies
# ● Total Premium Paid
# ● Claim Frequency
# ● Policy Upgrades

In [None]:
import pandas as pd
import random
from faker import Faker

# Faker instance that generate_data() uses for customer UUIDs and city names.
fake = Faker()
# Re-seed Faker and the stdlib `random` module so the cell is repeatable on
# its own.
# NOTE(review): same seed as the other dataset cells, so the UUID sequence
# restarts (first Customer_ID shown is again bdd640fb-...).
Faker.seed(42)
random.seed(42)

def generate_data(n=1000):
    """Create a synthetic customer-segmentation table.

    Every attribute is drawn independently (IDs and city names from the
    module-level ``fake`` Faker instance, numeric fields from the stdlib
    ``random`` module), so no cluster structure is built into the data.

    Args:
        n: Number of customer rows to produce.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns Customer_ID, Age,
        Income, Location, Number_of_Active_Policies, Total_Premium_Paid,
        Claim_Frequency and Policy_Upgrades.
    """
    column_names = [
        'Customer_ID', 'Age', 'Income', 'Location', 'Number_of_Active_Policies',
        'Total_Premium_Paid', 'Claim_Frequency', 'Policy_Upgrades'
    ]

    def _sample_customer():
        # Evaluated left to right (column order) so the draw sequence is
        # stable for a given seed.
        return (
            fake.uuid4(),                              # Customer_ID
            random.randint(18, 80),                    # Age
            round(random.uniform(20000, 200000), 2),   # Income
            fake.city(),                               # Location
            random.randint(1, 5),                      # Number_of_Active_Policies
            round(random.uniform(1000, 50000), 2),     # Total_Premium_Paid
            random.randint(0, 10),                     # Claim_Frequency
            random.randint(0, 3),                      # Policy_Upgrades
        )

    customers = [_sample_customer() for _ in range(n)]
    return pd.DataFrame(customers, columns=column_names)

# Build the 1000-row synthetic customer table and print its first rows
df = generate_data(n=1000)
print(df.head())

# Persist the table to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="customer_demographics_data.csv", index=False)


                            Customer_ID  Age     Income        Location  \
0  bdd640fb-0667-4ad1-9c80-317fa3b1799d   58   40039.59        Johnberg   
1  bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9   61  153320.19  New Roberttown   
2  815ef6d1-3b8f-4a18-b7f8-a88b17fc695a   19   36865.14   New Jamesside   
3  96da1dac-72ff-4d2a-b86e-cbe06b65a6a4   63  136979.20        Lisatown   
4  b2b9437a-28df-4ec4-8e4a-2bbdc241330b   69  176474.06    Petersonberg   

   Number_of_Active_Policies  Total_Premium_Paid  Claim_Frequency  \
0                          3            12999.70                2   
1                          5             5260.00                6   
2                          2            25762.41                0   
3                          5            21556.47                7   
4                          2            35208.83                5   

   Policy_Upgrades  
0                0  
1                0  
2                1  
3                2  
4                2  


In [None]:
# 1-Insurance-Risk-Claim-Dataset
# 2-Customer-Feedback-Sentiment-Dataset
# 3-Fraudulent-Insurance-Claims-Dataset
# 4-Insurance-Multilingual-Policy-Document-Dataset
# 5-Customer-Segmentation

In [None]:
# Reload the five generated CSVs. The generation cells write them with
# relative paths, so read them back the same way instead of assuming the
# working directory is /content.
df1 = pd.read_csv("insurance_fraud_data.csv")           # 1: risk & claim
df2 = pd.read_csv("insurance_reviews_data.csv")         # 2: feedback & sentiment
df3 = pd.read_csv("insurance_claims_data.csv")          # 3: fraudulent claims
df4 = pd.read_csv("insurance_policies_data.csv")        # 4: policy documents
df5 = pd.read_csv("customer_demographics_data.csv")     # 5: customer segmentation

In [None]:
# Re-export each dataset under its descriptive file name.
# index=False matches the other exports in this notebook and keeps the
# RangeIndex from being written as an extra unnamed first column.
df1.to_csv("df1-insurance_risk_claim_dataset.csv", index=False)
df2.to_csv("df2-customer_feedback_sentiment_dataset.csv", index=False)
df3.to_csv("df3-fraudulent_insurance_claims_dataset.csv", index=False)
df4.to_csv("df4-insurance_multilingual_policy_document_dataset.csv", index=False)
df5.to_csv("df5-customer_segmentation_dataset.csv", index=False)

In [None]:
df1.head()  # Dataset 1: Insurance Risk & Claim (first rows, as reloaded from CSV)

# Description: Contains historical insurance policyholder details, claims history, and fraudulent claims.

Unnamed: 0,Policy_ID,Customer_Age,Gender,Policy_Type,Annual_Income,Vehicle_Age_Property_Age,Claim_History,Fraudulent_Claim,Premium_Amount,Claim_Amount,Risk_Score
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,58,Male,Health,153479.09,7,1,0,3814.12,33834.97,High
1,23b8c1e9-3924-46de-beb1-3b9046685257,23,Other,Property,25720.88,2,1,0,2774.1,1326.8,Low
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,59,Other,Property,59679.31,18,2,0,3914.63,7982.97,Low
3,972a8469-1641-4f82-8b9d-2434e465e150,35,Male,Auto,192298.35,10,0,0,2209.67,17948.97,Low
4,17fc695a-07a0-4a6e-8822-e8f36c031199,34,Male,Property,116521.06,29,3,0,2984.18,41470.23,Medium


In [None]:
# Preview the first rows of dataset 2 (Customer Feedback & Sentiment) as reloaded from CSV.
df2.head()  # 2-Customer-Feedback-Sentiment-Dataset

Unnamed: 0,Review_ID,Customer_ID,Review_Text,Sentiment_Label,Rating,Service_Type
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,23b8c1e9-3924-46de-beb1-3b9046685257,Beautiful instead ahead despite measure ago cu...,Neutral,1,Claim
1,12476f57-a5e5-45ab-aefc-fad8efc89849,88bd6407-2bcf-4e01-a28d-efe39bf00273,Left establish understand read. Range successf...,Neutral,3,Claim
2,cac5b68c-28f4-4481-a0a0-4dc427209bdf,10435a10-98ae-4334-ac12-ace8ae340454,Other life edge network wall quite. Race Mr en...,Positive,2,Customer Support
3,913e4de2-e0c5-4cb8-bda9-c2a90ed42f1a,bb5e4bcf-15ed-4269-9429-6c07f26b4776,Within mouth call process. Close month parent ...,Positive,5,Claim
4,dfde4fbf-3ff3-40bf-b66e-cb15474ebc19,ceda8bbb-7171-4434-934c-6c92ec5b227c,Anything yourself structure why. Coach magazin...,Neutral,4,Claim


In [None]:
# Preview the first rows of dataset 3 (Fraudulent Insurance Claims) as reloaded from CSV.
df3.head() # 3-Fraudulent-Insurance-Claims-Dataset

Unnamed: 0,Claim_ID,Claim_Date,Policyholder_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,2022-01-19,1a3d1fa7-bc89-40a9-a3b8-c1e9392456de,32151.63,Medical,False,0
1,8b9d2434-e465-4150-bd9c-66b3ad3c2d6d,2021-02-09,17fc695a-07a0-4a6e-8822-e8f36c031199,11548.93,Home Damage,True,0
2,9a1de644-815e-46d1-bb8f-aa1837f8a88b,2020-10-22,b38a088c-a65e-4389-b74d-0fb132e70629,29729.38,Medical,True,0
3,72ff5d2a-386e-4be0-ab65-a6a48b8148f6,2023-08-16,c241330b-01a9-471f-9e8a-774bcf36d58b,11322.58,Home Damage,True,0
4,6c307511-b2b9-437a-a8df-6ec4ce4a2bbd,2022-05-18,c37459ee-f50b-4a63-b71e-cd7b27cd8130,35942.97,Home Damage,False,0


In [None]:
# Preview the first rows of dataset 4 (Insurance Multilingual Policy Document) as reloaded from CSV.
df4.head()  # 4-Insurance-Multilingual-Policy-Document-Dataset

Unnamed: 0,Policy_ID,Policy_Text,Summarized_Text
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,Development say quality throughout beautiful. ...,Detail food shoulder argue start source husban...
1,1c11f735-dc71-4d96-8c0f-d195c17af08a,Civil quite others his other life edge network...,She campaign little near enter their instituti...
2,5496f63c-dc11-40c1-880a-adfbe7c99b26,Draw protect Democrat car very number line. Sp...,Total clearly able hospital unit size expect r...
3,14822f53-8201-4c62-b5f5-9b220e8fa8e0,Organization push dog build. East organization...,Data plant enough major town suffer begin inte...
4,5e84f058-d5a8-44eb-8939-23de8babce3b,Challenge camera final together someone team t...,Image street fight decision size parent focus ...


In [None]:
# Preview the first rows of dataset 5 (Customer Segmentation) as reloaded from CSV.
df5.head()  # 5-Customer-Segmentation

Unnamed: 0,Customer_ID,Age,Income,Location,Number_of_Active_Policies,Total_Premium_Paid,Claim_Frequency,Policy_Upgrades
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,58,40039.59,Johnberg,3,12999.7,2,0
1,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,61,153320.19,New Roberttown,5,5260.0,6,0
2,815ef6d1-3b8f-4a18-b7f8-a88b17fc695a,19,36865.14,New Jamesside,2,25762.41,0,1
3,96da1dac-72ff-4d2a-b86e-cbe06b65a6a4,63,136979.2,Lisatown,5,21556.47,7,2
4,b2b9437a-28df-4ec4-8e4a-2bbdc241330b,69,176474.06,Petersonberg,2,35208.83,5,2


In [None]:
import numpy as np
import pandas as pd
import random
from faker import Faker

# Initialize Faker
fake = Faker()

# Set seeds for reproducibility. Faker keeps its own random state (used for
# the Policy_ID UUIDs), so it is seeded alongside `random` and NumPy; without
# this the IDs depend on whatever Faker calls ran earlier in the kernel.
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Parameters
n_samples = 1000

# Generate the dataset.
# NOTE: np.random.randint excludes the upper bound, so e.g. Customer_Age
# spans 18-79 and Claim_History spans 0-4.
data = {
    'Policy_ID': [fake.uuid4() for _ in range(n_samples)],
    'Customer_Age': np.random.randint(18, 80, size=n_samples),
    'Annual_Income': np.random.randint(30000, 150000, size=n_samples),
    'Claim_History': np.random.randint(0, 5, size=n_samples),
    'Premium_Amount': np.random.randint(500, 10000, size=n_samples),
    'Claim_Amount': np.random.randint(1000, 50000, size=n_samples)
}

# Simplify and strengthen target relationship: the score is a weighted sum of
# claim history, claim amount and premium amount plus small Gaussian noise.
# Customer_Age and Annual_Income do not enter the score.
fraud_scores = (
    data['Claim_History'] * 0.5 +
    data['Claim_Amount'] * 0.00004 +
    data['Premium_Amount'] * 0.00008 +
    np.random.normal(0, 0.1, n_samples)
)

# Target: Claim prediction (1 for high likelihood of fraud, 0 otherwise).
# Rows whose score exceeds the 65th percentile are labelled 1, i.e. roughly
# the top 35% of rows.
data['Claim_Prediction'] = (fraud_scores > np.percentile(fraud_scores, 65)).astype(int)

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("simplified_insurance_dataset.csv", index=False)

# Preview the dataset
df.head()


Unnamed: 0,Policy_ID,Customer_Age,Annual_Income,Claim_History,Premium_Amount,Claim_Amount,Claim_Prediction
0,68ffae17-7428-4491-b9c8-42cba8d3110e,56,92292,0,8197,47607,0
1,99a04abb-99e9-4433-9453-e17e711ebe6f,69,53833,0,2696,34487,0
2,8c1e9464-e1d5-40dc-bea5-85a901aec505,46,34158,4,7665,48658,1
3,8ebb8208-a0ee-404f-9ea8-fcb1473268c9,32,92680,0,801,21963,0
4,0c85f154-3527-4cee-9343-bf88d1f379b1,60,50309,0,8027,21936,0
