<a href="https://colab.research.google.com/github/Nazneen-akram/healthcare-insurance-fraud/blob/main/initial_EDA_Fraud_Claims_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Functions to join the datasets
## **'Beneficiary'**, **'Inpatient'**, **'Outpatient'** and **'Fraud'** into **'merged.csv'**

1.   **join_inpatient_outpatient()**: Function to merge the inpatient and outpatient dataframes
2.   **join_inpatient_outpatient_beneficiary()**: Function to merge the merged dataframe with the beneficiary dataframe
3. **join_inpatient_outpatient_beneficiary_fraud()**: Function to merge the merged dataframe with the fraud dataframe
4. **join_csv()**: Function to join the csv files, joining on the keys BeneID and Provider.
5. **generate_merged_data()**: Main function to join the csv files; calls the read_data and join_csv functions



In [2]:
def read_data(data_dir="/content"):
    """
    function to read the four source csv files
    parameters: data_dir - directory holding the csv files (default "/content")
    return: data frames fraud, beneficiary, inpatient, outpatient (in that order).
    raise FileNotFoundError: raised by pandas when any of the files is not found
    """
    # No try/except here: pd.read_csv already raises FileNotFoundError for a
    # missing file. The previous `except FileExistsError` handler could never
    # match that error and only re-raised, so it was dead code.
    fraud = pd.read_csv(f"{data_dir}/Train-1542865627584.csv")
    beneficiary = pd.read_csv(f"{data_dir}/Train_Beneficiarydata-1542865627584.csv")
    inpatient = pd.read_csv(f"{data_dir}/Train_Inpatientdata-1542865627584.csv")
    outpatient = pd.read_csv(f"{data_dir}/Train_Outpatientdata-1542865627584.csv")
    return fraud, beneficiary, inpatient, outpatient

In [3]:
def join_inpatient_outpatient(inpatient, outpatient):
    """
    function to stack the inpatient and outpatient dataframes into one
    parameters: inpatient, outpatient dataframes (neither is modified)
    return: concatenated dataframe, inpatient rows first, with an extra
            'is_Inpatient' column (1 for inpatient rows, 0 for outpatient rows)
    """
    # assign() returns a new frame, so the flag column is no longer written
    # into the caller's dataframes as a hidden side effect.
    inpatient_flagged = inpatient.assign(is_Inpatient=1)
    outpatient_flagged = outpatient.assign(is_Inpatient=0)
    # Row labels of the inputs are kept as-is (no ignore_index), as before.
    inpatient_outpatient = pd.concat([inpatient_flagged, outpatient_flagged])
    return inpatient_outpatient


def join_inpatient_outpatient_beneficiary(inpatient_outpatient, beneficiary):
    """
    function to join the combined inpatient/outpatient dataframe with the
    beneficiary dataframe
    parameters: inpatient_outpatient (combined dataframe), beneficiary
    return: inner join of the two dataframes on 'BeneID'
    """
    # Inner join: rows whose BeneID has no match in the other frame are dropped.
    return inpatient_outpatient.merge(beneficiary, how='inner', on='BeneID')


def join_inpatient_outpatient_beneficiary_fraud(inpatient_outpatient_beneficiary, fraud):
    """
    function to join the fraud dataframe with the already-merged
    inpatient/outpatient/beneficiary dataframe
    parameters: inpatient_outpatient_beneficiary (merged dataframe), fraud
    return: inner join on 'Provider', with the fraud columns first
    """
    # `fraud` is the left side of the join, so its columns lead the result;
    # how='inner' is pandas' default, spelled out here for the reader.
    return fraud.merge(inpatient_outpatient_beneficiary, on='Provider', how='inner')


def join_csv(fraud, beneficiary, inpatient, outpatient, output_path='/content/merged.csv'):
    """
    function to join the four dataframes and save the result as a csv file.
    Joining by key BeneID (beneficiary) and Provider (fraud).
    parameters: fraud, beneficiary, inpatient, outpatient dataframes;
                output_path - where the merged csv is written
                (default '/content/merged.csv')
    return: None (the merged data is written to output_path, not returned)
    """
    # Join the files: stack inpatient/outpatient, then attach beneficiary
    # columns, then attach the fraud columns.
    merged = join_inpatient_outpatient(inpatient, outpatient)
    merged = join_inpatient_outpatient_beneficiary(merged, beneficiary)
    merged = join_inpatient_outpatient_beneficiary_fraud(merged, fraud)

    # Save the merged file as a CSV (without the row index column)
    merged.to_csv(output_path, index=False)


def generate_merged_data():
    """
    main function: loads the source dataframes with read_data and hands
    them to join_csv, which joins them and writes the merged csv
    parameters: None
    return: None
    """
    frames = read_data()
    # read_data is expected to yield exactly four frames, in this order.
    fraud_df, beneficiary_df, inpatient_df, outpatient_df = frames
    join_csv(fraud_df, beneficiary_df, inpatient_df, outpatient_df)



In [6]:
# Build '/content/merged.csv': reads the four source CSVs and writes their join to disk.
generate_merged_data()

## **Pre-processing**
### Module to perform data preprocessing. This is the first level of preprocessing. For visualization
### and machine learning modeling, separate preprocessing will be required according to the requirements.

In [7]:
from datetime import datetime
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
def read_data(path="/content/merged.csv"):
    """
    function to read the merged csv file
    parameters: path - location of the merged csv (default "/content/merged.csv")
    return: a single data frame with the contents of the merged csv
    raise FileNotFoundError: raised by pandas when the file is not found

    NOTE(review): this re-uses the name read_data, so once this cell runs it
    replaces the earlier read_data that loads the four source csv files;
    consider a distinct name if both need to stay callable.
    """
    # pd.read_csv raises FileNotFoundError on its own for a missing file; the
    # previous `except FileExistsError` handler never matched that error.
    merged = pd.read_csv(path)
    return merged