Step 1: Data Exploration and Summarization

1. Import Libraries and Load Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [9]:
# Load the dataset. pd.read_excel already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapping is needed.
data = pd.read_excel('topcover_pilot.xlsx')

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
data.info()
print(data.describe())

# Last expression of the cell: rendered as a rich table by the notebook.
data.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pno          1094 non-null   int64  
 1   Amount       1094 non-null   int64  
 2   Gender       1094 non-null   object 
 3   RG           1094 non-null   int64  
 4   Cover        1094 non-null   int64  
 5   SurrVal      1094 non-null   int64  
 6   Offer/APE    1094 non-null   float64
 7   PercCover    1094 non-null   float64
 8   CurrAge      1094 non-null   float64
 9   Fee_claimed  1094 non-null   int64  
 10  Take-up ind  1094 non-null   int64  
 11  Mngt_act     983 non-null    object 
dtypes: float64(3), int64(7), object(2)
memory usage: 102.7+ KB
None
               Pno        Amount           RG         Cover        SurrVal  \
count  1094.000000  1.094000e+03  1094.000000  1.094000e+03    1094.000000   
mean    547.500000  6.758929e+04     7.054845  6.292455e+05    7054.129799   
st

Unnamed: 0,Pno,Amount,Gender,RG,Cover,SurrVal,Offer/APE,PercCover,CurrAge,Fee_claimed,Take-up ind,Mngt_act
0,1,6444,M,4,48024,0,1.5,0.134183,71.25,0,0,BCD
1,2,9649,F,7,1000000,0,1.3,0.009649,60.916667,0,0,PIN
2,3,8688,M,8,800001,2250,1.1,0.01086,63.416667,0,0,PIN
3,4,7483,F,4,21713,0,4.7,0.344632,82.916667,0,0,BCD
4,5,8115,M,8,526919,0,1.3,0.015401,60.0,0,0,BCD


2. Data Prep function

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def prepare_data(data, target='Take_up_ind', id_columns=('Pno',),
                 test_size=0.2, random_state=42):
    """Clean the policy data, engineer ratio features and build train/test sets.

    Imputation, scaling and one-hot encoding are all fitted on the training
    split only (inside the returned preprocessor), so no statistics from the
    test split leak into the transformation.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input. Must contain the columns 'SurrVal', 'Cover' and 'Amount'
        (used for feature engineering) and the target column.
    target : str, default 'Take_up_ind'
        Name of the target column. If no column has exactly this name, a
        column whose name matches after lower-casing and replacing '-' and
        ' ' with '_' is used instead (so 'Take-up ind' is accepted).
    id_columns : sequence of str, default ('Pno',)
        Identifier columns to drop before de-duplication and modelling.
        A unique row identifier would otherwise make every row distinct
        (turning drop_duplicates into a no-op) and be scaled as a feature.
        Names that are not present in `data` are ignored.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed passed to train_test_split for a reproducible split.

    Returns
    -------
    X_train, X_test : array-like
        Transformed feature matrices as returned by the ColumnTransformer.
    y_train, y_test : pandas.Series
        Target values aligned with X_train / X_test.
    preprocessor : sklearn.compose.ColumnTransformer
        The transformer fitted on the training split.

    Raises
    ------
    KeyError
        If the target column cannot be resolved unambiguously, or if one of
        'SurrVal', 'Cover', 'Amount' is missing.
    """
    data = data.copy()

    # 0. Resolve the target column, tolerating '-' / ' ' vs '_' spelling.
    if target not in data.columns:
        def _normalise(name):
            return str(name).strip().lower().replace('-', '_').replace(' ', '_')

        wanted = _normalise(target)
        matches = [col for col in data.columns if _normalise(col) == wanted]
        if len(matches) != 1:
            raise KeyError(
                f"Target column {target!r} not found in data; "
                f"available columns: {list(data.columns)}"
            )
        target = matches[0]

    # 1. Drop identifier columns (never the target itself).
    droppable_ids = [col for col in id_columns if col in data.columns and col != target]
    data = data.drop(columns=droppable_ids)

    # 2. Rows without a label cannot be used for supervised learning.
    data = data.dropna(subset=[target])

    # 3. Remove Duplicates
    data = data.drop_duplicates()

    # 4. Feature Engineering
    data['Surrval_Cover'] = data['SurrVal'] / data['Cover']
    # Surrender value relative to the amount (the original divided the other
    # way round, which yields inf for every policy with SurrVal == 0).
    data['Surrval_Amount'] = data['SurrVal'] / data['Amount']
    data['PercCover'] = data['Amount'] / data['Cover']

    # A zero denominator produces +/-inf, which StandardScaler rejects; turn
    # those into NaN so the numeric imputer below handles them.
    ratio_columns = ['Surrval_Cover', 'Surrval_Amount', 'PercCover']
    data[ratio_columns] = data[ratio_columns].replace([np.inf, -np.inf], np.nan)

    # 5. Separate features / target and identify column types
    #    (after feature engineering, so the new columns are included).
    X = data.drop(columns=[target])
    y = data[target]

    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=['number']).columns.tolist()

    # 6. Handle missing values per column type inside the pipelines: the mean
    #    is only defined for numeric columns, and categorical gaps get an
    #    explicit 'missing' level rather than a guessed category.
    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # Categories seen only in the test split are encoded as all zeros
        # instead of raising at transform time.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numerical_features),
            ('cat', categorical_pipeline, categorical_features)
        ]
    )

    # 7. Splitting the Data into Training and Testing Sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 8. Applying the Transformations (fit on the training split only)
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test, preprocessor

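# Example usage
# Assuming 'df_data_split' is your DataFrame and its target column is named 'Take_up_ind'.
# Note: the raw Excel file loaded above names this column 'Take-up ind', so rename it first, e.g.
# df_data_split = data.rename(columns={'Take-up ind': 'Take_up_ind'})
# X_train, X_test, y_train, y_test, preprocessor = prepare_data(df_data_split)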
