In [1]:
# Import the libraries used in this notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Read the data.
# The directory holding the CSV can be overridden with the INSURANCE_DATA_DIR
# environment variable, so the notebook is not tied to one machine's folder
# layout; when the variable is unset, the original location is used.
DATA_DIR = os.environ.get('INSURANCE_DATA_DIR', 'C:/Users/DONKAMS/Downloads/Project_STA2017')
DATA_FILE = 'NigeriaAutoInsurance_Vehicle (1).csv'
df = pd.read_csv(os.path.join(DATA_DIR, DATA_FILE))
# Preview the first five rows
df.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_00N7XT1,2010-07-06,2011-07-05,Female,71,2010-07-06,2,JEEP,Black,REXTON,Badagry,Benue,Car Classic,0
1,ID_01ZY7FF,2010-08-07,2011-08-06,Male,30,2010-08-07,1,Saloon,Grey,TOYOTA,Ikeja,Lagos,Car Classic,1
2,ID_022U6TV,2010-10-28,2011-10-27,Male,40,2010-10-28,2,Saloon,Black,Honda,Abuja Municipal,Abuja-Municipal,Car Classic,0
3,ID_02DJWEN,2010-01-23,2011-01-16,Male,41,2010-01-23,1,Saloon,Silver,TOYOTA,Yaba,Benue,Car Classic,0
4,ID_03U3I31,2010-09-06,2011-09-05,Male,39,2010-09-06,2,Saloon,Red,TOYOTA,Oshodi-Isolo,Oshodi-Isolo,Car Classic,0


## This dataset seems to contain information relevant to insurance policies, with details about the insured individuals, their vehicles, policy specifics, and possibly some target variable for prediction or analysis.

### ID: Unique identifier for each policy holder.
### Policy Start Date: The date when the insurance policy begins.
### Policy End Date: The date when the insurance policy expires.
### Gender: Gender of the policy holder.
### Age: Age of the policy holder.
### First Transaction Date: Date of the policy holder's first transaction with the insurance company.
### No_Pol: Number of policies held by the individual.
### Car_Category: Category or type of the insured vehicle (e.g., Saloon, JEEP).
### Subject_Car_Colour: Colour of the insured vehicle.
### Subject_Car_Make: Make or manufacturer of the insured vehicle (e.g., Toyota, Honda).
### LGA_Name: Local Government Area name.
### State: State of residence of the policy holder.
### ProductName: Type or name of the insurance product.
### target: Binary (0/1) outcome variable. Its exact meaning is not documented in the data; it possibly flags whether the policy holder is a target for something specific (e.g., renewal, promotion, or risk assessment).

### Given the vehicle-related columns, this appears to be an automobile insurance dataset.

In [7]:
# Summary statistics for every column: count/unique/top/freq for the text
# columns and mean/std/quantiles for the numeric ones
summary_stats = df.describe(include='all')
summary_stats

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
count,2667,2667,2667,2667,2667.0,2667,2667.0,2667,2667,2667,2667,2667,2667,2667.0
unique,2667,344,350,6,,344,,14,34,45,202,88,9,
top,ID_00N7XT1,2010-02-26,2011-02-25,Male,,2010-02-26,,Saloon,Black,TOYOTA,Victoria Island,Lagos,Car Classic,
freq,1,30,29,1673,,30,,1507,1147,1377,332,1468,2210,
mean,,,,,45.110611,,1.499063,,,,,,,0.187477
std,,,,,18.732415,,0.805725,,,,,,,0.390367
min,,,,,-27.0,,1.0,,,,,,,0.0
25%,,,,,36.0,,1.0,,,,,,,0.0
50%,,,,,42.0,,1.0,,,,,,,0.0
75%,,,,,51.0,,2.0,,,,,,,0.0


In [9]:
# Print a concise summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2667 entries, 0 to 2666
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      2667 non-null   object
 1   Policy Start Date       2667 non-null   object
 2   Policy End Date         2667 non-null   object
 3   Gender                  2667 non-null   object
 4   Age                     2667 non-null   int64 
 5   First Transaction Date  2667 non-null   object
 6   No_Pol                  2667 non-null   int64 
 7   Car_Category            2667 non-null   object
 8   Subject_Car_Colour      2667 non-null   object
 9   Subject_Car_Make        2667 non-null   object
 10  LGA_Name                2667 non-null   object
 11  State                   2667 non-null   object
 12  ProductName             2667 non-null   object
 13  target                  2667 non-null   int64 
dtypes: int64(3), object(11)
memory usage: 291.8+ KB


In [10]:
# Dimensions of the frame as a (rows, columns) pair
n_rows, n_cols = df.shape
(n_rows, n_cols)

(2667, 14)

### The dataset has 2,667 entries (rows), and each entry contains information across 14 different attributes of a car insurance policy.

In [11]:
# List the column labels of the DataFrame
df.columns

Index(['ID', 'Policy Start Date', 'Policy End Date', 'Gender', 'Age',
       'First Transaction Date', 'No_Pol', 'Car_Category',
       'Subject_Car_Colour', 'Subject_Car_Make', 'LGA_Name', 'State',
       'ProductName', 'target'],
      dtype='object')

# Data Cleaning and Preprocessing

In [13]:
# Show the dtype pandas inferred for each column.
# NOTE(review): the three date columns load as object (strings); they would
# need pd.to_datetime before any date arithmetic.
df.dtypes

ID                        object
Policy Start Date         object
Policy End Date           object
Gender                    object
Age                        int64
First Transaction Date    object
No_Pol                     int64
Car_Category              object
Subject_Car_Colour        object
Subject_Car_Make          object
LGA_Name                  object
State                     object
ProductName               object
target                     int64
dtype: object

In [14]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

ID                        0
Policy Start Date         0
Policy End Date           0
Gender                    0
Age                       0
First Transaction Date    0
No_Pol                    0
Car_Category              0
Subject_Car_Colour        0
Subject_Car_Make          0
LGA_Name                  0
State                     0
ProductName               0
target                    0
dtype: int64

### There are no missing values in the dataset.

In [15]:
# Flag rows that exactly repeat an earlier row, then count them
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

#### There are no duplicated entries either, so the dataset seems pretty clean; still, a quick check of the Age column is worthwhile.

In [16]:
# Count the distinct values in each column.
# NOTE(review): Gender reports 6 distinct values — worth inspecting the labels.
df.nunique()

ID                        2667
Policy Start Date          344
Policy End Date            350
Gender                       6
Age                         88
First Transaction Date     344
No_Pol                       7
Car_Category                14
Subject_Car_Colour          34
Subject_Car_Make            45
LGA_Name                   202
State                       88
ProductName                  9
target                       2
dtype: int64

In [17]:
# List the distinct ages (in order of first appearance) to spot implausible values
age_column = df['Age']
age_column.unique()

array([ 71,  30,  40,  41,  39,  47,  42,  37,  45,  52,  50,  44,  32,
        53, 120,  56,  26,  38,  43,  65,  46,  51,  58,  55,  34,  62,
        33,  49,  60,  67,  61,  64,  35,  28,  36,  10,  20,  54,  48,
        70,   7,  66,  63,  31,  57,  59,  19,  79,  73,   9,  12,  81,
        29,  78,  25,  -2,  89,  27,  72,  69,  84,  75,  76,   1,  68,
         6,   8,   5,  93,  13, -12,  83,  82,  80,  77,  74,  18, 320,
        17,   3,  85,   0,  22, -27,   2,   4,  24,  11], dtype=int64)

#### The age column in the dataset presents a diverse range of values, encompassing both negative figures (like -27 and -12) and exceedingly high ones (such as 320). This variety deviates significantly from the typical age range, suggesting potential anomalies or data entry errors. The inclusion of negative values and extremely high ages, which are highly improbable in real-world scenarios, indicates potential issues with data quality. Addressing these anomalies and errors will be crucial to ensure the accuracy and reliability of any subsequent analyses or modeling efforts involving this dataset's age-related information.