In [1]:
import pandas as pd

In [2]:
# Load the 'Contrats' sheet of the insurance workbook into the raw frame.
df = pd.read_excel(
    '../../data/Donn_es_Assurance_S2.1.xlsx',
    sheet_name='Contrats',
)

Initial exploration

In [5]:
# Structural overview of the raw frame: each (label, value) pair is printed
# on its own line, followed by the pandas memory/dtype summary.
overview = (
    ('Dataset shape:', df.shape),
    ("\nColumn Names:", df.columns.tolist()),
    ("\nData Types:\n", df.dtypes),
    ("\nFirst 5 rows:\n", df.head()),
)
for label, value in overview:
    print(label, value)

print("\nBasic Info:")
df.info()

Dataset shape: (92981, 11)

Column Names: ['REF_PERSONNE', 'NUM_CONTRAT', 'LIB_PRODUIT', 'EFFET_CONTRAT', 'DATE_EXPIRATION', 'PROCHAIN_TERME', 'LIB_ETAT_CONTRAT', 'branche', 'somme_quittances', 'statut_paiement', 'Capital_assure']

Data Types:
 REF_PERSONNE          int64
NUM_CONTRAT           int64
LIB_PRODUIT          object
EFFET_CONTRAT        object
DATE_EXPIRATION      object
PROCHAIN_TERME       object
LIB_ETAT_CONTRAT     object
branche              object
somme_quittances    float64
statut_paiement      object
Capital_assure      float64
dtype: object

First 5 rows:
    REF_PERSONNE    NUM_CONTRAT  \
0        122335  1996101000186   
1        115271  1996103000612   
2        115197  1996102000382   
3        122355  1996101000747   
4        115302  1996103001078   

                                         LIB_PRODUIT            EFFET_CONTRAT  \
0  TEMPORAIRE DECES A CAPITAL DECROISSANT LINEAIR...  1996-06-06 00:00:00.000   
1   ASSURANCE VIE COMPLEMENT RETRAITE - HORIZON (A

Data quality assessment

In [6]:
def assess_data_quality(df):
    """Report missing values, cardinality and dtype for every column of ``df``.

    Prints a per-column table (missing count, missing percentage, number of
    distinct values) followed by one ``name: dtype`` line per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The printed table, indexed by column name, with columns
        ``Missing_Count``, ``Missing_Percentage`` and ``Unique_Values``.
    """
    # Count nulls once and reuse the result for both the count and the percentage.
    null_counts = df.isna().sum()
    missing_stats = pd.DataFrame({
        'Missing_Count': null_counts,
        'Missing_Percentage': (null_counts / len(df)) * 100,
        'Unique_Values': df.nunique(),
    })

    print("Missing Values Analysis:")
    print(missing_stats)

    # List each column's dtype so unexpected types are easy to spot.
    print("\nData Type Validation:")
    for column_name in df.columns:
        print(f"{column_name}: {df[column_name].dtype}")

    return missing_stats


# Baseline quality report on the raw frame, before any rows are dropped.
quality_report = assess_data_quality(df=df)

Missing Values Analysis:
                  Missing_Count  Missing_Percentage  Unique_Values
REF_PERSONNE                  0            0.000000          40864
NUM_CONTRAT                   0            0.000000          92981
LIB_PRODUIT                   0            0.000000             92
EFFET_CONTRAT                 0            0.000000           7913
DATE_EXPIRATION            7046            7.577892          12009
PROCHAIN_TERME            85971           92.460825           3624
LIB_ETAT_CONTRAT              0            0.000000              6
branche                       0            0.000000              7
somme_quittances             40            0.043020          31842
statut_paiement              40            0.043020              2
Capital_assure              106            0.114002           4467

Data Type Validation:
REF_PERSONNE: int64
NUM_CONTRAT: int64
LIB_PRODUIT: object
EFFET_CONTRAT: object
DATE_EXPIRATION: object
PROCHAIN_TERME: object
LIB_ETAT_CONTRAT: ob

Dropping rows with missing somme_quittances, statut_paiement and Capital_assure

In [7]:
# Keep only contracts that have both payment fields filled in; `df` itself
# is left untouched. (Each column had 40 missing values in the recorded run.)
payment_columns = ['somme_quittances', 'statut_paiement']
df2 = df.dropna(subset=payment_columns)

In [8]:
# Re-run the quality report to confirm the payment columns have no nulls left.
report = assess_data_quality(df=df2)

Missing Values Analysis:
                  Missing_Count  Missing_Percentage  Unique_Values
REF_PERSONNE                  0            0.000000          40856
NUM_CONTRAT                   0            0.000000          92941
LIB_PRODUIT                   0            0.000000             92
EFFET_CONTRAT                 0            0.000000           7913
DATE_EXPIRATION            7033            7.567166          12009
PROCHAIN_TERME            85933           92.459733           3623
LIB_ETAT_CONTRAT              0            0.000000              6
branche                       0            0.000000              7
somme_quittances              0            0.000000          31842
statut_paiement               0            0.000000              2
Capital_assure               92            0.098988           4465

Data Type Validation:
REF_PERSONNE: int64
NUM_CONTRAT: int64
LIB_PRODUIT: object
EFFET_CONTRAT: object
DATE_EXPIRATION: object
PROCHAIN_TERME: object
LIB_ETAT_CONTRAT: ob

In [9]:
# Also discard contracts with no insured capital.
# NOTE(review): in the recorded reports this step reduces the distinct
# `branche` values from 7 to 6 and `LIB_PRODUIT` from 92 to 91 — confirm that
# losing a whole branch/product is acceptable for downstream analysis.
capital_columns = ['Capital_assure']
df2 = df2.dropna(subset=capital_columns)

In [10]:
# Final check after the Capital_assure drop; the date columns are expected
# to still contain nulls because no cell drops rows on them.
report = assess_data_quality(df=df2)

Missing Values Analysis:
                  Missing_Count  Missing_Percentage  Unique_Values
REF_PERSONNE                  0            0.000000          40825
NUM_CONTRAT                   0            0.000000          92849
LIB_PRODUIT                   0            0.000000             91
EFFET_CONTRAT                 0            0.000000           7913
DATE_EXPIRATION            6958            7.493888          12009
PROCHAIN_TERME            85916           92.533038           3617
LIB_ETAT_CONTRAT              0            0.000000              6
branche                       0            0.000000              6
somme_quittances              0            0.000000          31781
statut_paiement               0            0.000000              2
Capital_assure                0            0.000000           4465

Data Type Validation:
REF_PERSONNE: int64
NUM_CONTRAT: int64
LIB_PRODUIT: object
EFFET_CONTRAT: object
DATE_EXPIRATION: object
PROCHAIN_TERME: object
LIB_ETAT_CONTRAT: ob

Exporting the data

In [11]:
# Persist the cleaned contracts frame as a pickle file.
output_path = '../../data/contrats.pkl'
df2.to_pickle(output_path)