importing dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df = pd.read_excel('../../data/Donn_es_Assurance_S2.1.xlsx',sheet_name='personne_physique')

Initial exploration

In [3]:
print('Dataset shape:',df.shape)
print("\nColumn Names:", df.columns.tolist())
print("\nData Types:\n", df.dtypes)
print("\nFirst 5 rows:\n", df.head())
print("\nBasic Info:")
df.info()

Dataset shape: (43314, 12)

Column Names: ['REF_PERSONNE', 'NOM_PRENOM', 'DATE_NAISSANCE', 'LIEU_NAISSANCE', 'CODE_SEXE', 'SITUATION_FAMILIALE', 'NUM_PIECE_IDENTITE', 'LIB_SECTEUR_ACTIVITE', 'LIB_PROFESSION', 'VILLE', 'LIB_GOUVERNORAT', 'VILLE_GOUVERNORAT']

Data Types:
 REF_PERSONNE             int64
NOM_PRENOM              object
DATE_NAISSANCE          object
LIEU_NAISSANCE          object
CODE_SEXE               object
SITUATION_FAMILIALE     object
NUM_PIECE_IDENTITE       int64
LIB_SECTEUR_ACTIVITE    object
LIB_PROFESSION          object
VILLE                   object
LIB_GOUVERNORAT         object
VILLE_GOUVERNORAT       object
dtype: object

First 5 rows:
    REF_PERSONNE      NOM_PRENOM           DATE_NAISSANCE LIEU_NAISSANCE  \
0           715  Personne_00001  1964-09-28 00:00:00.000            NaN   
1          1381  Personne_00002  1962-02-27 00:00:00.000          TUNIS   
2          1947  Personne_00003  1970-04-17 00:00:00.000          TUNIS   
3          2832  Personne_

Data quality assessment

In [4]:
def assess_data_quality(df):
    """Comprehensive data quality assessment"""
    
    # Missing values analysis
    missing_stats = pd.DataFrame({
        'Missing_Count': df.isnull().sum(),
        'Missing_Percentage': (df.isnull().sum() / len(df)) * 100,
        'Unique_Values': df.nunique()
    })
    
    print("Missing Values Analysis:")
    print(missing_stats)
    
    # Data type validation
    print("\nData Type Validation:")
    for col in df.columns:
        print(f"{col}: {df[col].dtype}")
    
    return missing_stats


quality_report = assess_data_quality(df)

Missing Values Analysis:
                      Missing_Count  Missing_Percentage  Unique_Values
REF_PERSONNE                      0            0.000000          43314
NOM_PRENOM                        0            0.000000          43314
DATE_NAISSANCE                  937            2.163273          18384
LIEU_NAISSANCE                 5653           13.051207           5463
CODE_SEXE                         0            0.000000              3
SITUATION_FAMILIALE               0            0.000000              5
NUM_PIECE_IDENTITE                0            0.000000          43314
LIB_SECTEUR_ACTIVITE              1            0.002309             31
LIB_PROFESSION                 2802            6.469040            690
VILLE                         14697           33.931292           3362
LIB_GOUVERNORAT               27923           64.466454             24
VILLE_GOUVERNORAT                 0            0.000000           4054

Data Type Validation:
REF_PERSONNE: int64
NOM_PRENO

Column-specific Cleaning

REF_PERSONNE Column

In [5]:
# checking for duplicates and validity
print('REF_PERSONNE unique values:',df['REF_PERSONNE'].nunique())
print('REF_PERSONNE duplicates:',df['REF_PERSONNE'].duplicated().sum())

REF_PERSONNE unique values: 43314
REF_PERSONNE duplicates: 0


Nom_Prenom column

In [6]:
# Check name patterns and anomalies
print("Name column analysis:")
print("Unique names:", df['NOM_PRENOM'].nunique())
print("Sample names:", df['NOM_PRENOM'].head(10).tolist())

# Check for special characters or anomalies
x = df['NOM_PRENOM'].str.len()
print("Name length stats:",x.describe())

Name column analysis:
Unique names: 43314
Sample names: ['Personne_00001', 'Personne_00002', 'Personne_00003', 'Personne_00004', 'Personne_00005', 'Personne_00006', 'Personne_00007', 'Personne_00008', 'Personne_00009', 'Personne_00010']
Name length stats: count    43314.0
mean        14.0
std          0.0
min         14.0
25%         14.0
50%         14.0
75%         14.0
max         14.0
Name: NOM_PRENOM, dtype: float64


In [7]:
df.columns

Index(['REF_PERSONNE', 'NOM_PRENOM', 'DATE_NAISSANCE', 'LIEU_NAISSANCE',
       'CODE_SEXE', 'SITUATION_FAMILIALE', 'NUM_PIECE_IDENTITE',
       'LIB_SECTEUR_ACTIVITE', 'LIB_PROFESSION', 'VILLE', 'LIB_GOUVERNORAT',
       'VILLE_GOUVERNORAT'],
      dtype='object')

Date_Naissance Column

In [8]:
temp = df.copy()

In [None]:
def clean_age_data(df, date_col='DATE_NAISSANCE', min_age=18, max_age=100):

    temp = df.copy()
    
    # Convert to datetime and handle invalid dates
    temp[date_col] = pd.to_datetime(temp[date_col], errors='coerce')
    
    # Calculate age
    current_date = pd.Timestamp.now()
    temp['AGE'] = (current_date - temp[date_col]).dt.days // 365
    
    # Identify records with unrealistic ages
    mask_invalid_dates = temp[date_col].isnull()
    mask_too_young = temp['AGE'] < min_age
    mask_too_old = temp['AGE'] > max_age
    mask_future_dates = temp[date_col] > current_date
    mask_negative_age = temp['AGE'] < 0
    
    # Combine all invalid age conditions
    invalid_age_mask = (mask_invalid_dates | mask_too_young | mask_too_old | 
                       mask_future_dates | mask_negative_age)
    
    # Remove rows with unrealistic ages
    original_count = len(temp)
    temp = temp[~invalid_age_mask].copy()
    removed_count = original_count - len(temp)
    
    return temp, removed_count

In [10]:
df,removed_count = clean_age_data(df)

In [11]:
df.shape

(42065, 13)

LIEU_NAISSANCE Column

In [12]:
# Analyze birthplace data
print("Unique birthplaces:", df['LIEU_NAISSANCE'].nunique())
print("Missing birthplaces:", df['LIEU_NAISSANCE'].isnull().sum())

# Check for standardization issues
birthplace_counts = df['LIEU_NAISSANCE'].value_counts()
print("Top 10 birthplaces:\n", birthplace_counts.head(10))

Unique birthplaces: 5451
Missing birthplaces: 4735
Top 10 birthplaces:
 LIEU_NAISSANCE
TUNIS       7067
GABES       2239
SFAX        1612
BIZERTE     1329
SOUSSE      1191
BEJA        1130
KAIROUAN     632
MONASTIR     565
JENDOUBA     502
NABEUL       373
Name: count, dtype: int64


In [13]:
# Correct way: Direct assignment
df['LIEU_NAISSANCE'] = df['LIEU_NAISSANCE'].fillna('TUNIS')

# Verify the change
print("Missing birthplaces after fill:", df['LIEU_NAISSANCE'].isnull().sum())
print("New value count for 'TUNIS':", (df['LIEU_NAISSANCE'] == 'TUNIS').sum())

Missing birthplaces after fill: 0
New value count for 'TUNIS': 11802


Checking for empty string values in birthplace

In [14]:
# Count the number of empty strings
empty_count = (df['LIEU_NAISSANCE'] == '').sum()
print(f"Number of empty strings: {empty_count}")

# You can also see the actual rows with empty strings
empty_birthplaces = df[df['LIEU_NAISSANCE'] == '']
print(f"Rows with empty birthplace:\n{empty_birthplaces}")

Number of empty strings: 0
Rows with empty birthplace:
Empty DataFrame
Columns: [REF_PERSONNE, NOM_PRENOM, DATE_NAISSANCE, LIEU_NAISSANCE, CODE_SEXE, SITUATION_FAMILIALE, NUM_PIECE_IDENTITE, LIB_SECTEUR_ACTIVITE, LIB_PROFESSION, VILLE, LIB_GOUVERNORAT, VILLE_GOUVERNORAT, AGE]
Index: []


CODE_SEXE Column

In [15]:
# Validate gender codes
print("Gender code distribution:")
print(df['CODE_SEXE'].value_counts())

# Check for invalid codes
valid_genders = ['M', 'F']
invalid_genders = df[~df['CODE_SEXE'].isin(valid_genders)]
print(f"Invalid gender codes: {len(invalid_genders)}")

Gender code distribution:
CODE_SEXE
M    29294
F    12081
       690
Name: count, dtype: int64
Invalid gender codes: 690


Handling empty string values in CODE_SEXE

In [16]:
# Replace empty strings with 'Unknown'
df['CODE_SEXE'] = df['CODE_SEXE'].replace('', 'Unknown')

# Verify the change
print("Gender code distribution after cleaning:")
print(df['CODE_SEXE'].value_counts())

# Check for invalid codes again (now only truly invalid entries, if any)
valid_genders = ['M', 'F', 'Unknown'] # Add 'Unknown' to the valid list for checking
invalid_genders = df[~df['CODE_SEXE'].isin(valid_genders)]
print(f"Invalid gender codes: {len(invalid_genders)}")

Gender code distribution after cleaning:
CODE_SEXE
M    29294
F    12081
       690
Name: count, dtype: int64
Invalid gender codes: 690


SITUATION_FAMILIALE COLUMN

In [17]:
# Analyze marital status
print("Marital status distribution:")
print(df['SITUATION_FAMILIALE'].value_counts())

Marital status distribution:
SITUATION_FAMILIALE
Marie          30193
Celibataire     9509
                1518
Veuf(ve)         508
Divorce          337
Name: count, dtype: int64


Handling empty string values in SITUATION_FAMILIALE

In [18]:
# 1. Check initial state and confirm whitespace entries
print("Initial Marital Status Distribution:")
print(df['SITUATION_FAMILIALE'].value_counts())
whitespace_count = (df['SITUATION_FAMILIALE'].str.strip() == '').sum()
print(f"Initial whitespace-only entries: {whitespace_count}\n")

# 2. Replace whitespace-only strings with 'Unknown'
mask = df['SITUATION_FAMILIALE'].str.strip() == ''
df.loc[mask, 'SITUATION_FAMILIALE'] = 'Unknown'

# 3. Verify the cleanup was successful
print("Final Marital Status Distribution:")
final_counts = df['SITUATION_FAMILIALE'].value_counts()
print(final_counts)
print(f"\nTotal 'Unknown' entries: {final_counts.get('Unknown', 0)}")

# 4. Final validation check
valid_statuses = ['Marie', 'Celibataire', 'Veuf(ve)', 'Divorce', 'Unknown']
is_valid = df['SITUATION_FAMILIALE'].isin(valid_statuses)
print(f"Rows with valid marital status: {is_valid.sum()}/{len(df)}")
print(f"Rows with invalid marital status: {(~is_valid).sum()}")

Initial Marital Status Distribution:
SITUATION_FAMILIALE
Marie          30193
Celibataire     9509
                1518
Veuf(ve)         508
Divorce          337
Name: count, dtype: int64
Initial whitespace-only entries: 1518

Final Marital Status Distribution:
SITUATION_FAMILIALE
Marie          30193
Celibataire     9509
Unknown         1518
Veuf(ve)         508
Divorce          337
Name: count, dtype: int64

Total 'Unknown' entries: 1518
Rows with valid marital status: 42065/42065
Rows with invalid marital status: 0


NUM_PIECE_IDENTITE Column

In [19]:
# Validate ID numbers
print("ID number analysis:")
print("Unique IDs:", df['NUM_PIECE_IDENTITE'].nunique())
print("Missing IDs:", df['NUM_PIECE_IDENTITE'].isnull().sum())

# Check ID format consistency
df['id_length'] = df['NUM_PIECE_IDENTITE'].astype(str).str.len()
print("ID length distribution:\n", df['id_length'].value_counts())

ID number analysis:
Unique IDs: 42065
Missing IDs: 0
ID length distribution:
 id_length
8    42065
Name: count, dtype: int64


Profession Column

In [20]:
# Analyze professional information
professional_cols = ['LIB_SECTEUR_ACTIVITE', 'LIB_PROFESSION']

for col in professional_cols:
    print(f"\n{col} Analysis:")
    print("Missing values:", df[col].isnull().sum())
    print("Unique values:", df[col].nunique())
    print("Top 10 values:\n", df[col].value_counts().head(10))


LIB_SECTEUR_ACTIVITE Analysis:
Missing values: 0
Unique values: 31
Top 10 values:
 LIB_SECTEUR_ACTIVITE
STATION  DE SERVICE                      13655
                                          7465
COMMERCIAL                                5346
AGRICULTURE, CHASSE, SERVICES ANNEXES     3633
EMPLOYÉS                                  3348
AUCUN                                     2522
OUVRIERS                                   923
INDUSTRIE                                  914
RETRAITÉS                                  724
ÉDUCATION                                  719
Name: count, dtype: int64

LIB_PROFESSION Analysis:
Missing values: 2801
Unique values: 680
Top 10 values:
 LIB_PROFESSION
COMMERCIAL                              5418
NON FOURNI                              4601
NON DEFINIE                             2520
AGRICULTEUR                             2365
GERANT                                  2307
RETRAITE                                2267
OUVRIER                         

Unique values in LIB_SECTEUR_ACTIVITE

In [21]:
df['LIB_SECTEUR_ACTIVITE'].nunique()

31

Unique valyes in LIB_PROFESSION

In [22]:
df['LIB_PROFESSION'].nunique()

680

Cleaning LIB_SECTEUR_ACTIVITE

In [23]:
# First, clean the missing values represented as whitespace
print("Initial analysis for LIB_SECTEUR_ACTIVITE:")
print(f"Missing (NaN): {df['LIB_SECTEUR_ACTIVITE'].isnull().sum()}")
print(f"Whitespace-only: {(df['LIB_SECTEUR_ACTIVITE'].str.strip() == '').sum()}")

# Replace whitespace-only strings with a label
mask = df['LIB_SECTEUR_ACTIVITE'].str.strip() == ''
df.loc[mask, 'LIB_SECTEUR_ACTIVITE'] = 'NON_RENSEIGNE'  

# Verify cleaning
print("\nAfter cleaning whitespace:")
print(df['LIB_SECTEUR_ACTIVITE'].value_counts().head())

Initial analysis for LIB_SECTEUR_ACTIVITE:
Missing (NaN): 0
Whitespace-only: 7465

After cleaning whitespace:
LIB_SECTEUR_ACTIVITE
STATION  DE SERVICE                      13655
NON_RENSEIGNE                             7465
COMMERCIAL                                5346
AGRICULTURE, CHASSE, SERVICES ANNEXES     3633
EMPLOYÉS                                  3348
Name: count, dtype: int64


Grouping LIB_SECTEUR_ACTIVITE into meaningful categories

In [24]:
secteur_list = df['LIB_SECTEUR_ACTIVITE'].unique().tolist()
secteur_list

['CADRES ET PROFESSIONS INTELLECTUELLES SUPÉRIEURES',
 'SERVICES PERSONNELS',
 'COMMERCIAL',
 'AUCUN',
 'NON_RENSEIGNE',
 'OUVRIERS',
 'STATION  DE SERVICE',
 'EMPLOYÉS',
 'AGRICULTURE, CHASSE, SERVICES ANNEXES',
 'RETRAITÉS',
 'ARTISANS, COMMERÇANTS ET CHEFS D ENTREPRISE',
 'PROFESSIONS INTERMÉDIAIRES',
 'AUTRES PERSONNES SANS ACTIVITÉ PROFESSIONNELLE',
 'ACTIVITES IARD TARIFIABLES',
 'COMMERCE DE GROS ET INTERMÉDIAIRES DU COMMERCE',
 'INGENIEUR',
 'COMMERCE ET RÉPARATION AUTOMOBILE',
 'ACTIVITE SPORTIVE',
 'COMMERCE DE DÉTAIL ET RÉPARATION D ARTICLES DOMESTIQUES',
 'ADMINISTRATION PUBLIQUE',
 'ASSURANCE',
 'EXTRACTION DE MINERAIS MÉTALLIQUES',
 'RECHERCHE ET DÉVELOPPEMENT',
 'SANTÉ ET ACTION SOCIALE',
 'INDUSTRIE ',
 'POSTES ET TÉLÉCOMMUNICATIONS',
 'ÉDUCATION',
 'PÊCHE, AQUACULTURE',
 'COKÉFACTION, RAFFINAGE, INDUSTRIES NUCLÉAIRES',
 'TRANSPORTS AÉRIENS',
 'INDUSTRIES ALIMENTAIRES']

In [25]:
sector_mapping = {
    # High-income professionals (low physical risk, high asset protection needs)
    'CADRES ET PROFESSIONS INTELLECTUELLES SUPÉRIEURES': 'CADRES_SUPERIEURS',
    'INGENIEUR': 'CADRES_SUPERIEURS',
    'RECHERCHE ET DÉVELOPPEMENT': 'CADRES_SUPERIEURS',
    
    # Commercial and business activities
    'COMMERCIAL': 'COMMERCE_ET_VENTE',
    'COMMERCE DE GROS ET INTERMÉDIAIRES DU COMMERCE': 'COMMERCE_ET_VENTE',
    'COMMERCE ET RÉPARATION AUTOMOBILE': 'COMMERCE_ET_VENTE',
    'COMMERCE DE DÉTAIL ET RÉPARATION D ARTICLES DOMESTIQUES': 'COMMERCE_ET_VENTE',
    'ARTISANS, COMMERÇANTS ET CHEFS D ENTREPRISE': 'COMMERCE_ET_VENTE',
    
    # Services sector (varied risk profiles)
    'SERVICES PERSONNELS': 'SERVICES',
    'STATION DE SERVICE': 'SERVICES',
    'ACTIVITE SPORTIVE': 'SERVICES',
    'SERVICES': 'SERVICES',
    
    # Industrial and manual labor (higher physical risk)
    'OUVRIERS': 'INDUSTRIE_ET_CONSTRUCTION',
    'INDUSTRIE': 'INDUSTRIE_ET_CONSTRUCTION',
    'INDUSTRIES ALIMENTAIRES': 'INDUSTRIE_ET_CONSTRUCTION',
    'EXTRACTION DE MINERAIS MÉTALLIQUES': 'INDUSTRIE_ET_CONSTRUCTION',
    'COKÉFACTION, RAFFINAGE, INDUSTRIES NUCLÉAIRES': 'INDUSTRIE_ET_CONSTRUCTION',
    
    # Agriculture and fishing (specialized risks)
    'AGRICULTURE, CHASSE, SERVICES ANNEXES': 'AGRICULTURE_ET_PECHE',
    'PÊCHE, AQUACULTURE': 'AGRICULTURE_ET_PECHE',
    
    # Public sector and administration (stable, low risk)
    'ADMINISTRATION PUBLIQUE': 'ADMINISTRATION_PUBLIQUE',
    'POSTES ET TÉLÉCOMMUNICATIONS': 'ADMINISTRATION_PUBLIQUE',
    'EMPLOYÉS': 'ADMINISTRATION_PUBLIQUE',  # Assuming most employees are in public sector
    
    # Education and healthcare (professional services)
    'ÉDUCATION': 'EDUCATION_ET_SANTE',
    'SANTÉ ET ACTION SOCIALE': 'EDUCATION_ET_SANTE',
    'PROFESSIONS INTERMÉDIAIRES': 'EDUCATION_ET_SANTE',  # Often includes teachers, nurses, etc.
    
    # Insurance and finance (professional services)
    'ASSURANCE': 'FINANCE_ET_ASSURANCE',
    'ACTIVITES IARD TARIFIABLES': 'FINANCE_ET_ASSURANCE',  # IARD = Incendie, Accidents, Risques Divers
    
    # Transportation (higher risk)
    'TRANSPORTS AÉRIENS': 'TRANSPORTS',
    'TRANSPORTS': 'TRANSPORTS',
    
    # Special categories
    'RETRAITÉS': 'RETRAITES',
    'AUCUN': 'SANS_EMPLOI',
    'AUTRES PERSONNES SANS ACTIVITÉ PROFESSIONNELLE': 'SANS_EMPLOI',
    'NON_RENSEIGNE': 'NON_RENSEIGNE'
}

# Apply the mapping
df['SECTEUR_ACTIVITE_GROUP'] = df['LIB_SECTEUR_ACTIVITE'].map(sector_mapping)

# For any values not in our mapping, assign to 'AUTRES_SECTEURS'
df['SECTEUR_ACTIVITE_GROUP'] = df['SECTEUR_ACTIVITE_GROUP'].fillna('AUTRES_SECTEURS')

# Verify the grouping
print("Distribution of sector groups:")
print(df['SECTEUR_ACTIVITE_GROUP'].value_counts())

print(f"\nNumber of unique sector groups: {df['SECTEUR_ACTIVITE_GROUP'].nunique()}")

Distribution of sector groups:
SECTEUR_ACTIVITE_GROUP
AUTRES_SECTEURS              14569
NON_RENSEIGNE                 7465
COMMERCE_ET_VENTE             5883
AGRICULTURE_ET_PECHE          3644
ADMINISTRATION_PUBLIQUE       3639
SANS_EMPLOI                   2859
EDUCATION_ET_SANTE            1144
INDUSTRIE_ET_CONSTRUCTION      947
RETRAITES                      724
CADRES_SUPERIEURS              666
SERVICES                       333
FINANCE_ET_ASSURANCE           151
TRANSPORTS                      41
Name: count, dtype: int64

Number of unique sector groups: 13


LIB_PROFESSION 

In [26]:
profession_list = df['LIB_PROFESSION'].unique().tolist()
profession_list

['MÉDECINS LIBÉRAUX GÉNÉRALISTES',
 'CHAUFFEUR',
 'COMMERCIAL',
 'NON DEFINIE',
 'ETUDIANTE',
 'AUTRES OUVRIERS QUALIFIÉS DES TRAVAUX PUBLICS',
 'OUVRIERS QUALIFIÉS D ENTRETIEN GÉNÉRAL DES BÂTIMENTS',
 'INSPECTEUR',
 'AGENTS ADMINISTRATIFS DE LA FONCTION PUBLIQUE (Y.C. ENSEIGNEMENT)',
 'CONDUCTEURS DE VÉHICULE ROUTIER DE TRANSPORT EN COMMUN (SALARIÉS)',
 'AGRICULTEURS SUR PETITE EXPLOITATION DE CÉRÉALES-GRANDES CULTURES',
 'AGRICULTEURS SUR GRANDE EXPLOITATION DE CÉRÉALES-GRANDES CULTURES',
 'AGRICULTEURS SUR MOYENNE EXPLOITATION SANS ORIENTATION DOMINANTE',
 'EMPLOYÉS DES SERVICES COMMERCIAUX DE LA BANQUE',
 'AGRICULTEURS SUR PETITE EXPLOITATION SANS ORIENTATION DOMINANTE',
 'EMPLOYÉS ADMINISTRATIFS QUALIFIÉS DES AUTRES SERVICES DES ENTREPRISES',
 'EMPLOYÉS QUALIFIÉS DES SERVICES COMMERCIAUX DES ENTREPRISES (HORS VENTE)',
 'ANCIENS EMPLOYÉS',
 'ARTISANS DIVERS DE FABRICATION DE MACHINES',
 'AUTRES AGENTS ET OUVRIERS QUALIFIÉS (SÉDENTAIRES) DES SERVICES D EXPLOITATION DES TRANSPORTS',


In [27]:
df2 = df.copy()

In [28]:
# First, handle missing/unknown values
df['LIB_PROFESSION'] = df['LIB_PROFESSION'].fillna('NON_RENSEIGNE')
mask = df['LIB_PROFESSION'].str.strip() == ''
df.loc[mask, 'LIB_PROFESSION'] = 'NON_RENSEIGNE'

# Define comprehensive profession grouping function
def group_profession(profession):
    if pd.isna(profession):
        return 'NON_RENSEIGNE'
    
    prof_str = str(profession).upper()
    
    # Handle non-specific categories first
    if any(word in prof_str for word in ['NON FOURNI', 'NON DEFINIE', 'NON_RENSEIGNE', 'NON RENSEIGNE']):
        return 'NON_RENSEIGNE'
    if 'RETRAITE' in prof_str or 'ANCIEN' in prof_str:
        return 'RETRAITES'
    if any(word in prof_str for word in ['ETUDIANT', 'ELEVE', 'ECOLIER', 'STAGIAIRE']):
        return 'ETUDIANTS'
    if any(word in prof_str for word in ['CHOMAGE', 'SANS EMPLOI', 'AUCUN', 'AUCUNE ACTIVITE']):
        return 'SANS_EMPLOI'
    
    # High-income professionals (low risk, high asset value)
    if any(word in prof_str for word in ['CADRE', 'INGENIEUR', 'ARCHITECTE', 'EXPERT', 'CONSULTANT', 
                                       'DIRECTEUR', 'MANAGER', 'CHEF SERVICE', 'RESPONSABLE', 'BANQUIER',
                                       'AVOCAT', 'MEDECIN', 'PHARMACIEN', 'DENTISTE', 'VETERINAIRE',
                                       'JOURNALISTE', 'PROFESSEUR', 'ENSEIGNANT', 'CHERCHEUR', 'MAITRE']):
        return 'CADRES_SUPERIEURS'
    
    # Commercial and sales professions
    if any(word in prof_str for word in ['COMMERCIAL', 'VENDEUR', 'GERANT', 'NEGOCIANT', 'REPRESENTANT',
                                       'DELEGUE', 'COURTIER', 'AGENT COMMERCIAL', 'CONSEILLER COMMERCIAL']):
        return 'COMMERCE_ET_VENTE'
    
    # Technical and engineering professions
    if any(word in prof_str for word in ['TECHNICIEN', 'ELECTRICIEN', 'MECANICIEN', 'PLOMBIER', 'CHAUFFAGISTE',
                                       'SOUDEUR', 'MONTEUR', 'OPERATEUR', 'CONTRÔLEUR', 'MAINTENANCE']):
        return 'TECHNICIENS_ET_ARTISANS'
    
    # Administrative and office workers
    if any(word in prof_str for word in ['ADMINISTRATIF', 'SECRETAIRE', 'COMPTABLE', 'GESTIONNAIRE', 'AGENT',
                                       'EMPLOYE', 'ASSISTANT', 'CAISSIER', 'STANDARDISTE', 'DACTYLO']):
        return 'ADMINISTRATION_ET_BUREAU'
    
    # Healthcare professions
    if any(word in prof_str for word in ['INFIRMIER', 'AIDE SOIGNANT', 'KINESITHERAPEUTE', 'SAGE FEMME',
                                       'MEDICAL', 'PHARMACEUTIQUE', 'BIOLOGISTE', 'RADIOLOGUE']):
        return 'SANTE_ET_MEDICAL'
    
    # Security and defense
    if any(word in prof_str for word in ['POLICE', 'GENDARME', 'MILITAIRE', 'SECURITE', 'SURVEILLANT',
                                       'GARDIEN', 'DOUANE', 'POMPIER', 'AGENT DE SECURITE']):
        return 'SECURITE_ET_DEFENSE'
    
    # Transportation professionals (higher risk)
    if any(word in prof_str for word in ['CHAUFFEUR', 'CONDUCTEUR', 'PILOTE', 'TAXISTE', 'ROUTIER',
                                       'TRANSPORT', 'LIVREUR', 'AMBULANCIER', 'MARIN']):
        return 'TRANSPORTS'
    
    # Education and social services
    if any(word in prof_str for word in ['EDUCATEUR', 'FORMATEUR', 'ANIMATEUR', 'ASSISTANT SOCIAL', 
                                       'MONITEUR', 'PROFESSEUR', 'ENSEIGNANT', 'INSTITUTEUR']):
        return 'EDUCATION_ET_SOCIAL'
    
    # Construction and manual labor (higher physical risk)
    if any(word in prof_str for word in ['OUVRIER', 'MACON', 'MENUISIER', 'PEINTRE', 'CARRELEUR',
                                       'PLATRIER', 'CHARBENTIER', 'CONSTRUCT', 'BATIMENT', 'CHANTIER']):
        return 'BATIMENT_ET_TRAVAUX'
    
    # Agriculture and fishing
    if any(word in prof_str for word in ['AGRICULTEUR', 'ELEVEUR', 'PECHEUR', 'VITICULTEUR', 'ARBORICULTEUR',
                                       'JARDINIER', 'FORESTIER', 'HORTICULTEUR']):
        return 'AGRICULTURE_ET_PECHE'
    
    # Hospitality and services
    if any(word in prof_str for word in ['HOTESSE', 'SERVEUR', 'CUISINIER', 'RESTAURATION', 'HOTELLERIE',
                                       'COIFFEUR', 'ESTHETICIEN', 'MANUCURE', 'TOURISME']):
        return 'HOTELLERIE_RESTAURATION'
    
    # Arts and entertainment
    if any(word in prof_str for word in ['ARTISTE', 'ACTEUR', 'MUSICIEN', 'PHOTOGRAPHE', 'DESSINATEUR',
                                       'REALISATEUR', 'DECORATEUR', 'STYLISTE']):
        return 'ARTS_ET_SPECTACLE'
    
    # Industrial workers
    if any(word in prof_str for word in ['INDUSTRIE', 'USINE', 'PRODUCTION', 'MANUTENTION', 'MACHINISTE',
                                       'FABRICATION', 'ASSEMBLAGE']):
        return 'INDUSTRIE_ET_PRODUCTION'
    
    return 'AUTRES_PROFESSIONS'

# Apply the grouping
df['PROFESSION_GROUP'] = df['LIB_PROFESSION'].apply(group_profession)

# Verify the results
print("Profession groups distribution:")
print(df['PROFESSION_GROUP'].value_counts())
print(f"\nNumber of unique profession groups: {df['PROFESSION_GROUP'].nunique()}")

# Show sample mapping for verification
sample_professions = ['INGENIEUR', 'COMMERCIAL', 'OUVRIER', 'INFIRMIER', 'CHAUFFEUR', 'PROFESSEUR']
for prof in sample_professions:
    print(f"{prof} -> {group_profession(prof)}")

Profession groups distribution:
PROFESSION_GROUP
NON_RENSEIGNE               9922
COMMERCE_ET_VENTE           7838
ADMINISTRATION_ET_BUREAU    4473
AUTRES_PROFESSIONS          4185
AGRICULTURE_ET_PECHE        3468
CADRES_SUPERIEURS           3121
RETRAITES                   3041
SECURITE_ET_DEFENSE         1408
BATIMENT_ET_TRAVAUX         1363
ETUDIANTS                    771
TECHNICIENS_ET_ARTISANS      734
ARTS_ET_SPECTACLE            630
TRANSPORTS                   552
SANTE_ET_MEDICAL             301
EDUCATION_ET_SOCIAL          120
INDUSTRIE_ET_PRODUCTION       71
HOTELLERIE_RESTAURATION       65
SANS_EMPLOI                    2
Name: count, dtype: int64

Number of unique profession groups: 18
INGENIEUR -> CADRES_SUPERIEURS
COMMERCIAL -> COMMERCE_ET_VENTE
OUVRIER -> BATIMENT_ET_TRAVAUX
INFIRMIER -> SANTE_ET_MEDICAL
CHAUFFEUR -> TRANSPORTS
PROFESSEUR -> CADRES_SUPERIEURS


In [29]:
def assess_data_quality(df):
    """Comprehensive data quality assessment"""
    
    # Missing values analysis
    missing_stats = pd.DataFrame({
        'Missing_Count': df.isnull().sum(),
        'Missing_Percentage': (df.isnull().sum() / len(df)) * 100,
        'Unique_Values': df.nunique()
    })
    
    print("Missing Values Analysis:")
    print(missing_stats)
    
    # Data type validation
    print("\nData Type Validation:")
    for col in df.columns:
        print(f"{col}: {df[col].dtype}")
    
    return missing_stats


quality_report = assess_data_quality(df2)

Missing Values Analysis:
                        Missing_Count  Missing_Percentage  Unique_Values
REF_PERSONNE                        0            0.000000          42065
NOM_PRENOM                          0            0.000000          42065
DATE_NAISSANCE                      0            0.000000          18100
LIEU_NAISSANCE                      0            0.000000           5451
CODE_SEXE                           0            0.000000              3
SITUATION_FAMILIALE                 0            0.000000              5
NUM_PIECE_IDENTITE                  0            0.000000          42065
LIB_SECTEUR_ACTIVITE                0            0.000000             31
LIB_PROFESSION                   2801            6.658742            680
VILLE                           14347           34.106740           3308
LIB_GOUVERNORAT                 26769           63.637228             24
VILLE_GOUVERNORAT                   0            0.000000           3979
AGE                       

Geographic Columns

In [30]:
# Analyze location data
geo_cols = ['VILLE', 'LIB_GOUVERNORAT', 'VILLE_GOUVERNORAT']

for col in geo_cols:
    print(f"\n{col} Analysis:")
    print("Missing values:", df[col].isnull().sum())
    print("Unique values:", df[col].nunique())
    
    if df[col].nunique() < 20:  # Show all if few unique values
        print("Value counts:\n", df[col].value_counts())
    else:
        print("Top 10 values:\n", df[col].value_counts().head(10))


VILLE Analysis:
Missing values: 14347
Unique values: 3308
Top 10 values:
 VILLE
GABES          1704
SFAX           1220
TUNIS          1179
BIZERTE         887
SOUSSE          673
ARIANA          646
MONASTIR        549
BEJA            540
BEN AROUS       514
SIDI BOUZID     498
Name: count, dtype: int64

LIB_GOUVERNORAT Analysis:
Missing values: 26769
Unique values: 24
Top 10 values:
 LIB_GOUVERNORAT
TUNIS          1911
BEN AROUS      1162
BIZERTE        1035
GABES          1035
SFAX           1011
SILIANA         919
ARIANA          912
SOUSSE          746
NABEUL          734
SIDI BOUZID     727
Name: count, dtype: int64

VILLE_GOUVERNORAT Analysis:
Missing values: 0
Unique values: 3979
Top 10 values:
 VILLE_GOUVERNORAT
 -                           14300
GABES -                       1646
SFAX - SFAX                    720
TUNIS - TUNIS                  653
BIZERTE - BIZERTE              635
TUNIS -                        508
SFAX -                         500
  -                   

In [31]:
# Clean geographic columns
geo_cols = ["VILLE", 'LIB_GOUVERNORAT', 'VILLE_GOUVERNORAT']

for col in geo_cols:
    print(f"\n--- Cleaning {col} ---")
    
    # Check initial state
    print(f"Initial missing values: {df[col].isnull().sum()}")
    whitespace_count = (df[col].str.strip() == '').sum()
    print(f"Initial whitespace-only values: {whitespace_count}")
    
    # Replace whitespace-only strings with NaN first (to handle them together)
    mask_whitespace = df[col].str.strip() == ''
    df.loc[mask_whitespace, col] = np.nan
    
    # Now fill all missing values (both original NaN and converted whitespace) with 'UNKNOWN'
    df[col] = df[col].fillna('UNKNOWN')
    
    # Verify cleaning
    print(f"Final missing values: {df[col].isnull().sum()}")
    print(f"Final whitespace-only values: {(df[col].str.strip() == '').sum()}")
    
    # Show updated value counts
    if df[col].nunique() < 20:
        print(f"Value counts after cleaning:\n{df[col].value_counts()}")
    else:
        print(f"Top 10 values after cleaning:\n{df[col].value_counts().head(10)}")

# Final verification
print("\n=== FINAL SUMMARY ===")
for col in geo_cols:
    missing_final = df[col].isnull().sum()
    unknown_count = (df[col] == 'UNKNOWN').sum()
    print(f"{col}: {missing_final} missing, {unknown_count} marked as 'UNKNOWN'")


--- Cleaning VILLE ---
Initial missing values: 14347
Initial whitespace-only values: 468
Final missing values: 0
Final whitespace-only values: 0
Top 10 values after cleaning:
VILLE
UNKNOWN      14815
GABES         1704
SFAX          1220
TUNIS         1179
BIZERTE        887
SOUSSE         673
ARIANA         646
MONASTIR       549
BEJA           540
BEN AROUS      514
Name: count, dtype: int64

--- Cleaning LIB_GOUVERNORAT ---
Initial missing values: 26769
Initial whitespace-only values: 0
Final missing values: 0
Final whitespace-only values: 0
Top 10 values after cleaning:
LIB_GOUVERNORAT
UNKNOWN      26769
TUNIS         1911
BEN AROUS     1162
BIZERTE       1035
GABES         1035
SFAX          1011
SILIANA        919
ARIANA         912
SOUSSE         746
NABEUL         734
Name: count, dtype: int64

--- Cleaning VILLE_GOUVERNORAT ---
Initial missing values: 0
Initial whitespace-only values: 0
Final missing values: 0
Final whitespace-only values: 0
Top 10 values after cleaning:
VILL

In [32]:
quality_report = assess_data_quality(df)

Missing Values Analysis:
                        Missing_Count  Missing_Percentage  Unique_Values
REF_PERSONNE                        0                 0.0          42065
NOM_PRENOM                          0                 0.0          42065
DATE_NAISSANCE                      0                 0.0          18100
LIEU_NAISSANCE                      0                 0.0           5451
CODE_SEXE                           0                 0.0              3
SITUATION_FAMILIALE                 0                 0.0              5
NUM_PIECE_IDENTITE                  0                 0.0          42065
LIB_SECTEUR_ACTIVITE                0                 0.0             31
LIB_PROFESSION                      0                 0.0            681
VILLE                               0                 0.0           3308
LIB_GOUVERNORAT                     0                 0.0             25
VILLE_GOUVERNORAT                   0                 0.0           3979
AGE                       

In [33]:
df.columns

Index(['REF_PERSONNE', 'NOM_PRENOM', 'DATE_NAISSANCE', 'LIEU_NAISSANCE',
       'CODE_SEXE', 'SITUATION_FAMILIALE', 'NUM_PIECE_IDENTITE',
       'LIB_SECTEUR_ACTIVITE', 'LIB_PROFESSION', 'VILLE', 'LIB_GOUVERNORAT',
       'VILLE_GOUVERNORAT', 'AGE', 'id_length', 'SECTEUR_ACTIVITE_GROUP',
       'PROFESSION_GROUP'],
      dtype='object')

Exporting the data

In [37]:
import pickle

In [40]:
path = '../../data'
df.to_pickle('../../data/clients_phy.pkl')