In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_excel('../../data/Donn_es_Assurance_S2.1.xlsx',sheet_name='personne_morale')

dataframe shape

In [3]:
df.shape

(747, 8)

checking for duplicates

In [4]:
df.duplicated().sum()

np.int64(0)

checking for missing values

In [5]:
df.isna().sum()

REF_PERSONNE              0
RAISON_SOCIALE            0
MATRICULE_FISCALE         0
LIB_SECTEUR_ACTIVITE      0
LIB_ACTIVITE              8
VILLE                   427
LIB_GOUVERNORAT         481
VILLE_GOUVERNORAT         0
dtype: int64

Missing values in VILLE and LIB_GOUVERNORAT generally does not affect our recommendation system.

In [6]:
df.nunique()

REF_PERSONNE            747
RAISON_SOCIALE          747
MATRICULE_FISCALE       747
LIB_SECTEUR_ACTIVITE     61
LIB_ACTIVITE            251
VILLE                   159
LIB_GOUVERNORAT          19
VILLE_GOUVERNORAT       167
dtype: int64

* REF_PERSONNE, RAISON_SOCIALE, MATRICULE_FISCALE can be considered as unique identifiers
* we have 61 unique LIB_SECTEUR_ACTIVITE and 251 LIB_ACTIVITE (we need to check if some categories are well encoded)


Columns dtypes

In [7]:
df.dtypes

REF_PERSONNE             int64
RAISON_SOCIALE          object
MATRICULE_FISCALE       object
LIB_SECTEUR_ACTIVITE    object
LIB_ACTIVITE            object
VILLE                   object
LIB_GOUVERNORAT         object
VILLE_GOUVERNORAT       object
dtype: object

All columns are strings except REF_PERSONNE which is numeric

Columns Analysis:
* personne_morale: represents legal entities or companies
* REF_PERSONNE: id/reference code for each company.
* RAISON_SOCIALE: the official registered name of the company (This is the legal name under which the company operates)
* MATRICULE_FISCALE: tax identification number or fiscal registration number assigned to the company by the tax authority. (used for tax reporting and official financial transactions)
* LIB_SECTEUR_ACTIVITE: the sector of activity in which the company operates
* LIB_ACTIVITE: a more detailed description of the company's specific activity or business.
* VILLE: The city where the company is located or registered
* LIB_GOUVERNORAT: The governorate where the company is located
* VILLE_GOUVERNORAT: This represents the city within the governorate.

We can deal with the 8 missing values in LIB_ACTIVITE with LIB_SECTEUR_ACTIVITE to preserve meaningful information.

In [8]:
df[df['LIB_ACTIVITE'].isnull()]

Unnamed: 0,REF_PERSONNE,RAISON_SOCIALE,MATRICULE_FISCALE,LIB_SECTEUR_ACTIVITE,LIB_ACTIVITE,VILLE,LIB_GOUVERNORAT,VILLE_GOUVERNORAT
89,98018,Societe_000090,0000090G,INDUSTRIE,,,,-
150,395189,Societe_000151,0000151R,AUCUN,,,,-
213,550886,Societe_000214,0000214K,ACTIVITE SPORTIVE,,,,-
222,553109,Societe_000223,0000223B,ACTIVITE SPORTIVE,,,,-
225,557042,Societe_000226,0000226I,ÉDUCATION,,,,-
227,557111,Societe_000228,0000228X,ACTIVITE SPORTIVE,,,,-
489,557149,Societe_000490,0000490Y,AUCUN,,Haffouz,ARIANA,Haffouz - ARIANA
679,557052,Societe_000680,0000680U,POSTES ET TÉLÉCOMMUNICATIONS,,Berge Du Lac,TUNIS,Berge Du Lac - TUNIS


In [9]:
df['LIB_ACTIVITE'].fillna(df['LIB_SECTEUR_ACTIVITE'],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LIB_ACTIVITE'].fillna(df['LIB_SECTEUR_ACTIVITE'],inplace=True)


In [10]:
df.nunique()

REF_PERSONNE            747
RAISON_SOCIALE          747
MATRICULE_FISCALE       747
LIB_SECTEUR_ACTIVITE     61
LIB_ACTIVITE            256
VILLE                   159
LIB_GOUVERNORAT          19
VILLE_GOUVERNORAT       167
dtype: int64

EDA for activity/sector columns

Questions:
* How many unique sectors and activities exist?
* Are these sectors with very few activities(or vice versa)?

In [11]:
print('Unique Sectors:',df['LIB_SECTEUR_ACTIVITE'].nunique())
print('Unique Actvities:',df['LIB_ACTIVITE'].nunique())

Unique Sectors: 61
Unique Actvities: 256


In [12]:
# Frequency distribution
print('\nTop 10 sectors:')
print(df['LIB_SECTEUR_ACTIVITE'].value_counts().head(10))

print('\nTop 10 Activities:')
print(df['LIB_ACTIVITE'].value_counts().head(10))


Top 10 sectors:
LIB_SECTEUR_ACTIVITE
AUCUN                                              93
COMMERCE DE GROS ET INTERMÉDIAIRES DU COMMERCE     51
ACTIVITES IARD TARIFIABLES                         50
SERVICES FOURNIS PRINCIPALEMENT AUX ENTREPRISES    41
CONSTRUCTION                                       38
ACTIVITÉS INFORMATIQUES                            29
ACTIVITÉS IMMOBILIÈRES                             29
HÔTELS ET RESTAURANTS                              29
AGRICULTURE, CHASSE, SERVICES ANNEXES              28
SANTÉ ET ACTION SOCIALE                            25
Name: count, dtype: int64

Top 10 Activities:
LIB_ACTIVITE
NON DEFINI                                      91
PROMOTION IMMOBILIÈRE DE LOGEMENTS              24
AUXILIAIRES D ASSURANCE                         16
CULTURE DE CÉRÉALES ; CULTURES INDUSTRIELLES    12
AGENCES ET BUREAUX                              12
BANQUES                                         12
CONSEIL POUR LES AFFAIRES ET LA GESTION         12
AUTRES

As we can see, we have many companies whose Sector and activities are not defined, since we don't know any information about them we can't recommend them anything, they will be filtered.

In [13]:
filtered_df = df[
    (df['LIB_SECTEUR_ACTIVITE']!='AUCUN') &
    (df['LIB_ACTIVITE']!='NON DEFINI')
]
print('original shape',df.shape)
print('Filtered shape:',filtered_df.shape)

original shape (747, 8)
Filtered shape: (654, 8)


In [14]:
# Frequency distribution of filtered_df
print('\nTop 10 sectors:')
print(filtered_df['LIB_SECTEUR_ACTIVITE'].value_counts().head(10))

print('\nTop 10 Activities:')
print(filtered_df['LIB_ACTIVITE'].value_counts().head(10))


Top 10 sectors:
LIB_SECTEUR_ACTIVITE
COMMERCE DE GROS ET INTERMÉDIAIRES DU COMMERCE     51
ACTIVITES IARD TARIFIABLES                         50
SERVICES FOURNIS PRINCIPALEMENT AUX ENTREPRISES    41
CONSTRUCTION                                       38
ACTIVITÉS INFORMATIQUES                            29
ACTIVITÉS IMMOBILIÈRES                             29
HÔTELS ET RESTAURANTS                              29
AGRICULTURE, CHASSE, SERVICES ANNEXES              28
SANTÉ ET ACTION SOCIALE                            25
TRANSPORTS TERRESTRES                              23
Name: count, dtype: int64

Top 10 Activities:
LIB_ACTIVITE
PROMOTION IMMOBILIÈRE DE LOGEMENTS              24
AUXILIAIRES D ASSURANCE                         16
AGENCES ET BUREAUX                              12
BANQUES                                         12
CONSEIL POUR LES AFFAIRES ET LA GESTION         12
CULTURE DE CÉRÉALES ; CULTURES INDUSTRIELLES    12
AUTRES ACTIVITÉS SPORTIVES                      11
CONSTR

After filtering these rows, we can see that we don't have any undefined values in both columns, which means that rows which had undefined activities also had undefined sectors.

But we have some rows with missing Secteur and activite (decoded as empty string) we need to deal with them.We can replace them with activities

In [15]:
empty_mask = (
    filtered_df['LIB_SECTEUR_ACTIVITE'].str.strip().eq('') 
)
print(f"Number of empty string decoded sectors: {empty_mask.sum()}")
print('Sample rows with empty sectors:')
print(filtered_df[empty_mask])

Number of empty string decoded sectors: 10
Sample rows with empty sectors:
     REF_PERSONNE  RAISON_SOCIALE MATRICULE_FISCALE LIB_SECTEUR_ACTIVITE  \
108        250773  Societe_000109          0000109J                        
142        372953  Societe_000143          0000143Y                        
144        373499  Societe_000145          0000145O                        
146        373506  Societe_000147          0000147S                        
166        423216  Societe_000167          0000167J                        
189        447730  Societe_000190          0000190M                        
581        505957  Societe_000582          0000582Q                        
674        498213  Societe_000675          0000675V                        
675        500075  Societe_000676          0000676M                        
710        435481  Societe_000711          0000711Q                        

              LIB_ACTIVITE          VILLE LIB_GOUVERNORAT  \
108  SERVICE INFORMATIQUE  

In [16]:
filtered_df['LIB_SECTEUR_ACTIVITE'] = filtered_df.apply(
    lambda row: row['LIB_ACTIVITE'] if empty_mask[row.name] and pd.notna(row['LIB_ACTIVITE']) else row['LIB_SECTEUR_ACTIVITE'],
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['LIB_SECTEUR_ACTIVITE'] = filtered_df.apply(


we also need to apply the empty mask to LIB_ACTIVITE if there are empty string decoded values.

In [17]:
empty_mask_2 = (
    filtered_df['LIB_ACTIVITE'].str.strip().eq('') 
)
print(f"Number of empty string decoded sectors: {empty_mask_2.sum()}")
print('Sample rows with empty sectors:')
print(filtered_df[empty_mask_2])

Number of empty string decoded sectors: 0
Sample rows with empty sectors:
Empty DataFrame
Columns: [REF_PERSONNE, RAISON_SOCIALE, MATRICULE_FISCALE, LIB_SECTEUR_ACTIVITE, LIB_ACTIVITE, VILLE, LIB_GOUVERNORAT, VILLE_GOUVERNORAT]
Index: []


We can now map activities to sector, to check for any illogical mapping.

In [18]:
sector_activity_map = filtered_df.groupby('LIB_SECTEUR_ACTIVITE')['LIB_ACTIVITE'].unique().reset_index()
for _, row in sector_activity_map.head().iterrows():
    print(f"Sector:'{row['LIB_SECTEUR_ACTIVITE']}")
    print(f"Activities:{', '.join(row['LIB_ACTIVITE'])}\n")

Sector:'ACTIVITE SPORTIVE
Activities:ACTIVITE SPORTIVE

Sector:'ACTIVITES IARD TARIFIABLES
Activities:QUINCAILLERIES (COMMERCE), ELECTRICITE (COMMERCE D ARTICLES D ECLAIRAGE), HYGIENE (ETABLISSEMENT DE BAIN DOUCHE HAMMAM SAUNA), IMPRIMERIE, TYPOGRAPHIES ET OFFSET, AGENCE ( DE VOYAGE,ASSURANCE,IMMOBILIERE), ECOLES, COLLEGES, CENTRE DE FORMATION, SALLE DE SPECTACLE OU DE FETE, BOULANGERIE AVEC FOURS AU FUEL (SANS FABRICATION DE BISCUITS ET DE GATEAUX SECS), ELECTRICIENS (REPARATION ET VENTE DE FOURNITURES POUR INSTALLATIONS ELECTRIQUE A L EXEPTION DU MATERIEL TRAITE SOUS LA RUBRIQUE -RADIO TV- ), ABBATOIRS ET DEPENDANCES(AVEC CHAMBRE FRIGORIFIQUE D UNE CAPACITE < 120 M3), VERRERIE, VAISSELLES (MAGASIN DE VENTE), PHARMACIENS (VENTE EN DETAIL), BATIMENTS : ENTREPRENEURS DE MACONNERIE ET DE CONSTRUCTION, ALIMENTATION GENERALE (COMMERCE), LINGE DE MAISON (MAGASIN DE VENTE), MATIERES PLASTIQUES  (COMMERCE D ARTICLES EN), MEDECIN, RESTAURANT, SELF SERVICE ET CANTINE, OPTIQUE ACOUSTIQUE AVEC VE

In [19]:
filtered_df.isna().sum()

REF_PERSONNE              0
RAISON_SOCIALE            0
MATRICULE_FISCALE         0
LIB_SECTEUR_ACTIVITE      0
LIB_ACTIVITE              0
VILLE                   373
LIB_GOUVERNORAT         424
VILLE_GOUVERNORAT         0
dtype: int64

In [20]:
filtered_df.to_pickle('../../data/clients_morales.pkl')

In [21]:
filtered_df.columns

Index(['REF_PERSONNE', 'RAISON_SOCIALE', 'MATRICULE_FISCALE',
       'LIB_SECTEUR_ACTIVITE', 'LIB_ACTIVITE', 'VILLE', 'LIB_GOUVERNORAT',
       'VILLE_GOUVERNORAT'],
      dtype='object')

In [24]:
filtered_df['LIB_SECTEUR_ACTIVITE'].nunique()

63

In [26]:
filtered_df['LIB_ACTIVITE'].unique().tolist()

['ACTIVITÉS THERMALES ET DE THALASSOTHÉRAPIE',
 'FABRICATION D EMBALLAGES EN BOIS',
 'CRÉDIT BAIL',
 'INSTALLATION D EAU ET DE GAZ',
 'BISCOTTERIE, BISCUITERIE, PÂTISSERIE DE CONSERVATION',
 'BLANCHISSERIE TEINTURERIE DE DÉTAIL',
 'EXTRACTION DE CALCAIRE INDUSTRIEL, DE GYPSE ET DE CRAIE',
 'BANQUES',
 'COMMERCE DE GROS D ÉQUIPEMENTS AUTOMOBILES',
 'AUTRES SERVICES PERSONNELS',
 'PRATIQUE MÉDICALE',
 'RESTAURATION DE TYPE RAPIDE',
 'CULTURE DE CÉRÉALES ; CULTURES INDUSTRIELLES',
 'FABRICATION D AUTRES PRODUITS PHARMACEUTIQUES',
 'CONSTRUCTION DE BÂTIMENTS DIVERS',
 'AUTRES COMMERCES DE DÉTAIL EN MAGASIN NON SPÉCIALISÉ',
 'FABRICATION D AUTRES PRODUITS CHIMIQUES INORGANIQUES DE BASE',
 'FABRICATION D ALIMENTS ADAPTÉS À L ENFANT ET DIÉTÉTIQUES',
 'TRANSPORT DE VOYAGEURS PAR TAXIS',
 'AUTRES ACTIVITÉS DE TRAVAIL DES GRAINS',
 'FABRICATION DE LAIT LIQUIDE ET DE PRODUITS FRAIS',
 'FABRICATION D AUTRES VÊTEMENTS ET ACCESSOIRES',
 'HORTICULTURE ; PÉPINIÈRES',
 'ÉLEVAGE D OVINS, CAPRINS ET ÉQUI

Grouping Lib_secteur_activite

In [41]:
def group_secteur_activite(secteur):
    """Group sectors into meaningful categories for insurance recommendations"""
    if pd.isna(secteur):
        return 'AUTRES_SECTEURS'
    
    secteur = str(secteur).upper()
    
    # Primary sectors with high physical risk
    if any(word in secteur for word in ['CONSTRUCTION', 'BATIMENT', 'TRAVAIL', 'INDUSTRIE', 'USINE', 'METALLURGIE', 
                                      'EXTRACTION', 'MINERAIS', 'CHIMIQUE', 'CAOUTCHOUC', 'PLASTIQUES', 'TEXTILE']):
        return 'INDUSTRIE_ET_CONSTRUCTION'
    
    # Transportation and logistics
    if any(word in secteur for word in ['TRANSPORT', 'LOGISTIQUE', 'PORTUAIRE', 'AÉRIEN', 'ROUTIER', 'FERROVIAIRE']):
        return 'TRANSPORTS_ET_LOGISTIQUE'
    
    # Commerce and retail
    if any(word in secteur for word in ['COMMERCE', 'VENTE', 'DÉTAIL', 'GROS', 'MAGASIN', 'SUPERMARCHÉ']):
        return 'COMMERCE_ET_VENTE'
    
    # Services sector
    if any(word in secteur for word in ['SERVICES', 'SERVICE', 'CONSEIL', 'CONSULTING', 'INFORMATIQUE', 'INFORMATIQUES',
                                      'IMMOBILIER', 'FINANCIER', 'ASSURANCE', 'BANQUE', 'COMPTABLE', 'JURIDIQUE']):
        return 'SERVICES_ET_CONSEIL'
    
    # Healthcare and social services
    if any(word in secteur for word in ['SANTÉ', 'MÉDICAL', 'HÔPITAL', 'CLINIQUE', 'PHARMACEUTIQUE', 'SOCIAL', 'ACTION SOCIALE']):
        return 'SANTÉ_ET_SOCIAL'
    
    # Education and research
    if any(word in secteur for word in ['ÉDUCATION', 'ENSEIGNEMENT', 'FORMATION', 'RECHERCHE', 'DÉVELOPPEMENT', 'UNIVERSITÉ']):
        return 'EDUCATION_ET_RECHERCHE'
    
    # Agriculture and natural resources
    if any(word in secteur for word in ['AGRICULTURE', 'PÊCHE', 'AQUACULTURE', 'FORESTIER', 'SYLVICULTURE', 'ÉLEVAGE', 'CHASSE']):
        return 'AGRICULTURE_ET_RESSOURCES'
    
    # Hospitality and tourism
    if any(word in secteur for word in ['HÔTEL', 'RESTAURANT', 'TOURISME', 'HÔTELLERIE', 'RESTAURATION', 'VOYAGE']):
        return 'HOTELLERIE_ET_TOURISME'
    
    # Public sector and administration
    if any(word in secteur for word in ['ADMINISTRATION', 'PUBLIQUE', 'DÉFENSE', 'SÉCURITÉ', 'PUBLIC', 'GOUVERNEMENT']):
        return 'ADMINISTRATION_PUBLIQUE'
    
    # Energy and utilities
    if any(word in secteur for word in ['ÉLECTRICITÉ', 'GAZ', 'ÉNERGIE', 'EAU', 'ASSAINISSEMENT', 'UTILITIES']):
        return 'ENERGIE_ET_UTILITIES'
    
    return 'AUTRES_SECTEURS'

# Apply the grouping
filtered_df['SECTEUR_GROUP'] = filtered_df['LIB_SECTEUR_ACTIVITE'].apply(group_secteur_activite)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['SECTEUR_GROUP'] = filtered_df['LIB_SECTEUR_ACTIVITE'].apply(group_secteur_activite)


In [42]:
filtered_df['SECTEUR_GROUP'].nunique()

11

Grouping LIB_Activite

In [43]:
def group_activite(activite):
    """Group activities into meaningful categories"""
    if pd.isna(activite):
        return 'AUTRES_ACTIVITES'
    
    activite = str(activite).upper()
    
    # Manufacturing and production
    if any(word in activite for word in ['FABRICATION', 'PRODUCTION', 'USINE', 'INDUSTRIE', 'MANUFACTURIER', 
                                       'ASSEMBLAGE', 'TRANSFORMATION', 'CONDITIONNEMENT']):
        return 'PRODUCTION_ET_FABRICATION'
    
    # Construction and building
    if any(word in activite for word in ['CONSTRUCTION', 'BÂTIMENT', 'MAÇONNERIE', 'TERRASSEMENT', 'IMMOBILIER', 
                                       'PROMOTION', 'TRAVAUX', 'CHANTIER']):
        return 'CONSTRUCTION_ET_BTP'
    
    # Commerce and sales
    if any(word in activite for word in ['COMMERCE', 'VENTE', 'MAGASIN', 'DÉTAIL', 'GROS', 'SUPERMARCHÉ', 
                                       'ALIMENTATION', 'QUINCAILLERIE', 'DISTRIBUTION']):
        return 'COMMERCE_ET_DISTRIBUTION'
    
    # Services and consulting
    if any(word in activite for word in ['SERVICE', 'CONSEIL', 'CONSULTING', 'ÉTUDE', 'INGÉNIERIE', 'FORMATION',
                                       'COMPTABLE', 'JURIDIQUE', 'CONSEIL', 'ASSISTANCE']):
        return 'SERVICES_ET_CONSEIL'
    
    # Transportation and logistics
    if any(word in activite for word in ['TRANSPORT', 'LOGISTIQUE', 'LIVRAISON', 'MANUTENTION', 'PORTUAIRE',
                                       'AÉRIEN', 'ROUTIER', 'MARCHANDISES']):
        return 'TRANSPORTS_ET_LOGISTIQUE'
    
    # Healthcare and medical
    if any(word in activite for word in ['MÉDICAL', 'SANTÉ', 'HÔPITAL', 'CLINIQUE', 'PHARMACEUTIQUE', 'LABORATOIRE',
                                       'SOINS', 'THÉRAPIE', 'MÉDECIN', 'INFIRMIER']):
        return 'SANTÉ_ET_MÉDICAL'
    
    # Finance and insurance
    if any(word in activite for word in ['FINANCIER', 'BANQUE', 'ASSURANCE', 'CRÉDIT', 'INVESTISSEMENT', 
                                       'RÉASSURANCE', 'CAPITALISATION']):
        return 'FINANCE_ET_ASSURANCE'
    
    # Hospitality and food services
    if any(word in activite for word in ['HÔTEL', 'RESTAURANT', 'RESTAURATION', 'CAFÉ', 'BAR', 'HÔTELLERIE',
                                       'GASTRONOMIE', 'CUISINE', 'BOULANGERIE', 'PÂTISSERIE']):
        return 'HOTELLERIE_ET_RESTAURATION'
    
    # Information technology
    if any(word in activite for word in ['INFORMATIQUE', 'LOGICIEL', 'TÉLÉCOMMUNICATION', 'INTERNET', 'DONNÉES',
                                       'SYSTÈME', 'RÉSEAU', 'PROGRAMMATION']):
        return 'TECHNOLOGIE_ET_INFORMATIQUE'
    
    # Education and training
    if any(word in activite for word in ['ÉDUCATION', 'ENSEIGNEMENT', 'FORMATION', 'ÉCOLE', 'UNIVERSITÉ', 
                                       'APPRENTISSAGE', 'PÉDAGOGIE']):
        return 'EDUCATION_ET_FORMATION'
    
    # Agriculture and food production
    if any(word in activite for word in ['AGRICULTURE', 'ÉLEVAGE', 'CULTURE', 'PÊCHE', 'AQUACULTURE', 'FORESTIER',
                                       'MARAÎCHAGE', 'HORTICULTURE', 'VITICULTURE']):
        return 'AGRICULTURE_ET_AGROALIMENTAIRE'
    
    # Public services and administration
    if any(word in activite for word in ['ADMINISTRATION', 'PUBLIQUE', 'GOUVERNEMENT', 'DÉFENSE', 'SÉCURITÉ',
                                       'POLICE', 'DOUANE', 'FISCAL']):
        return 'ADMINISTRATION_PUBLIQUE'
    
    return 'AUTRES_ACTIVITES'

# Apply the grouping
filtered_df['ACTIVITE_GROUP'] = filtered_df['LIB_ACTIVITE'].apply(group_activite)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ACTIVITE_GROUP'] = filtered_df['LIB_ACTIVITE'].apply(group_activite)


In [44]:
filtered_df['ACTIVITE_GROUP'].nunique()

13

In [45]:
filtered_df.columns

Index(['REF_PERSONNE', 'RAISON_SOCIALE', 'MATRICULE_FISCALE',
       'LIB_SECTEUR_ACTIVITE', 'LIB_ACTIVITE', 'VILLE', 'LIB_GOUVERNORAT',
       'VILLE_GOUVERNORAT', 'SECTEUR_GROUP', 'ACTIVITE_GROUP'],
      dtype='object')

Business Client Scoring System

In [None]:
import numpy as np

def calculate_business_client_scores(df_contrats, df_personne_morale):
    
    # First, group the business sectors and activities
    df_personne_morale = create_business_groups(df_personne_morale)
    
    # Calculate contract-based metrics (same as for individuals)
    client_metrics = df_contrats.groupby('REF_PERSONNE').agg(
        total_contracts=('NUM_CONTRAT', 'count'),
        active_contracts=('LIB_ETAT_CONTRAT', lambda x: (x == 'EN COURS').sum()),
        product_variety=('LIB_PRODUIT', 'nunique'),
        branch_variety=('branche', 'nunique'),
        total_premiums_paid=('somme_quittances', 'sum'),
        avg_premium_per_contract=('somme_quittances', 'mean'),
        total_capital_assured=('Capital_assure', 'sum'),
        paid_ratio=('statut_paiement', lambda x: (x == 'Payé').mean()),
        total_paid_contracts=('statut_paiement', lambda x: (x == 'Payé').sum()),
        canceled_contracts=('LIB_ETAT_CONTRAT', lambda x: (x == 'RESILIE').sum()),
    ).reset_index()

    # Calculate component scores
    client_metrics['loyalty_score'] = (
        (client_metrics['total_contracts'] / client_metrics['total_contracts'].max() * 30) +
        (client_metrics['product_variety'] / client_metrics['product_variety'].max() * 25) +
        (client_metrics['branch_variety'] / client_metrics['branch_variety'].max() * 20) +
        (client_metrics['active_contracts'] / client_metrics['total_contracts'].clip(lower=1) * 25)
    )
    
    client_metrics['financial_score'] = (
        (client_metrics['total_premiums_paid'] / client_metrics['total_premiums_paid'].max() * 45) +
        (client_metrics['avg_premium_per_contract'] / client_metrics['avg_premium_per_contract'].max() * 30) +
        (client_metrics['total_capital_assured'] / client_metrics['total_capital_assured'].max() * 25) 
    )
    
    client_metrics['payment_score'] = (
        (client_metrics['paid_ratio'] * 40) +
        ((1 - client_metrics['canceled_contracts'] / client_metrics['total_contracts'].clip(lower=1)) * 30) +
        (client_metrics['total_paid_contracts'] / client_metrics['total_contracts'].clip(lower=1) * 30)
    )
    
    # Normalize scores
    for score_col in ['loyalty_score', 'financial_score', 'payment_score']:
        client_metrics[score_col] = (
            (client_metrics[score_col] - client_metrics[score_col].min()) / 
            (client_metrics[score_col].max() - client_metrics[score_col].min()) * 100
        ).fillna(0)
    
    # BUSINESS-SPECIFIC SCORING ADJUSTMENTS
    # Merge with business profile data
    df_scored_business = pd.merge(client_metrics, df_personne_morale, on='REF_PERSONNE', how='left')
    
    # Adjust scores based on business characteristics
    def adjust_business_scores(row):
        base_score = (
            row['loyalty_score'] * 0.35 +
            row['financial_score'] * 0.40 +
            row['payment_score'] * 0.25
        )
        
        # Business size adjustment (using capital as proxy)
        size_adjustment = 1.0
        if row['total_capital_assured'] > 1000000:  # Large business
            size_adjustment = 1.2
        elif row['total_capital_assured'] > 100000:  # Medium business
            size_adjustment = 1.1
        
        # Risk profile adjustment
        risk_adjustment = 1.0
        if row['RISK_PROFILE'] == 'HIGH_RISK':
            risk_adjustment = 0.9  # Higher risk businesses get slightly lower scores
        elif row['RISK_PROFILE'] == 'LOW_RISK':
            risk_adjustment = 1.1
        
        return base_score * size_adjustment * risk_adjustment
    
    df_scored_business['final_client_score'] = df_scored_business.apply(adjust_business_scores, axis=1).clip(0, 100)
    
    # Segment business clients
    def segment_business_clients(score):
        if score >= 85:
            return 'Enterprise'
        elif score >= 70:
            return 'Business'
        elif score >= 50:
            return 'SME'
        elif score >= 30:
            return 'Small Business'
        else:
            return 'Startup'
    
    df_scored_business['client_segment'] = df_scored_business['final_client_score'].apply(segment_business_clients)
    
    return df_scored_business

# Create business groups function (from previous implementation)
def create_business_groups(df_personne_morale):
    df = df_personne_morale.copy()
    
    # Apply grouping functions (simplified version)
    def group_secteur(secteur):
        secteur = str(secteur).upper()
        if any(word in secteur for word in ['CONSTRUCTION', 'INDUSTRIE', 'METALLURGIE', 'EXTRACTION']):
            return 'INDUSTRIE_ET_CONSTRUCTION'
        elif any(word in secteur for word in ['TRANSPORT', 'LOGISTIQUE']):
            return 'TRANSPORTS_ET_LOGISTIQUE'
        elif any(word in secteur for word in ['COMMERCE', 'VENTE']):
            return 'COMMERCE_ET_VENTE'
        elif any(word in secteur for word in ['SERVICES', 'CONSEIL', 'INFORMATIQUE']):
            return 'SERVICES_ET_CONSEIL'
        elif any(word in secteur for word in ['SANTÉ', 'MÉDICAL']):
            return 'SANTÉ_ET_SOCIAL'
        elif any(word in secteur for word in ['HÔTEL', 'RESTAURANT']):
            return 'HOTELLERIE_ET_TOURISME'
        elif any(word in secteur for word in ['AGRICULTURE', 'PÊCHE']):
            return 'AGRICULTURE_ET_RESSOURCES'
        else:
            return 'AUTRES_SECTEURS'
    
    def group_activite(activite):
        activite = str(activite).upper()
        if any(word in activite for word in ['FABRICATION', 'PRODUCTION']):
            return 'PRODUCTION_ET_FABRICATION'
        elif any(word in activite for word in ['CONSTRUCTION', 'BÂTIMENT']):
            return 'CONSTRUCTION_ET_BTP'
        elif any(word in activite for word in ['COMMERCE', 'VENTE']):
            return 'COMMERCE_ET_DISTRIBUTION'
        elif any(word in activite for word in ['SERVICE', 'CONSEIL']):
            return 'SERVICES_ET_CONSEIL'
        elif any(word in activite for word in ['TRANSPORT']):
            return 'TRANSPORTS_ET_LOGISTIQUE'
        else:
            return 'AUTRES_ACTIVITES'
    
    df['SECTEUR_GROUP'] = df['LIB_SECTEUR_ACTIVITE'].apply(group_secteur)
    df['ACTIVITE_GROUP'] = df['LIB_ACTIVITE'].apply(group_activite)
    
    # Risk profile
    def get_risk_profile(secteur_group, activite_group):
        high_risk = ['INDUSTRIE_ET_CONSTRUCTION', 'CONSTRUCTION_ET_BTP', 'TRANSPORTS_ET_LOGISTIQUE']
        if secteur_group in high_risk or activite_group in high_risk:
            return 'HIGH_RISK'
        elif secteur_group in ['COMMERCE_ET_VENTE', 'AGRICULTURE_ET_RESSOURCES']:
            return 'MEDIUM_RISK'
        else:
            return 'LOW_RISK'
    
    df['RISK_PROFILE'] = df.apply(lambda x: get_risk_profile(x['SECTEUR_GROUP'], x['ACTIVITE_GROUP']), axis=1)
    
    return df

Business recommendation function

In [None]:
def recommend_business_insurance(client_row, df_contrats, df_products):
    
    client_id = client_row['REF_PERSONNE']
    client_contracts = df_contrats[df_contrats['REF_PERSONNE'] == client_id]
    existing_products = set(client_contracts['LIB_PRODUIT'].unique())
    
    # Get client's existing categories
    client_portfolio = client_contracts.merge(df_products, on='LIB_PRODUIT', how='left')
    existing_categories = set(client_portfolio['LIB_SOUS_BRANCHE'].dropna().unique())
    
    # Calculate budget
    total_premiums = client_contracts['somme_quittances'].sum()
    estimated_budget = max(total_premiums * 1.5, 1000)  # Higher minimum for businesses
    
    # DETERMINE BUSINESS INSURANCE NEEDS
    priority_categories = []
    
    # Universal business needs
    base_business_needs = ['RESPONSABILITE CIVILE', 'INCENDIE RISQUES SIMPLE', 'VOL TOUTE CATEGORIES']
    priority_categories.extend(base_business_needs)
    
    # Risk-based needs
    risk_profile = client_row.get('RISK_PROFILE', 'MEDIUM_RISK')
    if risk_profile == 'HIGH_RISK':
        priority_categories.extend(['INDIVIDUELLE ACCIDENTS', 'TOUS RISQUES CHANTIER', 'BRIS DE MACHINES'])
    elif risk_profile == 'MEDIUM_RISK':
        priority_categories.extend(['INDIVIDUELLE ACCIDENTS', 'DEGATS DES EAUX'])
    
    # Sector-specific needs
    secteur_group = client_row.get('SECTEUR_GROUP', '')
    activite_group = client_row.get('ACTIVITE_GROUP', '')
    
    if secteur_group == 'TRANSPORTS_ET_LOGISTIQUE':
        priority_categories.extend(['TRANSPORT FACULTE TERRESTRE', 'ASSISTANCE DES VEHICULES'])
    elif secteur_group == 'SANTÉ_ET_SOCIAL':
        priority_categories.extend(['R.C MEDECIN', 'R.C PARAMEDICALE'])
    elif secteur_group == 'COMMERCE_ET_VENTE':
        priority_categories.extend(['VOL AVEC EFFRACTION DES MARCHANDISES', 'DEGATS DES EAUX'])
    elif secteur_group == 'HOTELLERIE_ET_TOURISME':
        priority_categories.extend(['MULTIRISQUE HOTELIER', 'ASSISTANCE EN VOYAGES'])
    elif secteur_group == 'INDUSTRIE_ET_CONSTRUCTION':
        priority_categories.extend(['BRIS DE MACHINES', 'TOUS RISQUES CHANTIER', 'RESPONSABILITE DECENNALE'])
    elif secteur_group == 'AGRICULTURE_ET_RESSOURCES':
        priority_categories.extend(['INCENDIE RISQUES AGRICOLES', 'INDIVIDUELLE ACCIDENTS'])
    
    # Size-based needs (using capital as proxy)
    total_capital = client_row.get('total_capital_assured', 0)
    if total_capital > 500000:  # Large business
        priority_categories.extend(['PERTES D EXPLOITATIONS APRES INCENDIE', 'MULTIRISQUES PROFESSIONNELLES'])
    
    # Remove existing categories
    priority_categories = list(set(priority_categories) - existing_categories)
    
    if not priority_categories:
        return []
    
    # SELECT PRODUCTS FROM PRIORITY CATEGORIES
    recommended_products = []
    
    # Business product priority mapping
    business_product_priority = {
        'RESPONSABILITE CIVILE': ['RC ENTREPRISE DE BATIMENT ET TRAVAUX PUBLIC', 'RC ARTISANTS ET COMMERCANTS', 
                                 'RC HOTELIERS', 'R.C PARTICULIER-CHEF DE FAMILLE- MAITRE DE MAISON'],
        'INCENDIE RISQUES SIMPLE': ['INCENDIE RISQUES SIMPLE', 'INCENDIE RISQUES SIMPLE CENTRALISE'],
        'VOL TOUTE CATEGORIES': ['VOL TOUTE CATEGORIES', 'VOL AVEC EFFRACTION DES MARCHANDISES DE TOUTE NATURE'],
        'INDIVIDUELLE ACCIDENTS': ['INDIVIDUELLE ACCIDENTS', 'INDIVIDUELLE ACCIDENTS ASSOCIE AU CONTRAT AUTO'],
        'TOUS RISQUES CHANTIER': ['TOUS RISQUES CHANTIER'],
        'BRIS DE MACHINES': ['BRIS DE MACHINES'],
        'DEGATS DES EAUX': ['DEGATS DES EAUX'],
        'TRANSPORT FACULTE TERRESTRE': ['POLICE AU VOYAGE(FACULTE TERRESTRE)', 'POLICE ABONNEMENT(FACULTE TERRESTRE)'],
        'ASSISTANCE DES VEHICULES': ['ASSISTANCE DES VEHICULES'],
        'R.C MEDECIN': ['R.C MEDECIN'],
        'R.C PARAMEDICALE': ['R.C PARAMEDICALE'],
        'MULTIRISQUE HOTELIER': ['MULTIRISQUE HOTELIER'],
        'ASSISTANCE EN VOYAGES': ['ASSISTANCES EN VOYAGES - PLAN BUSINESS', 'ASSISTANCES EN VOYAGES - PLAN GOLDEN'],
        'RESPONSABILITE DECENNALE': ['RESPONSABILITE DECENNALE'],
        'INCENDIE RISQUES AGRICOLES': ['INCENDIE RISQUES AGRICOLES'],
        'PERTES D EXPLOITATIONS APRES INCENDIE': ['PERTES D EXPLOITATION APRES INCENDIE'],
        'MULTIRISQUES PROFESSIONNELLES': ['MULTIRISQUES PROFESSIONNELLES', 'MULTIRISQUES PROFESSIONNELLES CENTRALISE']
    }
    
    # Get top 2 priority categories
    top_categories = priority_categories[:2]
    
    for category in top_categories:
        if category in business_product_priority:
            available_products = df_products[df_products['LIB_SOUS_BRANCHE'] == category]['LIB_PRODUIT'].unique()
            
            # Try to get priority products for this category
            for priority_product in business_product_priority[category]:
                if priority_product in available_products and priority_product not in existing_products:
                    recommended_products.append(priority_product)
                    break
            else:
                # If no priority product found, take the first available
                if len(available_products) > 0 and available_products[0] not in existing_products:
                    recommended_products.append(available_products[0])
    
    return recommended_products[:2]  # Return max 2 products

Complete endpoint for business clients

In [None]:
def business_recommendation_pipeline(df_contrats, df_personne_morale, df_products):

    business_client_ids = df_personne_morale['REF_PERSONNE'].unique()
    df_business_contrats = df_contrats[df_contrats['REF_PERSONNE'].isin(business_client_ids)]
    # Step 1: Calculate business client scores
    print("Calculating business client scores...")
    df_scored_business = calculate_business_client_scores(df_business_contrats, df_personne_morale)
    
    # Step 2: Generate recommendations
    print("Generating business recommendations...")
    recommendations = []
    
    for _, client_row in df_scored_business.iterrows():
        client_recommendations = recommend_business_insurance(client_row, df_contrats, df_products)
        
        recommendations.append({
            'REF_PERSONNE': client_row['REF_PERSONNE'],
            'RAISON_SOCIALE': client_row.get('RAISON_SOCIALE', ''),
            'SECTEUR_GROUP': client_row.get('SECTEUR_GROUP', ''),
            'ACTIVITE_GROUP': client_row.get('ACTIVITE_GROUP', ''),
            'RISK_PROFILE': client_row.get('RISK_PROFILE', ''),
            'recommended_products': client_recommendations,
            'recommendation_count': len(client_recommendations),
            'client_score': client_row.get('final_client_score', 0),
            'client_segment': client_row.get('client_segment', ''),
            'total_premiums_paid': client_row.get('total_premiums_paid', 0),
            'total_capital_assured': client_row.get('total_capital_assured', 0)
        })
    
    df_recommendations = pd.DataFrame(recommendations)
    
    # Step 3: Analysis and reporting
    print("\nBusiness Recommendation Summary:")
    print(f"Total businesses processed: {len(df_scored_business)}")
    print(f"Businesses with recommendations: {(df_recommendations['recommendation_count'] > 0).sum()}")
    print(f"Average business score: {df_recommendations['client_score'].mean():.2f}")
    
    print("\nBusiness Segment Distribution:")
    print(df_recommendations['client_segment'].value_counts())
    
    print("\nRisk Profile Distribution:")
    print(df_recommendations['RISK_PROFILE'].value_counts())
    
    print("\nTop Recommended Products:")
    top_products = df_recommendations.explode('recommended_products')['recommended_products'].value_counts().head(5)
    for product, count in top_products.items():
        print(f"  - {product}: {count} recommendations")
    
    return df_recommendations

In [54]:
df_contrats = pd.read_pickle('../../data/contrats.pkl')
df_personne_morale = filtered_df.copy()
df_products = pd.read_excel('../../data/Donn_es_Assurance_S2.1.xlsx',sheet_name='Mapping_Produits')

In [58]:
df_business_recommendations = business_recommendation_pipeline(df_contrats, df_personne_morale, df_products)

Calculating business client scores...
Generating business recommendations...

Business Recommendation Summary:
Total businesses processed: 437
Businesses with recommendations: 436
Average business score: 22.67

Business Segment Distribution:
client_segment
Startup           311
Small Business    112
SME                10
Enterprise          3
Business            1
Name: count, dtype: int64

Risk Profile Distribution:
RISK_PROFILE
LOW_RISK       249
HIGH_RISK      130
MEDIUM_RISK     58
Name: count, dtype: int64

Top Recommended Products:
  - RC ENTREPRISE DE BATIMENT ET TRAVAUX PUBLIC: 384 recommendations
  - INCENDIE RISQUES SIMPLE: 218 recommendations
  - POLICE AU VOYAGE(FACULTE TERRESTRE): 29 recommendations
  - TOUS RISQUES CHANTIER: 14 recommendations
  - MULTIRISQUES PROFESSIONNELLES: 12 recommendations


In [56]:
df_business_recommendations.shape

(437, 11)

In [57]:
df_business_recommendations.head()

Unnamed: 0,REF_PERSONNE,RAISON_SOCIALE,SECTEUR_GROUP,ACTIVITE_GROUP,RISK_PROFILE,recommended_products,recommendation_count,client_score,client_segment,total_premiums_paid,total_capital_assured
0,6958,Societe_000011,COMMERCE_ET_VENTE,COMMERCE_ET_DISTRIBUTION,MEDIUM_RISK,"[RC ENTREPRISE DE BATIMENT ET TRAVAUX PUBLIC, ...",2,23.770715,Startup,513.0,12000.0
1,11062,Societe_000744,AUTRES_SECTEURS,AUTRES_ACTIVITES,LOW_RISK,[RC ENTREPRISE DE BATIMENT ET TRAVAUX PUBLIC],1,27.500238,Startup,2357.175,18800.0
2,12068,Societe_000012,SERVICES_ET_CONSEIL,SERVICES_ET_CONSEIL,LOW_RISK,[RC ENTREPRISE DE BATIMENT ET TRAVAUX PUBLIC],1,43.648254,Small Business,7604.18,29500.0
3,12112,Societe_000743,AUTRES_SECTEURS,AUTRES_ACTIVITES,LOW_RISK,[PERTES D EXPLOITATION APRES INCENDIE],1,81.172363,Business,4077677.0,23939220.0
4,12122,Societe_000001,SERVICES_ET_CONSEIL,AUTRES_ACTIVITES,LOW_RISK,[RC ENTREPRISE DE BATIMENT ET TRAVAUX PUBLIC],1,0.000516,Startup,5106.297,22100.0
