In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import shap

class FinancialExclusionAnalysis:
    """
    Pipeline that derives a financial-exclusion label and trains models on it.

    Typical use: ``preprocess_data()`` -> ``cluster_financial_exclusion()`` ->
    ``train_classification_models()``.

    Attributes:
        raw_data (pd.DataFrame): Data read from ``data_path``. Missing values
            are imputed in place by ``preprocess_data``.
        processed_data (pd.DataFrame | None): PCA components plus the derived
            score, cluster and label columns; ``None`` until
            ``preprocess_data`` has run.
        education_col (str): Column holding the education level.
        income_col (str): Column holding the income quintile.
    """

    # Columns derived from the features; they must never be used as features.
    _DERIVED_COLUMNS = ['exclusion_cluster', 'financial_exclusion_label']

    # Categorical columns with more distinct values than this are label-encoded
    # instead of one-hot encoded.
    _HIGH_CARDINALITY_THRESHOLD = 10

    def __init__(self, data_path, education_col='education', income_col='quintile'):
        """
        Initialize the financial exclusion analysis pipeline

        Args:
            data_path (str): Path to the CSV file containing financial inclusion data
            education_col (str): Name of the education-level column used by the
                literacy score.
            income_col (str): Name of the income-quintile column used by the
                literacy score.
        """
        self.raw_data = pd.read_csv(data_path)
        self.processed_data = None
        self.education_col = education_col
        self.income_col = income_col

    def _require_processed_data(self):
        """Return ``processed_data`` or raise if ``preprocess_data`` has not run."""
        if self.processed_data is None:
            raise RuntimeError("Call preprocess_data() before this step")
        return self.processed_data

    def preprocess_data(self):
        """
        Comprehensive data preprocessing pipeline

        Imputes ``raw_data`` in place, encodes and scales the features, reduces
        them with PCA (95% of variance) and stores the result, together with
        the engagement and literacy scores, in ``processed_data``.

        Raises:
            ValueError: If no numeric or categorical column contains any data.
        """
        data = self.raw_data

        # 1. Handle Missing Values
        # Columns that are entirely missing are skipped: the imputer would drop
        # them and the assignment back into the frame would no longer line up.
        numeric_columns = [
            col for col in data.select_dtypes(include=['int64', 'float64']).columns
            if data[col].notna().any()
        ]
        categorical_columns = [
            col for col in data.select_dtypes(include=['object']).columns
            if data[col].notna().any()
        ]
        if not numeric_columns and not categorical_columns:
            raise ValueError("No numeric or categorical columns with data to preprocess")

        # Numeric columns: median imputation
        if numeric_columns:
            numeric_imputer = SimpleImputer(strategy='median')
            data[numeric_columns] = numeric_imputer.fit_transform(data[numeric_columns])

        # Categorical columns: mode imputation
        if categorical_columns:
            categorical_imputer = SimpleImputer(strategy='most_frequent')
            data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])

        # 2. Encode Categorical Variables
        high_cardinality_columns = [
            col for col in categorical_columns
            if data[col].nunique() > self._HIGH_CARDINALITY_THRESHOLD
        ]
        high_cardinality = set(high_cardinality_columns)
        low_cardinality_columns = [col for col in categorical_columns if col not in high_cardinality]

        # One-hot encoding for low-cardinality categorical variables only;
        # doing it for every column would create one feature per distinct
        # value of ID-like columns.
        if low_cardinality_columns:
            categorical_encoded = pd.get_dummies(data[low_cardinality_columns], dtype=float)
        else:
            categorical_encoded = pd.DataFrame(index=data.index)

        # Label encoding for high-cardinality categorical variables. The codes
        # are kept out of raw_data so they cannot be mistaken for real
        # measurements by the score calculations.
        label_encoded = pd.DataFrame(
            {f'{col}_encoded': LabelEncoder().fit_transform(data[col])
             for col in high_cardinality_columns},
            index=data.index,
        )

        # 3. Normalization
        # Label codes are scaled together with the numeric columns: PCA is
        # scale-sensitive and raw codes would otherwise dominate the variance.
        to_scale = pd.concat([data[numeric_columns], label_encoded], axis=1)
        if to_scale.shape[1]:
            scaled = pd.DataFrame(
                MinMaxScaler().fit_transform(to_scale.to_numpy(dtype=float)),
                columns=to_scale.columns,
                index=data.index,
            )
        else:
            scaled = to_scale

        # Combine processed features
        features = pd.concat([scaled, categorical_encoded], axis=1)

        # 4. Dimensionality Reduction
        pca = PCA(n_components=0.95)  # Retain 95% of variance
        self.processed_data = pd.DataFrame(
            pca.fit_transform(features.to_numpy(dtype=float)),
            columns=[f'PC{i+1}' for i in range(pca.n_components_)],
            index=data.index,
        )

        # 5. Financial Engagement Score
        self.calculate_financial_engagement_score()

        # 6. Financial Literacy Score
        self.calculate_financial_literacy_score()

    def _mean_normalized(self, keyword):
        """
        Row-wise mean of the min-max scaled numeric columns matching ``keyword``.

        Returns an array of zeros (and warns) when no numeric column name
        contains ``keyword``, so a missing component does not abort the run.
        """
        import warnings

        matching = [col for col in self.raw_data.columns if keyword in str(col).lower()]
        numeric = self.raw_data[matching].select_dtypes(include='number')
        if numeric.shape[1] == 0:
            warnings.warn(
                f"No numeric columns containing '{keyword}' were found; that "
                "component contributes 0 to the financial engagement score",
                stacklevel=3,
            )
            return np.zeros(len(self.raw_data))
        return MinMaxScaler().fit_transform(numeric.to_numpy(dtype=float)).mean(axis=1)

    def calculate_financial_engagement_score(self):
        """
        Calculate a composite financial engagement score

        The score is ``0.6 * transactions + 0.4 * savings`` where each term is
        the mean of the min-max scaled numeric columns whose name contains
        'transaction' / 'savings'. The result is written to
        ``processed_data['financial_engagement_score']``.
        """
        processed = self._require_processed_data()

        # Weighted average (can adjust weights based on domain knowledge)
        transaction_weight = 0.6
        savings_weight = 0.4

        processed['financial_engagement_score'] = (
            transaction_weight * self._mean_normalized('transaction') +
            savings_weight * self._mean_normalized('savings')
        )

    def calculate_financial_literacy_score(self):
        """
        Calculate a composite financial literacy score

        Uses ``0.6 * education + 0.4 * income`` on min-max scaled values and
        writes the result to ``processed_data['financial_literacy_score']``.

        Raises:
            KeyError: If ``education_col`` or ``income_col`` is not in the data.
        """
        processed = self._require_processed_data()
        missing = [
            col for col in (self.education_col, self.income_col)
            if col not in self.raw_data.columns
        ]
        if missing:
            raise KeyError(
                f"Literacy score needs column(s) {missing}; set education_col/"
                "income_col to the names used in the dataset"
            )

        # Use education and income as primary indicators
        education_mapping = {
            'Primary': 1,
            'Secondary': 2,
            'Tertiary': 3,
            'University': 4
        }

        # Levels outside the mapping would become NaN and break clustering;
        # they are ranked below 'Primary' (0).
        # NOTE(review): confirm 0 is the right rank for unmapped levels.
        education_level = self.raw_data[self.education_col].map(education_mapping).fillna(0)

        # Normalize education and income
        scaler = MinMaxScaler()
        normalized_education = scaler.fit_transform(
            education_level.to_numpy(dtype=float).reshape(-1, 1)
        ).ravel()
        normalized_income = scaler.fit_transform(
            self.raw_data[self.income_col].to_numpy(dtype=float).reshape(-1, 1)
        ).ravel()

        # Weighted average (can adjust weights)
        education_weight = 0.6
        income_weight = 0.4

        processed['financial_literacy_score'] = (
            education_weight * normalized_education +
            income_weight * normalized_income
        )

    def cluster_financial_exclusion(self):
        """
        Cluster individuals into inclusion/exclusion groups

        Runs 2-cluster K-Means on the feature columns and marks as excluded the
        cluster containing the row with the lowest financial engagement score.
        Adds ``exclusion_cluster`` and ``financial_exclusion_label`` (bool) to
        ``processed_data``.
        """
        processed = self._require_processed_data()

        # Cluster on the features only so the method can be re-run safely.
        features = processed.drop(columns=self._DERIVED_COLUMNS, errors='ignore')

        # n_init is pinned so results do not depend on the library default.
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(features)

        # Look the cluster up directly; calling predict() on a row that already
        # carries the cluster column would have the wrong number of features.
        least_engaged = int(np.argmin(processed['financial_engagement_score'].to_numpy()))
        excluded_cluster = clusters[least_engaged]

        processed['exclusion_cluster'] = clusters
        # Create binary exclusion label
        processed['financial_exclusion_label'] = clusters == excluded_cluster

    @staticmethod
    def _positive_class_shap(shap_values):
        """
        Return SHAP values for the last (positive) class as a 2-D array.

        Handles a list with one array per class, an array shaped
        (n_samples, n_features, n_classes), and a plain 2-D array.
        """
        if isinstance(shap_values, list):
            return np.asarray(shap_values[-1])
        values = np.asarray(shap_values)
        if values.ndim == 3:
            return values[:, :, -1]
        return values

    def train_classification_models(self):
        """
        Train multiple classification models to predict financial exclusion

        Returns:
            dict: Model name -> metrics (``accuracy``, ``precision``,
            ``recall``, ``f1_score``); Random Forest and Logistic Regression
            also get a ``shap_summary`` entry.
        """
        processed = self._require_processed_data()

        # The cluster id determines the label, so it is dropped with the label
        # itself to avoid leaking the target into the features.
        X = processed.drop(columns=self._DERIVED_COLUMNS, errors='ignore')
        y = processed['financial_exclusion_label']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Standardize features for SVM and Logistic Regression
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Models to train (seeded so repeated runs give the same metrics)
        models = {
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        results = {}
        for name, model in models.items():
            # Train model
            model.fit(X_train_scaled, y_train)

            # Predict and evaluate
            y_pred = model.predict(X_test_scaled)
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

            results[name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            }

            # SHAP Explainability
            if name in ['Random Forest', 'Logistic Regression']:
                if name == 'Random Forest':
                    explainer = shap.TreeExplainer(model)
                else:
                    explainer = shap.LinearExplainer(model, X_train_scaled)
                shap_values = self._positive_class_shap(explainer.shap_values(X_test_scaled))

                # Rank features by mean |SHAP| over the test samples.
                importance = np.abs(shap_values).mean(axis=0)
                top_positions = importance.argsort()[-5:][::-1]
                results[name]['shap_summary'] = {
                    'mean_abs_shap_value': np.abs(shap_values).mean(),
                    'top_features': list(X.columns[top_positions])
                }

        return results

# Example Usage
def main():
    """Run the whole pipeline on 'updated_dataset.csv' and print every model's metrics."""
    pipeline = FinancialExclusionAnalysis('updated_dataset.csv')

    # The three stages must run in this order: features, labels, models.
    pipeline.preprocess_data()
    pipeline.cluster_financial_exclusion()
    model_results = pipeline.train_classification_models()

    # One section per model, one line per metric.
    for model, metrics in model_results.items():
        print(f"\n{model} Results:")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")


if __name__ == '__main__':
    main()

In [2]:
import pandas as pd

def _dedupe_column_names(names):
    """Return ``names`` with repeats made unique by appending '.1', '.2', ..."""
    used = set()
    repeats = {}
    unique_names = []
    for name in names:
        candidate = name
        # Keep counting until the suffixed name is free, so a generated
        # 'x.1' can never collide with a column already called 'x.1'.
        while candidate in used:
            repeats[name] = repeats.get(name, 0) + 1
            candidate = f"{name}.{repeats[name]}"
        used.add(candidate)
        unique_names.append(candidate)
    return unique_names


# 1. Create a mapping of short column names to descriptive labels
columns_info_path = 'labels.xlsx'  # Replace with the actual file containing column names and labels
column_info = pd.read_excel(columns_info_path)  # This file contains Variable, Position, and Label

# Map short column names to detailed descriptions. Variables without a label
# are skipped so they keep their short name instead of being renamed to NaN.
labelled_columns = column_info.dropna(subset=['Variable', 'Label'])
column_mapping = dict(zip(labelled_columns['Variable'], labelled_columns['Label']))

# 2. Load the actual dataset
data_path = 'data2021.csv'  # Replace with the actual dataset file
data = pd.read_csv(data_path)

# 3. Rename the columns in the actual dataset
data.rename(columns=column_mapping, inplace=True)

# Check for duplicates and make labels unique if necessary. This is done with
# plain Python rather than pandas' private parser helpers, which are not part
# of the public API.
if data.columns.duplicated().any():
    data.columns = _dedupe_column_names(data.columns)

# Display the updated DataFrame with descriptive column names
print("Updated DataFrame with descriptive column names:")
print(data.head())

# Save the updated dataset if required
data.to_csv('updated_dataset.csv', index=False)


Updated DataFrame with descriptive column names:
  Unique 32-character long identifier of the interview   County  \
0                   0297462173dc461aa800fcec4e0bbe30    Mombasa   
1                   02e0d75a58ca4562ad79b82ce7219716    Mombasa   
2                   03451675e1444f859abf71c1aaec078d    Mombasa   
3                   04acd31535be49cb89db86e698ef1f1c    Mombasa   
4                   05dd5394d4f340ffb8bcb76e98ecbd36    Mombasa   

   Individual Weights Selected Respondent Gender Cluster Type (rural/urban)  \
0                1836                       Male                      Urban   
1                1506                       Male                      Urban   
2                1336                       Male                      Urban   
3                2493                       Male                      Urban   
4                1980                       Male                      Urban   

  Marital status of Respondent Education level of Respondent  \
0        

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import shap
import time


In [2]:
def load_data(data_path):
    """Read the CSV file at ``data_path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(data_path)
    return frame

# CSV read by this notebook run.
data_path = 'updated_dataset.csv'

# Load the dataset and report how long the read took (wall-clock seconds).
start_time = time.time()
raw_data = load_data(data_path)
print(f"Data loaded in {time.time() - start_time:.2f} seconds")


Data loaded in 0.75 seconds


In [None]:
def preprocess_data(data):
    """
    Impute, encode, scale and PCA-reduce ``data``.

    Missing values are imputed in place in ``data`` (median for numeric
    columns, most frequent value for categorical ones) so later steps that
    read ``data`` see complete columns. No other columns are added to it.

    Args:
        data (pd.DataFrame): Raw dataset.

    Returns:
        pd.DataFrame: Principal components ('PC1', 'PC2', ...) retaining 95%
        of the variance, indexed like ``data``.

    Raises:
        ValueError: If no numeric or categorical column contains any data.
    """
    # Columns that are entirely missing are skipped: the imputer would drop
    # them and the assignment back into the frame would no longer line up.
    numeric_columns = [
        col for col in data.select_dtypes(include=['int64', 'float64']).columns
        if data[col].notna().any()
    ]
    categorical_columns = [
        col for col in data.select_dtypes(include=['object']).columns
        if data[col].notna().any()
    ]
    if not numeric_columns and not categorical_columns:
        raise ValueError("No numeric or categorical columns with data to preprocess")

    # Handle missing values
    if numeric_columns:
        numeric_imputer = SimpleImputer(strategy='median')
        data[numeric_columns] = numeric_imputer.fit_transform(data[numeric_columns])
    if categorical_columns:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])

    # Encode categorical variables: one-hot for at most 10 distinct values,
    # label codes above that. One-hot encoding every column would create one
    # feature per distinct value of ID-like columns.
    high_cardinality_columns = [col for col in categorical_columns if data[col].nunique() > 10]
    high_cardinality = set(high_cardinality_columns)
    low_cardinality_columns = [col for col in categorical_columns if col not in high_cardinality]

    if low_cardinality_columns:
        categorical_encoded = pd.get_dummies(data[low_cardinality_columns], dtype=float)
    else:
        categorical_encoded = pd.DataFrame(index=data.index)

    label_encoded = pd.DataFrame(
        {f'{col}_encoded': LabelEncoder().fit_transform(data[col])
         for col in high_cardinality_columns},
        index=data.index,
    )

    # Normalize numeric data and label codes together: PCA is scale-sensitive
    # and raw codes would otherwise dominate the variance.
    to_scale = pd.concat([data[numeric_columns], label_encoded], axis=1)
    if to_scale.shape[1]:
        scaled = pd.DataFrame(
            MinMaxScaler().fit_transform(to_scale.to_numpy(dtype=float)),
            columns=to_scale.columns,
            index=data.index,
        )
    else:
        scaled = to_scale

    # Combine all processed features (every part shares data's index, so the
    # concat cannot misalign rows)
    features = pd.concat([scaled, categorical_encoded], axis=1)

    # Dimensionality reduction using PCA
    pca = PCA(n_components=0.95)
    processed_data = pd.DataFrame(
        pca.fit_transform(features.to_numpy(dtype=float)),
        columns=[f'PC{i+1}' for i in range(pca.n_components_)],
        index=data.index,
    )

    return processed_data

# Preprocess the loaded dataset and report the elapsed wall-clock time.
start_time = time.time()
processed_data = preprocess_data(raw_data)
print(f"Data preprocessed in {time.time() - start_time:.2f} seconds")


In [None]:
def calculate_financial_engagement_score(data, processed_data):
    """
    Add a composite financial engagement score to ``processed_data``.

    The score is ``0.6 * transactions + 0.4 * savings`` where each term is the
    row-wise mean of the min-max scaled numeric columns of ``data`` whose name
    contains 'transaction' / 'savings' (case-insensitive). A term with no
    matching numeric column contributes 0 and triggers a warning.

    Args:
        data (pd.DataFrame): Raw dataset; expected to be free of missing
            values in the matching columns.
        processed_data (pd.DataFrame): Frame with one row per row of ``data``;
            modified in place.

    Returns:
        pd.DataFrame: ``processed_data`` with a 'financial_engagement_score'
        column.
    """
    import warnings

    def _mean_normalized(keyword):
        # Non-numeric matches (e.g. categorical answers) cannot be scaled.
        matching = [col for col in data.columns if keyword in str(col).lower()]
        numeric = data[matching].select_dtypes(include='number')
        if numeric.shape[1] == 0:
            warnings.warn(
                f"No numeric columns containing '{keyword}' were found; that "
                "component contributes 0 to the financial engagement score",
                stacklevel=3,
            )
            return np.zeros(len(data))
        return MinMaxScaler().fit_transform(numeric.to_numpy(dtype=float)).mean(axis=1)

    financial_engagement_score = (
        0.6 * _mean_normalized('transaction') +
        0.4 * _mean_normalized('savings')
    )
    processed_data['financial_engagement_score'] = financial_engagement_score
    return processed_data

# Add the engagement score to processed_data and report the elapsed time.
start_time = time.time()
processed_data = calculate_financial_engagement_score(raw_data, processed_data)
print(f"Financial engagement score calculated in {time.time() - start_time:.2f} seconds")


In [None]:
def calculate_financial_literacy_score(data, processed_data,
                                       education_col='education', income_col='quintile'):
    """
    Add a composite financial literacy score to ``processed_data``.

    The score is ``0.6 * education + 0.4 * income`` on min-max scaled values.

    Args:
        data (pd.DataFrame): Raw dataset.
        processed_data (pd.DataFrame): Frame with one row per row of ``data``;
            modified in place.
        education_col (str): Column of ``data`` holding the education level.
        income_col (str): Column of ``data`` holding the income quintile.

    Returns:
        pd.DataFrame: ``processed_data`` with a 'financial_literacy_score'
        column.

    Raises:
        KeyError: If ``education_col`` or ``income_col`` is not in ``data``.
    """
    missing = [col for col in (education_col, income_col) if col not in data.columns]
    if missing:
        raise KeyError(
            f"Literacy score needs column(s) {missing}; pass education_col/"
            "income_col with the names used in the dataset"
        )

    education_mapping = {'Primary': 1, 'Secondary': 2, 'Tertiary': 3, 'University': 4}

    # Levels outside the mapping would become NaN and break clustering later;
    # they are ranked below 'Primary' (0).
    # NOTE(review): confirm 0 is the right rank for unmapped levels.
    education_level = data[education_col].map(education_mapping).fillna(0)

    scaler = MinMaxScaler()
    # ravel() gives 1-D arrays so the score is stored as a plain column.
    normalized_education = scaler.fit_transform(
        education_level.to_numpy(dtype=float).reshape(-1, 1)
    ).ravel()
    normalized_income = scaler.fit_transform(
        data[income_col].to_numpy(dtype=float).reshape(-1, 1)
    ).ravel()

    financial_literacy_score = (
        0.6 * normalized_education +
        0.4 * normalized_income
    )
    processed_data['financial_literacy_score'] = financial_literacy_score
    return processed_data

# Add the literacy score to processed_data and report the elapsed time.
start_time = time.time()
processed_data = calculate_financial_literacy_score(raw_data, processed_data)
print(f"Financial literacy score calculated in {time.time() - start_time:.2f} seconds")


In [None]:
def cluster_financial_exclusion(processed_data):
    """
    Split the rows into two K-Means clusters and label one as excluded.

    The excluded cluster is the one containing the row with the lowest
    'financial_engagement_score'.

    Args:
        processed_data (pd.DataFrame): Feature frame containing a
            'financial_engagement_score' column; modified in place.

    Returns:
        pd.DataFrame: ``processed_data`` with 'exclusion_cluster' (cluster id)
        and 'financial_exclusion_label' (bool) columns.
    """
    # Cluster on the features only, so re-running the cell does not feed the
    # previous cluster/label columns back into K-Means.
    features = processed_data.drop(
        columns=['exclusion_cluster', 'financial_exclusion_label'], errors='ignore'
    )

    # n_init is pinned so results do not depend on the library default.
    kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(features)

    # Look the cluster up directly; calling predict() on a row that already
    # carries the cluster column would have the wrong number of features.
    least_engaged = int(np.argmin(processed_data['financial_engagement_score'].to_numpy()))
    excluded_cluster = clusters[least_engaged]

    processed_data['exclusion_cluster'] = clusters
    processed_data['financial_exclusion_label'] = clusters == excluded_cluster
    return processed_data

# Cluster the processed rows, attach the exclusion label and report the elapsed time.
start_time = time.time()
processed_data = cluster_financial_exclusion(processed_data)
print(f"Financial exclusion clustered in {time.time() - start_time:.2f} seconds")


In [None]:
def _positive_class_shap_values(shap_values):
    """
    Return SHAP values for the last (positive) class as a 2-D array.

    Handles a list with one array per class, an array shaped
    (n_samples, n_features, n_classes), and a plain 2-D array.
    """
    if isinstance(shap_values, list):
        return np.asarray(shap_values[-1])
    values = np.asarray(shap_values)
    if values.ndim == 3:
        return values[:, :, -1]
    return values


def train_classification_models(processed_data):
    """
    Train four classifiers to predict 'financial_exclusion_label'.

    Args:
        processed_data (pd.DataFrame): Features plus the
            'financial_exclusion_label' column.

    Returns:
        dict: Model name -> metrics ('accuracy', 'precision', 'recall',
        'f1_score'); Random Forest and Logistic Regression also get a
        'shap_summary' entry with the mean |SHAP| value and the top 5 features.
    """
    # The cluster id determines the label, so it is dropped with the label
    # itself to avoid leaking the target into the features.
    X = processed_data.drop(
        columns=['financial_exclusion_label', 'exclusion_cluster'], errors='ignore'
    )
    y = processed_data['financial_exclusion_label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only, then apply it to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Seeded so repeated runs give the same metrics.
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(probability=True, random_state=42)
    }

    results = {}
    for name, model in models.items():
        model_start_time = time.time()
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        results[name] = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1}

        if name in ['Random Forest', 'Logistic Regression']:
            if name == 'Random Forest':
                explainer = shap.TreeExplainer(model)
            else:
                explainer = shap.LinearExplainer(model, X_train_scaled)
            shap_values = _positive_class_shap_values(explainer.shap_values(X_test_scaled))

            # Rank features by mean |SHAP| over the test samples.
            importance = np.abs(shap_values).mean(axis=0)
            top_positions = importance.argsort()[-5:][::-1]
            results[name]['shap_summary'] = {
                'mean_abs_shap_value': np.abs(shap_values).mean(),
                'top_features': list(X.columns[top_positions])
            }

        print(f"{name} trained in {time.time() - model_start_time:.2f} seconds")
    return results

# Train and evaluate all models, then report the total elapsed time.
start_time = time.time()
results = train_classification_models(processed_data)
print(f"All models trained in {time.time() - start_time:.2f} seconds")


In [None]:
# Print one section per model, with one "metric: value" line per entry.
for model, metrics in results.items():
    print(f"\n{model} Results:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
