# 🔍 Data Type Diagnostic Tool
## Run this FIRST to identify correct numeric vs categorical columns

This notebook will help us identify:
1. Which `object` columns should actually be numeric
2. Which numeric columns are truly continuous vs categorical/binary
3. What problematic values exist in each column

---

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the raw dataset. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, so each column gets a single inferred dtype.
dt = pd.read_csv('data/Dataset.csv', low_memory=False)

# Quick sanity check on the size of what was loaded.
print(f"Dataset Shape: {dt.shape}")
print(f"\nColumns: {dt.shape[1]}")

Dataset Shape: (121856, 40)

Columns: 40


---
## Step 1: Initial Data Types
---

In [2]:
# List every column next to its current dtype in two fixed-width fields.
print("Current Data Types:")
print("="*80)
for col, col_dtype in dt.dtypes.items():
    print(f"{col:35s} {str(col_dtype):15s}")

Current Data Types:
ID                                  int64          
Client_Income                       object         
Car_Owned                           float64        
Bike_Owned                          float64        
Active_Loan                         float64        
House_Own                           float64        
Child_Count                         float64        
Credit_Amount                       object         
Loan_Annuity                        object         
Accompany_Client                    object         
Client_Income_Type                  object         
Client_Education                    object         
Client_Marital_Status               object         
Client_Gender                       object         
Loan_Contract_Type                  object         
Client_Housing_Type                 object         
Population_Region_Relative          object         
Age_Days                            object         
Employed_Days                       object  

In [3]:
# Tally how many columns share each dtype.
dtype_counts = dt.dtypes.value_counts()
print("\nData Type Distribution:")
print(dtype_counts)


Data Type Distribution:
object     20
float64    15
int64       5
Name: count, dtype: int64


---
## Step 2: Detailed Column Analysis
This will show for EACH column:
- Current dtype
- Number of unique values
- Sample of actual values
- Whether it can be converted to numeric
---

In [4]:
def analyze_column(df, col):
    """Print a detailed diagnostic report for a single column.

    The report always contains the dtype, the null count and percentage, the
    number of distinct values and up to ten sample values. Columns with at
    most 20 distinct values also get their value counts. After that:

    * text columns (``object`` dtype or a pandas string dtype) are
      test-converted with ``pd.to_numeric(errors='coerce')`` and a
      recommendation is printed from the share of non-null values that parse
      (> 95 %: convert, > 50 %: mixed, otherwise keep categorical);
    * numeric columns (anything ``pd.api.types.is_numeric_dtype`` accepts, not
      only ``int64``/``float64``) get ``describe()`` statistics and, when they
      have at most 10 distinct values, a low-cardinality / binary-flag note.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : hashable
        Label of the column to analyse.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    series = df[col]

    print(f"\n{'='*80}")
    print(f"Column: {col}")
    print(f"{'='*80}")

    # Basic info -- computed once and reused below.
    n_rows = len(df)
    null_count = series.isnull().sum()
    # Guard the percentage so an empty frame reports 0.00% instead of nan.
    null_pct = null_count / n_rows * 100 if n_rows else 0.0
    n_unique = series.nunique()

    print(f"Current dtype: {series.dtype}")
    print(f"Null count: {null_count:,} ({null_pct:.2f}%)")
    print(f"Unique values: {n_unique:,}")

    # Sample non-null values
    sample_values = series.dropna().unique()[:10]
    print(f"\nSample values (first 10):")
    print(sample_values)

    # Value counts for low cardinality
    if n_unique <= 20:
        print(f"\nValue counts:")
        print(series.value_counts())

    # Detect by dtype family rather than by exact dtype name, so that
    # string-dtype text columns and int32/float32/nullable numerics are
    # not silently skipped.
    is_text = pd.api.types.is_object_dtype(series) or isinstance(series.dtype, pd.StringDtype)

    # Try numeric conversion if the column holds text
    if is_text:
        print(f"\n--- NUMERIC CONVERSION TEST ---")
        try:
            numeric_test = pd.to_numeric(series, errors='coerce')
            successfully_converted = numeric_test.notna().sum()
            total_non_null = series.notna().sum()
            conversion_rate = (successfully_converted / total_non_null * 100) if total_non_null > 0 else 0

            print(f"Can convert to numeric: {successfully_converted:,} / {total_non_null:,} ({conversion_rate:.1f}%)")

            # Values that were present but could not be parsed as numbers.
            problematic = series[numeric_test.isna() & series.notna()]

            if conversion_rate > 95:
                print(f"‚úÖ RECOMMENDATION: CONVERT TO NUMERIC")
                # Show what would become NaN
                if len(problematic) > 0:
                    print(f"\nProblematic non-numeric values:")
                    print(problematic.unique()[:10])
            elif conversion_rate > 50:
                print(f"‚ö†Ô∏è  MIXED TYPE - needs investigation")
                if len(problematic) > 0:
                    print(f"\nProblematic non-numeric values ({len(problematic)}):")
                    print(problematic.value_counts())
            else:
                print(f"‚úÖ RECOMMENDATION: KEEP AS CATEGORICAL")
        except Exception as e:
            # Best effort: report the failure and carry on with other columns.
            print(f"‚ùå Error testing numeric conversion: {e}")

    # For numeric columns, check distribution
    elif pd.api.types.is_numeric_dtype(series):
        print(f"\n--- NUMERIC STATISTICS ---")
        print(series.describe())

        # Check if it's actually binary/categorical
        if n_unique <= 10:
            print(f"\n‚ö†Ô∏è  LOW CARDINALITY (‚â§10 unique values)")
            print(f"This might be a categorical/binary flag, not continuous numeric")
            unique_vals = series.dropna().unique()
            print(f"Unique values: {sorted(unique_vals)}")

            # Check if binary (0/1)
            if set(unique_vals).issubset({0, 1, 0.0, 1.0}):
                print(f"‚úÖ This is a BINARY FLAG (0/1)")

# Print a banner, then produce the per-column report for every column
# in the order the columns appear in the frame.
print("\n" + "#"*80)
print("# STARTING DETAILED COLUMN ANALYSIS")
print("#"*80)

for column_label in dt:
    analyze_column(dt, column_label)


################################################################################
# STARTING DETAILED COLUMN ANALYSIS
################################################################################

Column: ID
Current dtype: int64
Null count: 0 (0.00%)
Unique values: 121,856

Sample values (first 10):
[12142509 12138936 12181264 12188929 12133385 12191614 12128086 12215264
 12159147 12130547]

--- NUMERIC STATISTICS ---
count    1.218560e+05
mean     1.216093e+07
std      3.517694e+04
min      1.210000e+07
25%      1.213046e+07
50%      1.216093e+07
75%      1.219139e+07
max      1.222186e+07
Name: ID, dtype: float64

Column: Client_Income
Current dtype: object
Null count: 3,607 (2.96%)
Unique values: 1,217

Sample values (first 10):
['6750' '20250' '18000' '15750' '33750' '11250' '13500' '12150' '27000'
 '8100']

--- NUMERIC CONVERSION TEST ---
Can convert to numeric: 118,234 / 118,249 (100.0%)
‚úÖ RECOMMENDATION: CONVERT TO NUMERIC

Problematic non-numeric values:
['$']

Column: Car

---
## Step 3: Summary of Recommendations
---

In [5]:
# Sort every column (except 'Default') into one of four buckets and print a
# summary. The four lists are reused by the cells that follow.
print("\n" + "="*80)
print("COLUMN TYPE RECOMMENDATIONS")
print("="*80)

should_be_numeric = []   # (column, conversion rate in %) for text columns that parse as numbers
truly_numeric = []       # numeric columns that are not 0/1 flags
binary_flags = []        # numeric columns whose only non-null values are 0 and/or 1
truly_categorical = []   # text columns that do not parse as numbers

for col in dt.columns:
    # NOTE(review): 'Default' is presumably the target label -- confirm.
    if col == 'Default':
        continue

    col_values = dt[col]

    # Detect text by dtype family so pandas string-dtype columns are included
    # alongside plain object columns.
    if pd.api.types.is_object_dtype(col_values) or isinstance(col_values.dtype, pd.StringDtype):
        # Test numeric conversion
        try:
            numeric_test = pd.to_numeric(col_values, errors='coerce')
        except (TypeError, ValueError):
            # Only conversion failures mean "not numeric"; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            truly_categorical.append(col)
            continue

        non_null_total = col_values.notna().sum()
        # A column without any non-null value has nothing convertible.
        conversion_rate = (numeric_test.notna().sum() / non_null_total * 100) if non_null_total > 0 else 0

        if conversion_rate > 95:
            should_be_numeric.append((col, conversion_rate))
        else:
            truly_categorical.append(col)

    # Any numeric dtype counts here, not just int64/float64, so int32/float32
    # and nullable numeric columns are no longer dropped from every bucket.
    elif pd.api.types.is_numeric_dtype(col_values):
        unique_vals = col_values.dropna().unique()
        if len(unique_vals) <= 2 and set(unique_vals).issubset({0, 1, 0.0, 1.0}):
            binary_flags.append(col)
        else:
            truly_numeric.append(col)

print(f"\n‚úÖ SHOULD BE CONVERTED TO NUMERIC ({len(should_be_numeric)} columns):")
for col, rate in should_be_numeric:
    print(f"  - {col:35s} ({rate:.1f}% convertible)")

print(f"\n‚úÖ ALREADY NUMERIC - CONTINUOUS ({len(truly_numeric)} columns):")
for col in truly_numeric:
    print(f"  - {col}")

print(f"\n‚úÖ ALREADY NUMERIC - BINARY FLAGS ({len(binary_flags)} columns):")
for col in binary_flags:
    print(f"  - {col}")

print(f"\n‚úÖ TRULY CATEGORICAL ({len(truly_categorical)} columns):")
for col in truly_categorical:
    print(f"  - {col}")

print(f"\n" + "="*80)
print(f"TOTAL: {len(should_be_numeric) + len(truly_numeric) + len(binary_flags)} NUMERIC")
print(f"TOTAL: {len(truly_categorical)} CATEGORICAL")
print("="*80)


COLUMN TYPE RECOMMENDATIONS

‚úÖ SHOULD BE CONVERTED TO NUMERIC (9 columns):
  - Client_Income                       (100.0% convertible)
  - Credit_Amount                       (100.0% convertible)
  - Loan_Annuity                        (100.0% convertible)
  - Population_Region_Relative          (100.0% convertible)
  - Age_Days                            (100.0% convertible)
  - Employed_Days                       (100.0% convertible)
  - Registration_Days                   (100.0% convertible)
  - ID_Days                             (100.0% convertible)
  - Score_Source_3                      (100.0% convertible)

‚úÖ ALREADY NUMERIC - CONTINUOUS (12 columns):
  - ID
  - Child_Count
  - Own_House_Age
  - Client_Family_Members
  - Cleint_City_Rating
  - Application_Process_Day
  - Application_Process_Hour
  - Score_Source_1
  - Score_Source_2
  - Social_Circle_Default
  - Phone_Change
  - Credit_Bureau

‚úÖ ALREADY NUMERIC - BINARY FLAGS (7 columns):
  - Car_Owned
  - Bike_Owned
 

---
## Step 4: Create Type Mapping Dictionary
This can be copied directly into the preprocessing code
---

In [6]:
# Emit a COLUMN_TYPES dictionary as Python source text for copy-pasting.
# Column labels are written with !r (repr) so a label containing a quote or
# backslash still yields valid Python; ordinary names print exactly as before.
print("\n# Copy this into your preprocessing code:")
print("# " + "="*70)
print("\nCOLUMN_TYPES = {")
print("    'force_numeric': [")
for col, rate in should_be_numeric:
    print(f"        {col!r},  # {rate:.1f}% convertible")
print("    ],")
print("    'continuous_numeric': [")
for col in truly_numeric:
    print(f"        {col!r},")
print("    ],")
print("    'binary_flags': [")
for col in binary_flags:
    print(f"        {col!r},")
print("    ],")
print("    'categorical': [")
for col in truly_categorical:
    print(f"        {col!r},")
print("    ]")
print("}")


# Copy this into your preprocessing code:

COLUMN_TYPES = {
    'force_numeric': [
        'Client_Income',  # 100.0% convertible
        'Credit_Amount',  # 100.0% convertible
        'Loan_Annuity',  # 100.0% convertible
        'Population_Region_Relative',  # 100.0% convertible
        'Age_Days',  # 100.0% convertible
        'Employed_Days',  # 100.0% convertible
        'Registration_Days',  # 100.0% convertible
        'ID_Days',  # 100.0% convertible
        'Score_Source_3',  # 100.0% convertible
    ],
    'continuous_numeric': [
        'ID',
        'Child_Count',
        'Own_House_Age',
        'Client_Family_Members',
        'Cleint_City_Rating',
        'Application_Process_Day',
        'Application_Process_Hour',
        'Score_Source_1',
        'Score_Source_2',
        'Social_Circle_Default',
        'Phone_Change',
        'Credit_Bureau',
    ],
    'binary_flags': [
        'Car_Owned',
        'Bike_Owned',
        'Active_Loan',
        'House_Own',
      

---
## Step 5: Check for Problematic Values
---

In [7]:
print("\n" + "="*80)
print("PROBLEMATIC VALUES CHECK")
print("="*80)

# For each text column flagged for numeric conversion, list the values that
# are present but would turn into NaN when coerced.
for col, _rate in should_be_numeric:
    raw_values = dt[col]
    coerced = pd.to_numeric(raw_values, errors='coerce')
    unparseable = raw_values.notna() & coerced.isna()
    problematic = raw_values[unparseable]

    if problematic.empty:
        continue

    print(f"\n{col}:")
    print(f"  Problematic values: {len(problematic)}")
    print(f"  Unique problematic: {problematic.nunique()}")
    print(f"  Examples: {problematic.unique()[:10]}")


PROBLEMATIC VALUES CHECK

Client_Income:
  Problematic values: 15
  Unique problematic: 1
  Examples: ['$']

Credit_Amount:
  Problematic values: 5
  Unique problematic: 1
  Examples: ['$']

Loan_Annuity:
  Problematic values: 14
  Unique problematic: 2
  Examples: ['$' '#VALUE!']

Population_Region_Relative:
  Problematic values: 11
  Unique problematic: 2
  Examples: ['@' '#']

Age_Days:
  Problematic values: 17
  Unique problematic: 1
  Examples: ['x']

Employed_Days:
  Problematic values: 17
  Unique problematic: 1
  Examples: ['x']

Registration_Days:
  Problematic values: 17
  Unique problematic: 1
  Examples: ['x']

ID_Days:
  Problematic values: 17
  Unique problematic: 1
  Examples: ['x']

Score_Source_3:
  Problematic values: 1
  Unique problematic: 1
  Examples: ['&']


---
## ✅ Next Steps

After running this diagnostic:
1. Review the recommendations above
2. Copy the `COLUMN_TYPES` dictionary
3. Share the results so I can update the preprocessing code
4. We'll create the final preprocessing pipeline with correct types

---