### **#application_metadata.csv tayyorlash**

In [79]:
# Kerakli kutubxonalarni import qilish
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pyarrow.parquet as pq
import re
from sklearn.preprocessing import RobustScaler
import xml.etree.ElementTree as ET
import glob


In [80]:
# CSV faylni yuklash
csv_data = pd.read_csv('data/application_metadata.csv')  # fayl nomini moslashtiring
csv_data.head()

# Duplicate va missing values tekshirish
print("Duplicate qatorlar:", csv_data.duplicated().sum())
print("Missing values:\n", csv_data.isnull().sum())


Duplicate qatorlar: 0
Missing values:
 customer_ref                  0
application_id                0
application_hour              0
application_day_of_week       0
account_open_year             0
preferred_contact             0
referral_code                 0
account_status_code           0
random_noise_1                0
num_login_sessions            0
num_customer_service_calls    0
has_mobile_app                0
paperless_billing             0
default                       0
dtype: int64


In [81]:
df_csv = csv_data.drop(columns=['application_id', 'random_noise_1'])
df_csv.rename(columns={'customer_ref': 'customer_id'}, inplace=True)
print(df_csv.head())

   customer_id  application_hour  application_day_of_week  account_open_year  \
0        10000                 5                        6               2013   
1        10001                 4                        2               2015   
2        10002                10                        3               2020   
3        10003                 7                        5               2010   
4        10004                 1                        2               2020   

  preferred_contact referral_code account_status_code  num_login_sessions  \
0              Mail       REF0000               ACT-2                  13   
1             Phone       REF0000               ACT-3                   6   
2             Phone       REF0000               ACT-3                   1   
3             Email       REF0000                 A01                   4   
4              Mail       REF0000               ACT-3                   6   

   num_customer_service_calls  has_mobile_app  paperless

In [82]:
# CSV sifatida saqlash
df_csv.to_csv('data_new/application_metadata.csv', index=False)


### **#credit_history.parquet**

In [83]:
table = pq.read_table('data/credit_history.parquet', use_pandas_metadata=True)
df_parquet = table.to_pandas()
#parquet fileni o'qish
print(df_parquet.head())


   customer_number  credit_score  num_credit_accounts  oldest_credit_line_age  \
0            10000           696                   14                    22.8   
1            10001           659                   13                     3.5   
2            10002           662                    3                     0.0   
3            10003           676                    8                     9.0   
4            10004           678                    7                     8.0   

   oldest_account_age_months  total_credit_limit  num_delinquencies_2yrs  \
0                      273.6            169100.0                     0.0   
1                       42.0             78200.0                     0.0   
2                        0.0             41400.0                     0.0   
3                      108.0             60000.0                     0.0   
4                       96.0             49700.0                     0.0   

   num_inquiries_6mo  recent_inquiry_count  num_public_r

In [84]:
# missing valuelarni aniqlash
df_parquet['num_delinquencies_2yrs'].value_counts(dropna=False)
df_parquet['num_delinquencies_2yrs'] = df_parquet['num_delinquencies_2yrs'].replace(-0.02, pd.NA)
df_parquet.isna().sum()



customer_number                0
credit_score                   0
num_credit_accounts            0
oldest_credit_line_age         0
oldest_account_age_months      0
total_credit_limit             0
num_delinquencies_2yrs       832
num_inquiries_6mo              0
recent_inquiry_count           0
num_public_records             0
num_collections                0
account_diversity_index        0
dtype: int64

In [85]:
df_parquet['num_delinquencies_2yrs'] = df_parquet['num_delinquencies_2yrs'].fillna(df_parquet['num_delinquencies_2yrs'].mode()[0])

df_parquet.isna().sum()
# missing valuelarni eng ko'p takrorlangan qiymatlar bilan to'ldirdi

customer_number              0
credit_score                 0
num_credit_accounts          0
oldest_credit_line_age       0
oldest_account_age_months    0
total_credit_limit           0
num_delinquencies_2yrs       0
num_inquiries_6mo            0
recent_inquiry_count         0
num_public_records           0
num_collections              0
account_diversity_index      0
dtype: int64

In [86]:
df_parquet.rename(columns={'customer_number': 'customer_id'}, inplace=True)
#o'xshash ustunlarni birxil qilib olish
df_parquet

Unnamed: 0,customer_id,credit_score,num_credit_accounts,oldest_credit_line_age,oldest_account_age_months,total_credit_limit,num_delinquencies_2yrs,num_inquiries_6mo,recent_inquiry_count,num_public_records,num_collections,account_diversity_index
0,10000,696,14,22.8,273.6,169100.0,0.0,2,2,1,0,0.499
1,10001,659,13,3.5,42.0,78200.0,0.0,6,6,0,0,0.298
2,10002,662,3,0.0,0.0,41400.0,0.0,2,2,0,0,0.174
3,10003,676,8,9.0,108.0,60000.0,0.0,1,1,0,0,0.263
4,10004,678,7,8.0,96.0,49700.0,0.0,1,1,0,0,0.298
...,...,...,...,...,...,...,...,...,...,...,...,...
89994,99994,817,10,8.2,98.4,135600.0,0.0,1,1,0,0,0.285
89995,99995,745,9,8.8,105.6,44600.0,0.0,1,1,0,0,0.353
89996,99996,607,11,1.0,12.0,18300.0,0.0,2,2,1,0,0.238
89997,99997,678,10,3.0,36.0,54300.0,0.0,3,3,0,0,0.227


In [87]:
# CSV sifatida saqlash
df_parquet.to_csv('data_new/credit_history.csv', index=False)


### **# demographics.csv**

In [88]:
df_demo = pd.read_csv('data/demographics.csv')
df_demo.head()

Unnamed: 0,cust_id,age,annual_income,employment_length,employment_type,education,marital_status,num_dependents
0,10000,41,$61800,2.2,Full-time,Graduate,Married,2
1,10001,38,28600,7.0,FULL_TIME,High School,Married,0
2,10002,18,"$20,700",0.8,FULL_TIME,Bachelor,Single,0
3,10003,27,31400,4.8,Full Time,Bachelor,Single,0
4,10004,26,$24600,5.2,Fulltime,High School,Single,0


In [89]:
# ummumiy tekshiruv
print("Duplicate qatorlar:", df_demo.duplicated().sum())
print("Missing values:\n", df_demo.isna().sum())
print(df_demo.info())
print(df_demo.describe())


Duplicate qatorlar: 0
Missing values:
 cust_id                 0
age                     0
annual_income           0
employment_length    2253
employment_type         0
education               0
marital_status          0
num_dependents          0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89999 entries, 0 to 89998
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cust_id            89999 non-null  int64  
 1   age                89999 non-null  int64  
 2   annual_income      89999 non-null  object 
 3   employment_length  87746 non-null  float64
 4   employment_type    89999 non-null  object 
 5   education          89999 non-null  object 
 6   marital_status     89999 non-null  object 
 7   num_dependents     89999 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 5.5+ MB
None
            cust_id           age  employment_length  num_dependents
count  89999.000

In [90]:
# barcha qiymatlarni bir xil ko'rinishga keltirish uchun
df_demo['annual_income'] = df_demo['annual_income'].astype(str)  # avval stringga o'tkazamiz
df_demo['annual_income'] = df_demo['annual_income'].str.replace('$', '', regex=False)
df_demo['annual_income'] = df_demo['annual_income'].str.replace(',', '', regex=False)
df_demo['annual_income'] = df_demo['annual_income'].astype(float)

# dollor belgisini olib tashlash uchun
df_demo.rename(columns={'annual_income': 'annual_income_$'}, inplace=True)

# tekshirish
print(df_demo[['annual_income_$']].head())


   annual_income_$
0          61800.0
1          28600.0
2          20700.0
3          31400.0
4          24600.0


In [91]:
median_employment_length = df_demo['employment_length'].median()
df_demo['employment_length'] = df_demo['employment_length'].fillna(median_employment_length)
# bo'sh qiymatlarni to'ldirish mediana qiymat bilan
print(df_demo[ 'employment_length'].head())
print("Missing values:\n", df_demo.isna().sum())

0    2.2
1    7.0
2    0.8
3    4.8
4    5.2
Name: employment_length, dtype: float64
Missing values:
 cust_id              0
age                  0
annual_income_$      0
employment_length    0
employment_type      0
education            0
marital_status       0
num_dependents       0
dtype: int64


In [92]:
df_demo.rename(columns={'cust_id': 'customer_id'}, inplace=True)
# o'xshash ustunlarni birxillashtirish

In [93]:
# df_demo da employment_type ustunidagi noyob qiymatlarni chiqarish
print(df_demo['employment_type'].unique())

# yoki sorted qilib ko'rish
print(sorted(df_demo['employment_type'].dropna().unique()))


['Full-time' 'FULL_TIME' 'Full Time' 'Fulltime' 'Part Time'
 'Self Employed' 'FT' 'Self Emp' 'Contractor' 'Self-employed'
 'SELF_EMPLOYED' 'Contract' 'PART_TIME' 'CONTRACT' 'PT' 'Part-time']
['CONTRACT', 'Contract', 'Contractor', 'FT', 'FULL_TIME', 'Full Time', 'Full-time', 'Fulltime', 'PART_TIME', 'PT', 'Part Time', 'Part-time', 'SELF_EMPLOYED', 'Self Emp', 'Self Employed', 'Self-employed']


In [94]:
# employment_type ustunini standartlashtirish
df_demo['employment_type'] = df_demo['employment_type'].replace({
    'Full-time': 'Full Time',
    'FULL_TIME': 'Full Time',
    'Full Time': 'Full Time',
    'Fulltime': 'Full Time',
    'FT': 'Full Time',
    'Part Time': 'Part Time',
    'PART_TIME': 'Part Time',
    'Part-time': 'Part Time',
    'PT': 'Part Time',
    'Self Employed': 'Self Employed',
    'Self Emp': 'Self Employed',
    'Self-employed': 'Self Employed',
    'SELF_EMPLOYED': 'Self Employed',
    'Contractor': 'Contractor',
    'Contract': 'Contractor',
    'CONTRACT': 'Contractor'
})

# Natijani tekshirish
print(sorted(df_demo['employment_type'].unique()))


['Contractor', 'Full Time', 'Part Time', 'Self Employed']


In [95]:
df_demo.to_csv("data_new/demographics_cleaned.csv", index=False)
# tozalangan datani saqlash

### **# financial_ratios.jsonl**

In [96]:
df_fin = pd.read_json("data/financial_ratios.jsonl", lines=True)

print("Fayl muvaffaqiyatli o‘qildi!")
print(df_fin.head())
print(df_fin.info())

Fayl muvaffaqiyatli o‘qildi!
   cust_num monthly_income existing_monthly_debt monthly_payment  \
0     10000       5,150.00                738.64         $592.13   
1     10001       2,383.33                392.21        1,013.86   
2     10002       1,725.00                204.07         $317.81   
3     10003       2,616.67               $288.71          234.52   
4     10004       2,050.00               $248.77          334.81   

   debt_to_income_ratio  debt_service_ratio  payment_to_income_ratio  \
0                 0.258            0.258402                    0.115   
1                 0.590            0.589959                    0.425   
2                 0.303            0.302539                    0.184   
3                 0.200            0.199961                    0.090   
4                 0.285            0.284673                    0.163   

   credit_utilization revolving_balance credit_usage_amount available_credit  \
0               0.841       $142,213.10         $

In [97]:
df_fin

Unnamed: 0,cust_num,monthly_income,existing_monthly_debt,monthly_payment,debt_to_income_ratio,debt_service_ratio,payment_to_income_ratio,credit_utilization,revolving_balance,credit_usage_amount,available_credit,total_monthly_debt_payment,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow
0,10000,5150.00,738.64,$592.13,0.258,0.258402,0.115,0.841,"$142,213.10","$142,213.10","$26,886.90",1330.77,15969.24,0.286408,159913.10,3819.23
1,10001,2383.33,392.21,1013.86,0.590,0.589959,0.425,0.971,"$75,932.20",75932.20,"$2,267.80",1406.07,16872.84,3.986014,189932.20,$977.26
2,10002,1725.00,204.07,$317.81,0.303,0.302539,0.184,0.539,22314.6,22314.60,"$19,085.40",521.88,6262.56,0.449275,31614.6,1203.12
3,10003,2616.67,$288.71,234.52,0.200,0.199961,0.090,0.147,8820.00,8820.0,51180.00,523.23,6278.76,0.277070,"$17,520.00",2093.4366666666665
4,10004,2050.00,$248.77,334.81,0.285,0.284673,0.163,0.488,24253.6,24253.6,25446.40,583.58,7002.96,0.292683,"$31,453.60",1466.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89994,99994,6025.00,2120.97,519.87,0.438,0.438314,0.086,0.359,48680.40,48680.40,"$86,919.60",2640.84,31690.08,0.156293,"$59,980.40","$3,384.16"
89995,99995,"$1,666.67",138.97,$545.44,0.411,0.410646,0.327,0.355,15833.00,15833.0,"$28,767.00",$684.41,8212.92,0.825000,32333.0,982.26
89996,99996,1666.67,$129.90,616.96,0.448,0.448116,0.370,0.669,12242.70,12242.7,"$6,057.30",$746.86,8962.32,0.890000,30042.7,919.8066666666667
89997,99997,"$1,666.67",162.11,$351.00,0.308,0.307866,0.211,0.562,"$30,516.60",30516.60,23783.4,513.11,6157.32,0.585000,42216.6,"$1,153.56"


In [98]:
dup_count = df_fin.duplicated().sum()
print("Duplicate qatorlar soni:", dup_count)
# dublicatelarni aniqlash 
print("Missing values:\n", df_fin.isna().sum())

Duplicate qatorlar soni: 0
Missing values:
 cust_num                         0
monthly_income                   0
existing_monthly_debt            0
monthly_payment                  0
debt_to_income_ratio             0
debt_service_ratio               0
payment_to_income_ratio          0
credit_utilization               0
revolving_balance             1377
credit_usage_amount              0
available_credit                 0
total_monthly_debt_payment       0
annual_debt_payment              0
loan_to_annual_income            0
total_debt_amount                0
monthly_free_cash_flow           0
dtype: int64


In [99]:
# dollordagi ustunlar
usd_columns = [
    "monthly_income",
    "existing_monthly_debt",
    "monthly_payment",
    "revolving_balance",
    "credit_usage_amount",
    "available_credit",
    "total_monthly_debt_payment",
    "total_debt_amount",
    "monthly_free_cash_flow"
]

# sonli ustunlar 
ratio_columns = [
    "debt_to_income_ratio",
    "debt_service_ratio",
    "payment_to_income_ratio",
    "credit_utilization",
    "annual_debt_payment",
    "loan_to_annual_income"
]

all_numeric_cols = usd_columns + ratio_columns



def clean_numeric(x, decimals=4):
    if pd.isna(x):
        return None
    x = str(x).strip()
    
   
    x = re.sub(r"[^0-9,.\-]", "", x)
    
    if x in ["", "-", "--"]:
        return None
    
 
    if x.count(",") + x.count(".") > 1:
        seps = [i for i, c in enumerate(x) if c in ",."]
        last = seps[-1]
        cleaned = ""
        for i, c in enumerate(x):
            if i == last:
                cleaned += "."
            elif c in ",.":
                continue
            else:
                cleaned += c
        x = cleaned
    else:
        # Aralash format: "," → "." 
        if "," in x and "." in x:
            x = x.replace(",", "")
        elif "," in x:
            x = x.replace(",", ".")
    
    try:
        return float(f"{float(x):.{decimals}f}")
    except:
        return None



def clean_all_numeric(df_fin):
    df_fin = df_fin.copy()
    
    
    for col in usd_columns:
        df_fin[col] = df_fin[col].apply(lambda x: clean_numeric(x, decimals=2))
    
   
    for col in ratio_columns:
        df_fin[col] = df_fin[col].apply(lambda x: clean_numeric(x, decimals=4))
    
    #
    df_fin.rename(columns={col: col + "_$" for col in usd_columns}, inplace=True)
    
    return df_fin

df_fin.rename(columns={'cust_num': 'customer_id'}, inplace=True)

df_fin = clean_all_numeric(df_fin)



In [100]:
df_fin

Unnamed: 0,customer_id,monthly_income_$,existing_monthly_debt_$,monthly_payment_$,debt_to_income_ratio,debt_service_ratio,payment_to_income_ratio,credit_utilization,revolving_balance_$,credit_usage_amount_$,available_credit_$,total_monthly_debt_payment_$,annual_debt_payment,loan_to_annual_income,total_debt_amount_$,monthly_free_cash_flow_$
0,10000,5150.00,738.64,592.13,0.258,0.2584,0.115,0.841,142213.1,142213.1,26886.9,1330.77,15969.24,0.2864,159913.1,3819.23
1,10001,2383.33,392.21,1013.86,0.590,0.5900,0.425,0.971,75932.2,75932.2,2267.8,1406.07,16872.84,3.9860,189932.2,977.26
2,10002,1725.00,204.07,317.81,0.303,0.3025,0.184,0.539,22314.6,22314.6,19085.4,521.88,6262.56,0.4493,31614.6,1203.12
3,10003,2616.67,288.71,234.52,0.200,0.2000,0.090,0.147,8820.0,8820.0,51180.0,523.23,6278.76,0.2771,17520.0,2093.44
4,10004,2050.00,248.77,334.81,0.285,0.2847,0.163,0.488,24253.6,24253.6,25446.4,583.58,7002.96,0.2927,31453.6,1466.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89994,99994,6025.00,2120.97,519.87,0.438,0.4383,0.086,0.359,48680.4,48680.4,86919.6,2640.84,31690.08,0.1563,59980.4,3384.16
89995,99995,1666.67,138.97,545.44,0.411,0.4106,0.327,0.355,15833.0,15833.0,28767.0,684.41,8212.92,0.8250,32333.0,982.26
89996,99996,1666.67,129.90,616.96,0.448,0.4481,0.370,0.669,12242.7,12242.7,6057.3,746.86,8962.32,0.8900,30042.7,919.81
89997,99997,1666.67,162.11,351.00,0.308,0.3079,0.211,0.562,30516.6,30516.6,23783.4,513.11,6157.32,0.5850,42216.6,1153.56


In [101]:
df_fin["revolving_balance_$"] = df_fin["revolving_balance_$"].fillna(
    df_fin["revolving_balance_$"].median()
)
#missing valuelarni to'ldiradi

In [102]:
df_fin.to_csv("data_new/financial_ratios.csv", index=False)
# tozalangan datani saqlash

### **# geographic_data.xml**

In [103]:
import pandas as pd

xml_file = "data/geographic_data.xml"

df_xml = pd.read_xml(xml_file)  
print(df_xml.shape)
display(df_xml.head())


(89999, 8)


Unnamed: 0,id,state,regional_unemployment_rate,regional_median_income,regional_median_rent,housing_price_index,cost_of_living_index,previous_zip_code
0,10000,OH,4.8,56000,1380.0,91.0,73.0,451
1,10001,PA,4.4,61000,1510.0,92.0,87.0,537
2,10002,VA,3.9,74000,1920.0,125.0,103.0,679
3,10003,CA,5.8,75000,1690.0,158.0,121.0,719
4,10004,WA,5.8,78000,1700.0,152.0,127.0,933


In [104]:
df_xml.isna().sum()


id                            0
state                         0
regional_unemployment_rate    0
regional_median_income        0
regional_median_rent          0
housing_price_index           0
cost_of_living_index          0
previous_zip_code             0
dtype: int64

In [105]:
# 1. 'id' ustunini 'customer_id' deb nomlash
df_xml = df_xml.rename(columns={"id": "customer_id"})

# 2. Keraksiz ustunlarni olib tashlash
df_xml = df_xml.drop(columns=["previous_zip_code"])

# 3. Numeric ustunlarni standartlashtirish (float, 2 decimal)
numeric_cols = [
    "regional_unemployment_rate",
    "regional_median_income",
    "regional_median_rent",
    "housing_price_index",
    "cost_of_living_index"
]

for col in numeric_cols:
    df_xml[col] = df_xml[col].apply(lambda x: round(float(x), 2) if pd.notnull(x) else None)

# 4. Kategorik ustunlarni saqlash
categorical_cols = ["state"]

df_xml


Unnamed: 0,customer_id,state,regional_unemployment_rate,regional_median_income,regional_median_rent,housing_price_index,cost_of_living_index
0,10000,OH,4.8,56000.0,1380.0,91.0,73.0
1,10001,PA,4.4,61000.0,1510.0,92.0,87.0
2,10002,VA,3.9,74000.0,1920.0,125.0,103.0
3,10003,CA,5.8,75000.0,1690.0,158.0,121.0
4,10004,WA,5.8,78000.0,1700.0,152.0,127.0
...,...,...,...,...,...,...,...
89994,99994,CA,5.8,75000.0,1830.0,168.0,138.0
89995,99995,IL,4.9,65000.0,1900.0,99.0,84.0
89996,99996,CA,5.4,75000.0,1560.0,157.0,127.0
89997,99997,GA,4.3,58000.0,1310.0,103.0,92.0


In [106]:
df_xml.to_csv("data_new/geographic_data.csv", index=False)
# tozalangan datani saqlash

### **# loan_details.xlsx**

In [107]:
df_xlsx = pd.read_excel("data/loan_details.xlsx")
print("Null qiymatlar:")
print(df_xlsx.isna().sum())


Null qiymatlar:
customer_id            0
loan_type              0
loan_amount            0
loan_term              0
interest_rate          0
loan_purpose           0
loan_to_value_ratio    0
origination_channel    0
loan_officer_id        0
marketing_campaign     0
dtype: int64


In [108]:
df_xlsx['loan_amount'] = df_xlsx['loan_amount'].astype(str).str.replace(r'[^0-9.]', '', regex=True).astype(float)

# loan_type ustunini bir xil formatga keltirish
def clean_loan_type(x):
    x = str(x).strip().lower()  # kichik harfga o'tkazamiz
    if x in ["personal", "personal loan"]:
        return "personal"
    elif x in ["mortgage", "home loan"]:
        return "mortgage"
    elif x in ["credit card", "creditcard", "cc"]:
        return "credit_card"
    else:
        return x  # agar boshqa tur bo'lsa, o'zgarmaydi

# ustunga qo'llash
df_xlsx['loan_type'] = df_xlsx['loan_type'].apply(clean_loan_type)
#kerakmas ustunlarni olib tashlash 
df_xlsx = df_xlsx.drop(columns=['marketing_campaign', 'loan_officer_id'])

df_xlsx

Unnamed: 0,customer_id,loan_type,loan_amount,loan_term,interest_rate,loan_purpose,loan_to_value_ratio,origination_channel
0,10000,personal,17700.0,36,12.50,Debt Consolidation,0.000,Direct Mail
1,10001,mortgage,114000.0,180,6.83,Refinance,0.774,Branch
2,10002,personal,9300.0,36,13.99,Major Purchase,0.000,Online
3,10003,personal,8700.0,48,13.26,Medical,0.000,Online
4,10004,personal,7200.0,24,10.77,Debt Consolidation,0.000,Branch
...,...,...,...,...,...,...,...,...
89994,99994,personal,11300.0,24,9.70,Home Improvement,0.000,Branch
89995,99995,personal,16500.0,36,11.67,Other,0.000,Branch
89996,99996,personal,17800.0,36,14.99,Major Purchase,0.000,Online
89997,99997,credit_card,11700.0,0,19.10,Revolving Credit,0.000,Branch


In [109]:
df_xlsx.to_csv("data_new/loan_details.csv", index=False)
# tozalangan datani saqlash
df_xlsx.shape

(89999, 8)

### **dataset tayyorlash**

In [110]:
import pandas as pd

# Fayllarni o'qish
app_meta = pd.read_csv("data_new/application_metadata.csv")
credit_hist = pd.read_csv("data_new/credit_history.csv")
demo = pd.read_csv("data_new/demographics_cleaned.csv")
fin_ratios = pd.read_csv("data_new/financial_ratios.csv")
geo = pd.read_csv("data_new/geographic_data.csv")
loan = pd.read_csv("data_new/loan_details.csv")

# Index bo‘yicha birlashtirish
dfs = [app_meta, credit_hist, demo, fin_ratios, geo, loan]
df_merged = pd.concat(dfs, axis=1)

print(df_merged.shape)


(89999, 63)


In [111]:
df_merged = app_meta.merge(credit_hist, on="customer_id", how="left")\
                    .merge(demo, on="customer_id", how="left")\
                    .merge(fin_ratios, on="customer_id", how="left")\
                    .merge(geo, on="customer_id", how="left")\
                    .merge(loan, on="customer_id", how="left")
