In [2]:
import pandas as pd
import numpy as np
import os

# Input file locations, all resolved against a single base directory.
# NOTE(review): the base directory is an absolute, machine-specific path; anyone running
# this elsewhere has to edit it here.
data_path = "/Users/jakubriha/Desktop/ECO225/Project/Data/"
loans_file = os.path.join(data_path, "loans.csv")
kiva_loans_file = os.path.join(data_path, "kiva_loans.csv")
country_stats_file = os.path.join(data_path, "country_stats.csv")
loan_coords_file = os.path.join(data_path, "loan_coords.csv")

In [4]:
# Read both loan tables. The second file's 'id' column is renamed to 'loan_id'
# so that the two frames share a join key.
df1 = pd.read_csv(loans_file)
df2 = pd.read_csv(kiva_loans_file)
df2 = df2.rename(columns={'id': 'loan_id'})

# Inner join: only loans present in both files are kept.
merged_df = pd.merge(df1, df2, on='loan_id', how='inner')

In [5]:
# Keep only rows that have a value in every key column
required_columns = ['funded_amount_x', 'loan_amount_x', 'country_code_x', 'borrower_genders_x']
filtered_df = merged_df.dropna(subset=required_columns).copy()

# Keep only rows whose '_x' and '_y' versions of each categorical column agree.
# A comparison involving NaN evaluates to False, so rows with a missing value on
# either side of a pair are removed as well.
columns_to_check = ['country_code', 'currency', 'borrower_genders', 'repayment_interval']
rows_agree = pd.Series(True, index=filtered_df.index)
for col in columns_to_check:
    rows_agree = rows_agree & (filtered_df[f'{col}_x'] == filtered_df[f'{col}_y'])
filtered_df = filtered_df[rows_agree]

# Collapse each checked pair into a single un-suffixed column
filtered_df = (
    filtered_df
    .rename(columns={f'{col}_x': col for col in columns_to_check})
    .drop(columns=[f'{col}_y' for col in columns_to_check])
)

# For every remaining suffixed column: discard the '_y' version and strip the
# trailing '_x' from its counterpart
columns_to_remove = [name for name in filtered_df.columns if name.endswith('_y')]
columns_to_rename = {name: name[:-2] for name in filtered_df.columns if name.endswith('_x')}

# Apply both clean-ups, then drop observations with no 'raised_time'
filtered_df = (
    filtered_df
    .drop(columns=columns_to_remove)
    .rename(columns=columns_to_rename)
    .dropna(subset=['raised_time'])
)

In [6]:
# Attach country-level population and HDI to each loan (left join keeps every loan row)
df_country = pd.read_csv(country_stats_file, usecols=['country_code', 'population', 'hdi'])
final_df = pd.merge(filtered_df, df_country, on='country_code', how='left')

# Attach loan coordinates, again as a left join, keyed on the loan identifier
df_coords = pd.read_csv(loan_coords_file)
final_df = pd.merge(final_df, df_coords, on='loan_id', how='left')

# gender_dummy is 1 when 'borrower_genders' contains "female" (case-insensitive), else 0.
# Missing values are treated as "no match" (na=False) and therefore become 0.
has_female = final_df['borrower_genders'].str.contains('female', case=False, na=False)
final_df['gender_dummy'] = has_female.astype(int)

# Remove rows lacking HDI, latitude, or longitude, then discard the listed columns
unused_columns = [
    'activity_name', 'sector_name', 'loan_use', 'currency_policy',
    'currency_exchange_coverage_rate', 'currency',
    'num_journal_entries', 'num_bulk_entries',
    'distribution_model', 'region',
]
final_df = (
    final_df
    .dropna(subset=['hdi', 'latitude', 'longitude'])
    .drop(columns=unused_columns)
)

In [10]:
# Funding time in days: difference between 'raised_time' and 'posted_time',
# converted from seconds to days
posted_at = pd.to_datetime(final_df['posted_time'])
raised_at = pd.to_datetime(final_df['raised_time'])
final_df['funding_time_days'] = (raised_at - posted_at).dt.total_seconds() / (60 * 60 * 24)

# Keep strictly positive durations so the logarithm is defined. The explicit .copy()
# makes the filtered frame independent of its parent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
final_df = final_df[final_df['funding_time_days'] > 0].copy()
final_df['log_funding_time'] = np.log(final_df['funding_time_days'])

# Create sector fixed effects (one-hot encoding); drop_first omits one category
# to serve as the baseline
sector_dummies = pd.get_dummies(final_df['sector'], prefix='sector', drop_first=True)
final_df = pd.concat([final_df, sector_dummies], axis=1)

# Convert 'borrower_pictured' to numeric
# NOTE(review): errors='coerce' turns every value that cannot be parsed as a number
# into NaN. If this column holds text such as "true"/"false", all entries would become
# NaN — confirm the column's contents before relying on it.
final_df['borrower_pictured'] = pd.to_numeric(final_df['borrower_pictured'], errors='coerce')

# Aggregate the number of loans per country
loan_counts = final_df.groupby('country_code').size().reset_index(name='loan_count')

In [18]:
# Cast every column whose name starts with 'sector_' to int
sector_columns = [name for name in final_df.columns if name.startswith('sector_')]
final_df = final_df.astype({name: int for name in sector_columns})

In [24]:
# Per-column overview: dtype, number of distinct values, and how many entries
# are present versus missing
present_per_column = final_df.notna().sum()
missing_per_column = final_df.isna().sum()

summary_stats = pd.DataFrame({
    'Variable Name': final_df.columns,
    'Data Type': final_df.dtypes.to_numpy(),
    'Unique Values': final_df.nunique().to_numpy(),
    'Non-Null Count': present_per_column.to_numpy(),
    'Missing Values': missing_per_column.to_numpy(),
})

# Render the table as the cell output
summary_stats

Unnamed: 0,Variable Name,Data Type,Unique Values,Non-Null Count,Missing Values
0,loan_id,int64,550640,550640,0
1,loan_name,object,148221,550426,214
2,original_language,object,5,550640,0
3,description,object,550446,550637,3
4,description_translated,object,543417,543583,7057
5,funded_amount,float64,461,550640,0
6,loan_amount,float64,461,550640,0
7,status,object,1,550640,0
8,country_code,object,77,550640,0
9,country_name,object,77,550640,0


In [22]:
# Save the cleaned data
save_dir = data_path

# exist_ok=True creates the directory when needed and is a no-op when it already
# exists, avoiding the separate exists-then-create check.
os.makedirs(save_dir, exist_ok=True)

# Build the output path once and reuse it for both the write and the message
output_path = os.path.join(save_dir, "data_df.csv")
final_df.to_csv(output_path, index=False)

print(f"Dataset saved to: {output_path}")

Dataset saved to: /Users/jakubriha/Desktop/ECO225/Project/Data/data_df.csv
