In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

In [4]:
# File paths
# The data directory defaults to the original local folder. Set the
# ECO225_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_path = os.path.join(
    os.environ.get("ECO225_DATA_DIR", "/Users/jakubriha/Desktop/ECO225/Project/Data/"),
    "",  # joining with "" guarantees a trailing separator, so `data_path + name` still works
)
loans_file = os.path.join(data_path, "loans.csv")
kiva_loans_file = os.path.join(data_path, "kiva_loans.csv")
country_stats_file = os.path.join(data_path, "country_stats.csv")


In [6]:
# Read the two raw loan tables from disk.
df1 = pd.read_csv(loans_file)
df2 = pd.read_csv(kiva_loans_file)

# Rename the second table's `id` column so both tables share the key `loan_id`.
df2 = df2.rename(columns={'id': 'loan_id'})

# Inner join: keep only loans present in both tables. Columns that exist in
# both inputs receive pandas' default suffixes, `_x` (df1) and `_y` (df2).
merged_df = pd.merge(df1, df2, on='loan_id', how='inner')
print(f"Merged dataset size: {len(merged_df)}")

Merged dataset size: 671204


In [7]:
# Keep only rows where every key field (the `_x` versions) is present.
required_columns = ['funded_amount_x', 'loan_amount_x', 'country_code_x', 'borrower_genders_x']
filtered_df = merged_df.loc[merged_df[required_columns].notna().all(axis=1)].copy()
print(f"Filtered dataset size: {len(filtered_df)}")

# For each column that came from both tables, report whether the `_x` and `_y`
# versions match. Series.equals is all-or-nothing: a single differing value
# (or a dtype mismatch) makes the whole comparison False.
columns_to_check = [
    'funded_amount', 'loan_amount', 'country_code', 'currency', 'partner_id',
    'posted_time', 'tags', 'borrower_genders', 'repayment_interval',
]
for col in columns_to_check:
    print(f"{col}: {filtered_df[col + '_x'].equals(filtered_df[col + '_y'])}")

Filtered dataset size: 654651
funded_amount: False
loan_amount: False
country_code: False
currency: False
partner_id: False
posted_time: False
tags: False
borrower_genders: False
repayment_interval: False


In [10]:
# Keep only loans on which both sources agree for the categorical fields.
# Element-wise equality is False when either side is NaN, so rows with a
# missing value in any of these columns are removed as well.
filtered_df = filtered_df[
    filtered_df['country_code_x'].eq(filtered_df['country_code_y'])
    & filtered_df['currency_x'].eq(filtered_df['currency_y'])
    & filtered_df['borrower_genders_x'].eq(filtered_df['borrower_genders_y'])
    & filtered_df['repayment_interval_x'].eq(filtered_df['repayment_interval_y'])
]

# For every duplicated column keep the `_x` version under its plain name and
# discard the `_y` version, then remove the columns that are not used further.
rename_cols = {f'{col}_x': col for col in columns_to_check}
filtered_df = (
    filtered_df
    .drop(columns=[f'{col}_y' for col in columns_to_check])
    .rename(columns=rename_cols)
    .drop(columns=['partner_id', 'tags', 'num_lenders_total', 'country'])
)

In [12]:
# Share of the requested amount that was funded; keep loans at or below 100%.
filtered_df['funded_percentage'] = filtered_df['funded_amount'].div(filtered_df['loan_amount'])
filtered_df = filtered_df.loc[filtered_df['funded_percentage'].le(1)]

# Attach country-level statistics. A left join keeps every loan, so loans whose
# country has no match (or a missing statistic) show up as NaN in the count below.
df_country = pd.read_csv(country_stats_file, usecols=['country_code', 'gni', 'population', 'hdi'])
final_df = pd.merge(filtered_df, df_country, on='country_code', how='left')
print(final_df[['gni', 'population', 'hdi']].isna().sum())

# gender_dummy is 1 when `borrower_genders` contains "female" anywhere
# (case-insensitive) and 0 otherwise, including when the value is missing.
final_df['gender_dummy'] = (
    final_df['borrower_genders']
    .str.contains('female', case=False, na=False)
    .astype(int)
)

gni           17039
population       19
hdi             159
dtype: int64


In [14]:
# Display the column names of the merged frame.
final_df.columns

Index(['loan_id', 'loan_name', 'original_language', 'description',
       'description_translated', 'funded_amount', 'loan_amount', 'status',
       'activity_name', 'sector_name', 'loan_use', 'country_code',
       'country_name', 'town_name', 'currency_policy',
       'currency_exchange_coverage_rate', 'currency', 'posted_time',
       'planned_expiration_time', 'disburse_time', 'raised_time',
       'lender_term', 'num_journal_entries', 'num_bulk_entries',
       'borrower_genders', 'borrower_pictured', 'repayment_interval',
       'distribution_model', 'activity', 'sector', 'use', 'region',
       'disbursed_time', 'funded_time', 'term_in_months', 'lender_count',
       'date', 'funded_percentage', 'population', 'hdi', 'gni',
       'gender_dummy'],
      dtype='object')

In [None]:
# Remove loans with no HDI value, then drop 'activity', 'sector' and 'use'
# (presumably duplicates of activity_name / sector_name / loan_use - not verified here).
final_df = (
    final_df
    .dropna(subset=['hdi'])
    .drop(columns=['activity', 'sector', 'use'])
)

In [26]:
# Drop additional columns: currency details, journal/bulk entry counts,
# distribution model and region.
final_df = final_df.drop(
    labels=[
        'currency_policy', 'currency_exchange_coverage_rate', 'currency',
        'num_journal_entries', 'num_bulk_entries', 'distribution_model', 'region',
    ],
    axis='columns',
)

# Show the remaining columns
final_df.columns

Index(['loan_id', 'loan_name', 'original_language', 'description',
       'description_translated', 'funded_amount', 'loan_amount', 'status',
       'activity_name', 'sector_name', 'loan_use', 'country_code',
       'country_name', 'town_name', 'posted_time', 'planned_expiration_time',
       'disburse_time', 'raised_time', 'lender_term', 'borrower_genders',
       'borrower_pictured', 'repayment_interval', 'disbursed_time',
       'funded_time', 'term_in_months', 'lender_count', 'date',
       'funded_percentage', 'population', 'hdi', 'gni', 'gender_dummy'],
      dtype='object')

In [28]:
# Parse both timestamps, then express the gap between posting and being fully
# raised as a fractional number of days (86,400 seconds per day).
for time_col in ('posted_time', 'raised_time'):
    final_df[time_col] = pd.to_datetime(final_df[time_col])

elapsed = final_df['raised_time'] - final_df['posted_time']
final_df['funding_time_days'] = elapsed.dt.total_seconds() / 86_400

In [30]:
# Keep only strictly positive funding times so the logarithm is defined.
# Rows whose funding_time_days is NaN also fail the `> 0` comparison and are dropped.
# `.copy()` makes the filtered frame independent of the original, so the column
# assignment below does not write to a slice (avoids SettingWithCopyWarning).
final_df = final_df[final_df['funding_time_days'] > 0].copy()

# Natural log of the funding time in days
final_df['log_funding_time'] = np.log(final_df['funding_time_days'])

In [34]:
# Sector fixed effects: one indicator column per sector, prefixed "sector_".
# drop_first=True omits the first category, which becomes the baseline.
sector_dummies = pd.get_dummies(data=final_df['sector_name'], prefix='sector', drop_first=True)

# Append the indicator columns to the right of the existing columns
final_df = pd.concat(objs=[final_df, sector_dummies], axis='columns')

   loan_id       loan_name original_language  \
0   657307            Aivy           English   
1   657259  Idalia Marizza           Spanish   
2   658010           Aasia           English   
3   659347         Gulmira           Russian   
4   656933         Ricky\t           English   

                                         description  \
0  Aivy, 21 years of age, is single and lives in ...   
1  Doña Idalia, esta casada, tiene 57 años de eda...   
2  Aasia is a 45-year-old married lady and she ha...   
3  Гулмире 36 лет, замужем, вместе с супругом вос...   
4  Ricky is a farmer who currently cultivates his...   

                              description_translated  funded_amount  \
0                                                NaN          125.0   
1  Idalia, 57, is married and lives with her husb...          400.0   
2                                                NaN          400.0   
3  Gulmira is 36 years old and married.  She and ...          625.0   
4                  

In [46]:
# Take an independent copy of the cleaned frame under the name data_df
# (final_df itself is left untouched).
data_df = final_df.copy()

# Directory the cleaned dataset is written to
save_dir = "/Users/jakubriha/Desktop/ECO225/Project/Data"

# Create the directory (and any missing parents) if needed. exist_ok=True makes
# this a no-op when it already exists and avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

# Write the dataset as CSV without the index column
save_path = os.path.join(save_dir, "data_df.csv")
data_df.to_csv(save_path, index=False)

print(f"Dataset saved to: {save_path}")


Dataset saved to: /Users/jakubriha/Desktop/ECO225/Project/Data/data_df.csv
