# Computer Science Project

In [1]:
import pandas as pd
import numpy as np

#### 1. Normalize the *loan_lenders* table. In the normalized table, each row must have one *loan_id* and one *lender*.

In [2]:
loans_lenders = pd.read_csv("Datasets/loans_lenders.csv")

Normalizzare la tabella significa che ogni cella deve contenere una sola informazione. In questo caso la colonna *lender* possiede diversi nomi, quindi è necessario duplicare il valore di *loan_id* tante volte quante sono i lender.

In [3]:
loans_lenders

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."
...,...,...
1387427,678999,"michael43411218, carol5987, gooddogg1, chris41..."
1387428,1207353,"rjhoward1986, jeffrey6870, trolltech4460, elys..."
1387429,1206220,"vicky7746, gooddogg1, fairspirit, craig9729960..."
1387430,1206425,"rich6705, sergiiy9766, angela7509, barbara5610..."


In [4]:
loans_lenders["lenders"] = loans_lenders.lenders.apply(lambda x: x.split(","))

In [5]:
loans_lenders = loans_lenders.explode("lenders").reset_index(drop = True)

In [6]:
loans_lenders

Unnamed: 0,loan_id,lenders
0,483693,muc888
1,483693,sam4326
2,483693,camaran3922
3,483693,lachheb1865
4,483693,rebecca3499
...,...,...
28293926,1206425,trogdorfamily7622
28293927,1206425,danny6470
28293928,1206425,don6118
28293929,1206486,alan5175


#### 2. For each loan, add a column *duration* corresponding to the number of days between the *disburse time* and the *planned expiration time*. If any of those two dates is missing, also the *duration* must be missing.

In [7]:
loans = pd.read_csv("Datasets/loans.csv")

In [8]:
loans["disburse_time"] = pd.to_datetime(loans["disburse_time"])
loans["planned_expiration_time"] = pd.to_datetime(loans["planned_expiration_time"])

In [9]:
loans["duration"] = (loans["planned_expiration_time"] - loans["disburse_time"]).dt.days

In [10]:
loans["duration"]

0          53.0
1          96.0
2          37.0
3          34.0
4          57.0
           ... 
1419602    39.0
1419603    39.0
1419604    51.0
1419605    63.0
1419606    61.0
Name: duration, Length: 1419607, dtype: float64

#### 3. Find the lenders that have funded at least twice.

In [11]:
lenders = pd.read_csv("Datasets/lenders.csv")

In [12]:
lenders.describe()

Unnamed: 0,member_since,loan_purchase_num,num_invited
count,2349174.0,1454893.0,2349174.0
mean,1352954000.0,17.23265,0.2253443
std,96436170.0,185.3678,16.91738
min,1136106000.0,0.0,0.0
25%,1268534000.0,1.0,0.0
50%,1364078000.0,2.0,0.0
75%,1429845000.0,9.0,0.0
max,1515617000.0,85190.0,24854.0


In [13]:
lenders.loc[lenders["loan_purchase_num"] >= 2]

Unnamed: 0,permanent_name,display_name,city,state,country_code,member_since,occupation,loan_because,loan_purchase_num,invited_by,num_invited
1,reena6733,Reena,,,,1461300634,,,9.0,,0
3,andrew86079135,Andrew,,,,1461301091,,,5.0,Peter Tan,0
6,rene7585,Rene,,,,1461301636,,,2.0,,0
7,harald2826,Harald,,,,1461301670,,,2.0,,0
11,jennifer4328,Jennifer,,,,1461302712,,,3.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
2349158,rakhi,Rakhi,New York,New York,US,1342100607,Student,I care.,4.0,,0
2349162,james75291930,James,,,,1342096854,,,6.0,,0
2349164,carol8279,Carol,,,,1342099416,,,5.0,,0
2349166,eric91401545,Eric,,,,1342100719,,,2.0,,0


#### 4. For each country, compute how many loans have involved that country as borrowers.

In [14]:
loans.groupby("country_name").count()["loan_id"]

country_name
Afghanistan        2337
Albania            3075
Armenia           13952
Azerbaijan        10172
Belize              218
                  ...  
Vietnam           21839
Virgin Islands        2
Yemen              4206
Zambia             1277
Zimbabwe           5513
Name: loan_id, Length: 96, dtype: int64

#### 5. For each country, compute the overall amount of money borrowed.

In [15]:
loans.groupby("country_name")["loan_amount"].sum()

country_name
Afghanistan        1967950.0
Albania            4307350.0
Armenia           22950475.0
Azerbaijan        14784625.0
Belize              150175.0
                     ...    
Vietnam           24681100.0
Virgin Islands       10000.0
Yemen              3444000.0
Zambia             1978975.0
Zimbabwe           5851875.0
Name: loan_amount, Length: 96, dtype: float64

#### 6. Like the previous point, but expressed as a percentage of the overall amount lent.

In [16]:
tot_loan_amount = loans["loan_amount"].sum()
tot_loan_amount

1181437300.0

In [42]:
d = loans.groupby("country_name")["loan_amount"].sum() / tot_loan_amount * 100
d

country_name
Afghanistan       0.166573
Albania           0.364586
Armenia           1.942589
Azerbaijan        1.251410
Belize            0.012711
                    ...   
Vietnam           2.089074
Virgin Islands    0.000846
Yemen             0.291509
Zambia            0.167506
Zimbabwe          0.495318
Name: loan_amount, Length: 96, dtype: float64

In [43]:
d.sum()

100.0

#### 7. Like the three previous points, but split for each year (with respect to *disburse time*).

In [18]:
loans["year"] = loans["disburse_time"].dt.year

In [19]:
loan_country = pd.DataFrame(loans.groupby(["year","country_name"])["loan_amount"].sum())

In [20]:
amount_per_year = pd.DataFrame(loans.groupby("year")["loan_amount"].sum())

In [21]:
res = pd.merge(loan_country, amount_per_year, on = "year", suffixes = ("", "_tot_year")).reset_index()

In [22]:
res["perc_per_year"] = res["loan_amount"]/res["loan_amount_tot_year"] * 100

In [23]:
res

Unnamed: 0,year,loan_amount,loan_amount_tot_year,perc_per_year
0,2005.0,1850.0,102850.0,1.798736
1,2005.0,1625.0,102850.0,1.579971
2,2005.0,5000.0,102850.0,4.861449
3,2005.0,5000.0,102850.0,4.861449
4,2005.0,33300.0,102850.0,32.377248
...,...,...,...,...
743,2018.0,14300.0,990025.0,1.444408
744,2018.0,26500.0,990025.0,2.676700
745,2018.0,21750.0,990025.0,2.196914
746,2018.0,8000.0,990025.0,0.808060


In [44]:
res.groupby("year")["perc_per_year"].sum()

year
2005.0    100.0
2006.0    100.0
2007.0    100.0
2008.0    100.0
2009.0    100.0
2010.0    100.0
2011.0    100.0
2012.0    100.0
2013.0    100.0
2014.0    100.0
2015.0    100.0
2016.0    100.0
2017.0    100.0
2018.0    100.0
Name: perc_per_year, dtype: float64

#### 8. For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.

In [45]:
a = pd.merge(loans, loans_lenders, on = "loan_id")
a = a[["loan_id", "loan_amount", "lenders"]]

Unnamed: 0,loan_id,loan_name,original_language,description,description_translated,funded_amount,loan_amount,status,activity_name,sector_name,...,num_bulk_entries,tags,borrower_genders,borrower_pictured,repayment_interval,distribution_model,duration,year,percentage,lenders
0,657307,Aivy,English,"Aivy, 21 years of age, is single and lives in ...",,125.0,125.0,funded,General Store,Retail,...,1,,female,true,irregular,field_partner,53.0,2013.0,,spencer5657
1,657307,Aivy,English,"Aivy, 21 years of age, is single and lives in ...",,125.0,125.0,funded,General Store,Retail,...,1,,female,true,irregular,field_partner,53.0,2013.0,,matthew8640
2,657307,Aivy,English,"Aivy, 21 years of age, is single and lives in ...",,125.0,125.0,funded,General Store,Retail,...,1,,female,true,irregular,field_partner,53.0,2013.0,,larry71496105
3,657259,Idalia Marizza,Spanish,"Doña Idalia, esta casada, tiene 57 años de eda...","Idalia, 57, is married and lives with her husb...",400.0,400.0,funded,Used Clothing,Clothing,...,1,,female,true,monthly,field_partner,96.0,2013.0,,ltr
4,657259,Idalia Marizza,Spanish,"Doña Idalia, esta casada, tiene 57 años de eda...","Idalia, 57, is married and lives with her husb...",400.0,400.0,funded,Used Clothing,Clothing,...,1,,female,true,monthly,field_partner,96.0,2013.0,,andrew5306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28293907,989240,Lydia,French,Lydia a 37ans et habite dans une zone rurale. ...,Lydia is 37 years old and lives in a rural are...,175.0,175.0,funded,Sewing,Services,...,1,,female,true,monthly,field_partner,61.0,2015.0,,jensdamsgaardvanar
28293908,989240,Lydia,French,Lydia a 37ans et habite dans une zone rurale. ...,Lydia is 37 years old and lives in a rural are...,175.0,175.0,funded,Sewing,Services,...,1,,female,true,monthly,field_partner,61.0,2015.0,,david47598776
28293909,989240,Lydia,French,Lydia a 37ans et habite dans une zone rurale. ...,Lydia is 37 years old and lives in a rural are...,175.0,175.0,funded,Sewing,Services,...,1,,female,true,monthly,field_partner,61.0,2015.0,,bernd9221
28293910,989240,Lydia,French,Lydia a 37ans et habite dans une zone rurale. ...,Lydia is 37 years old and lives in a rural are...,175.0,175.0,funded,Sewing,Services,...,1,,female,true,monthly,field_partner,61.0,2015.0,,valeria98599473


In [49]:
b = a.groupby("loan_id").count()["lenders"]

In [55]:
c = pd.DataFrame(b).rename(columns = {"lenders":"num"}).reset_index()

In [56]:
d = pd.merge(a, c, on = "loan_id") 

In [58]:
d[["loan_id", "num", "loan_amount","lenders"]]

Unnamed: 0,loan_id,num,loan_amount,lenders
0,657307,3,125.0,spencer5657
1,657307,3,125.0,matthew8640
2,657307,3,125.0,larry71496105
3,657259,7,400.0,ltr
4,657259,7,400.0,andrew5306
...,...,...,...,...
28293907,989240,5,175.0,jensdamsgaardvanar
28293908,989240,5,175.0,david47598776
28293909,989240,5,175.0,bernd9221
28293910,989240,5,175.0,valeria98599473


In [59]:
d["true_amount"] = d["loan_amount"]/d["num"]

In [60]:
d.groupby("lenders")["true_amount"].sum()

lenders
 000               1672.618411
 00000             1380.693644
 0002              2472.563566
 00mike00            52.631579
 0101craign0101    2623.565117
                      ...     
zzanita              87.500000
zzcyna7269           55.357143
zzinnia              38.000000
zzmcfate           2287.291955
zzrvmf8538          106.417625
Name: true_amount, Length: 1639026, dtype: float64

#### 9. For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed. Since the country of the lender is often unknown, you can assume that the true distribution among the countries is the same as the one computed from the rows where the country is known.

Per prima cosa ottengo la distribuzione dei paesi presenti all'interno del dataset lenders.

In [25]:
# Conto le presenze di ogni paese
d1 = pd.DataFrame(lenders.groupby("country_code")["country_code"].count())

In [26]:
d2 = d1.rename(columns = {"country_code": "num"}).reset_index()

In [27]:
# Calcolo la percentuale di presenza di ogni paese
prob = d2["num"] / d2["num"].sum()

In [28]:
# Sostituisco i valori mancanti con valori distribuiti secondo le proporzioni di probabilità ottenute alla riga precedente
lenders["country_code"].fillna(pd.Series(np.random.choice(d2["country_code"], 
                                                      p=prob, size=len(lenders))), inplace = True)

In [29]:
lenders["country_code"]

0          US
1          US
2          US
3          US
4          US
           ..
2349169    US
2349170    US
2349171    US
2349172    US
2349173    US
Name: country_code, Length: 2349174, dtype: object

Adesso calcolo il totale di soldi prestati

In [30]:
df1 = pd.merge(loans_lenders, loans, on = "loan_id")

In [31]:
df2 = pd.merge(df1, lenders, left_on = "lenders", right_on = "permanent_name", suffixes = ("_loaners","_lenders"))

In [32]:
money_borrowed = pd.DataFrame(df2.groupby("country_code_loaners")["loan_amount"].sum()).reset_index()

In [33]:
money_lent = pd.DataFrame(df2.groupby("country_code_lenders")["loan_amount"].sum()).reset_index()

In [34]:
money = pd.merge(money_borrowed, money_lent, left_on = "country_code_loaners", right_on = "country_code_lenders", suffixes = ("_loaners","_lenders"))
money["difference"] = money["loan_amount_loaners"] - money["loan_amount_lenders"]
money = money.rename(columns = {"country_code_loaners":"country_code"}).drop("country_code_lenders", axis = 1)
money

Unnamed: 0,country_code,loan_amount_loaners,loan_amount_lenders,difference
0,AF,1966925.0,96750.0,1870175.0
1,AL,4283750.0,30775.0,4252975.0
2,AM,22901100.0,35050.0,22866050.0
3,AZ,14749025.0,4650.0,14744375.0
4,BA,476275.0,94225.0,382050.0
...,...,...,...,...
86,XK,3083025.0,8775.0,3074250.0
87,YE,3440250.0,32200.0,3408050.0
88,ZA,1006525.0,683725.0,322800.0
89,ZM,1975075.0,18425.0,1956650.0


Valori positivi indicano che il paese ha ricevuto più di quanto ha prestato, viceversa significa che il paese ha prestato più soldi di quanto ha ricevuto

#### 10. Which country has the highest ratio between the difference computed at the previous point and the population?

In [35]:
country_stats = pd.read_csv("Datasets/country_stats.csv")

In [36]:
country = pd.merge(country_stats, money, on = "country_code")
country["ratio"] = country["difference"]/country["population"]
m = country["ratio"].idxmax()
country.loc[m]

country_name                           Samoa
country_code                              WS
country_code3                            WSM
continent                            Oceania
region                             Polynesia
population                            196440
population_below_poverty_line            NaN
hdi                                    0.702
life_expectancy                          NaN
expected_years_of_schooling              NaN
mean_years_of_schooling                  NaN
gni                                      NaN
kiva_country_name                      Samoa
loan_amount_loaners              1.21832e+07
loan_amount_lenders                    14675
difference                       1.21685e+07
ratio                                61.9453
Name: 84, dtype: object

#### 11. Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?

In [37]:
country["ratio2"] = country["difference"]/country["population_below_poverty_line"]
n = country["ratio2"].idxmax()
country.loc[n]

country_name                            Philippines
country_code                                     PH
country_code3                                   PHL
continent                                      Asia
region                           South-eastern Asia
population                                104918090
population_below_poverty_line                  21.6
hdi                                        0.681799
life_expectancy                               68.34
expected_years_of_schooling                  11.733
mean_years_of_schooling                     9.32687
gni                                         8395.09
kiva_country_name                       Philippines
loan_amount_loaners                     9.06737e+07
loan_amount_lenders                     1.47018e+06
difference                              8.92035e+07
ratio                                      0.850221
ratio2                                  4.12979e+06
Name: 8, dtype: object

#### 12. For each year, compute the total amount of loans. Each loan that has planned expiration time and disburse time in different years must have its amount distributed proportionally to the number of days in each year. For example, a loan with disburse time December 1st, 2016, planned expiration time January 30th 2018, and amount 5000USD has an amount of 5000USD * 31 / (31+365+30) = 363.85 for 2016, 5000USD * 365 / (31+365+30) = 4284.04 for 2017, and 5000USD * 30 / (31+365+30) = 352.11 for 2018.

In [128]:
import datetime

In [81]:
df = loans[["loan_id","loan_amount", "disburse_time", "planned_expiration_time"]]

In [218]:
same_year = df[df["disburse_time"].dt.year == df["planned_expiration_time"].dt.year]
diff_year = df[df["disburse_time"].dt.year != df["planned_expiration_time"].dt.year]

Calcoliamo il risultato per same_year

In [181]:
same_year["year"] = same_year["disburse_time"].dt.year
same = pd.DataFrame(same_year.groupby("year")["loan_amount"].sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Puliamo il dataset da eventuali valori nulli e date insensate

In [219]:
diff_year = diff_year.loc[(diff_year["planned_expiration_time"].isnull() == False) | (diff_year["disburse_time"].isnull() == False)]
diff_year = diff_year.loc[diff_year["planned_expiration_time"] > diff_year["disburse_time"]]

Adesso ripetiamo le righe per numero di anni passati

In [220]:
diff_year = diff_year.loc[diff_year.index.repeat(diff_year["planned_expiration_time"].dt.year - diff_year["disburse_time"].dt.year + 1)]

Creiamo due nuove colonne, una con la fine dell'anno e l'altra con l'inizio dell'anno

In [184]:
#diff_year["start"] = diff_year["planned_expiration_time"].dt.year.apply(lambda x: datetime.datetime(x, 1, 1))
#diff_year["end"] = diff_year["disburse_time"].dt.year.apply(lambda x: datetime.datetime(x, 12, 31))

In [227]:
diff_year["count"] = diff_year.groupby("loan_id").cumcount()
diff_year["count_year"] = pd.to_datetime(diff_year.groupby("loan_id").cumcount() + diff_year["disburse_time"].dt.year, format = "%Y")

In [222]:
diff_year["end_year"] = diff_year["disburse_time"].dt.year.apply(lambda x: datetime.datetime(x, 12, 31))

In [223]:
diff_year["planned_expiration_time"] = diff_year["planned_expiration_time"].dt.tz_localize(None)
diff_year["disburse_time"] = diff_year["disburse_time"].dt.tz_localize(None)

In [224]:
diff_year = diff_year.reset_index()

In [225]:
diff_year.loc[diff_year["count_year"].dt.year == diff_year["disburse_time"].dt.year, "useful_days"] = (diff_year["end_year"] - diff_year["disburse_time"]).dt.days
diff_year.loc[diff_year["count_year"].dt.year != diff_year["disburse_time"].dt.year, "useful_days"] = (diff_year["planned_expiration_time"] - diff_year["count_year"]).dt.days

In [238]:
diff_year.loc[diff_year["count"] >= 2, "useful_days"] = diff_year["count_year"].apply(lambda x: pd.Period(x, freq = "D").dayofyear)

In [242]:
diff_year.loc[diff_year["useful_days"] > 300]

Unnamed: 0,index,loan_id,loan_amount,disburse_time,planned_expiration_time,count,count_year,end_year,useful_days
6765,33780,1077942,650.0,2011-12-22 16:49:11,2016-06-29 01:46:30,1,2012-01-01,2011-12-31,1641.0
6771,33781,1078000,525.0,2012-05-07 23:55:05,2016-06-29 01:47:03,1,2013-01-01,2012-12-31,1275.0
6780,33859,1078227,600.0,2012-09-05 15:56:45,2016-06-29 01:48:52,1,2013-01-01,2012-12-31,1275.0
6785,33923,1078038,250.0,2012-06-28 22:31:29,2016-06-29 01:47:21,1,2013-01-01,2012-12-31,1275.0
6794,34072,1077950,5000.0,2011-12-21 17:55:42,2016-06-29 01:46:35,1,2012-01-01,2011-12-31,1641.0
...,...,...,...,...,...,...,...,...,...
297015,1376686,1078247,100.0,2012-10-02 17:05:42,2016-06-29 01:49:03,1,2013-01-01,2012-12-31,1275.0
299762,1397645,1078246,100.0,2012-10-12 18:08:56,2016-06-29 01:49:02,1,2013-01-01,2012-12-31,1275.0
299793,1397865,1078133,50.0,2012-06-28 20:57:02,2016-06-29 01:48:07,1,2013-01-01,2012-12-31,1275.0
299832,1398082,1078002,525.0,2012-05-07 23:54:15,2016-06-29 01:47:04,1,2013-01-01,2012-12-31,1275.0


In [163]:
conto = pd.DataFrame(diff_year.groupby("loan_id").count()["start"]).rename(columns = {"start":"num"})
diff_year = pd.merge(diff_year, conto, on = "loan_id")

In [164]:
diff_year["year_complete"] = diff_year["num"] - 2

Dato che la timezone nei punti successivi potrebbe creare problemi la eliminiamo

In [169]:
diff_year["planned_expiration_time"] = diff_year["planned_expiration_time"].dt.tz_localize(None)
diff_year["disburse_time"] = diff_year["disburse_time"].dt.tz_localize(None)

In [170]:
diff_year["day_until_end_year"] = (diff_year["end"] - diff_year["disburse_time"]).dt.days
diff_year["day_since_start_year"] = (diff_year["planned_expiration_time"] - diff_year["start"]).dt.days

In [179]:
diff_year

Unnamed: 0,loan_id,loan_amount,disburse_time,planned_expiration_time,start,end,num,year_complete,day_until_end_year,day_since_start_year
0,657307,125.0,2013-12-22 08:00:00,2014-02-14 03:30:06,2014-01-01,2013-12-31,2,0,8,44
1,657307,125.0,2013-12-22 08:00:00,2014-02-14 03:30:06,2014-01-01,2013-12-31,2,0,8,44
2,657259,400.0,2013-12-20 08:00:00,2014-03-26 22:25:07,2014-01-01,2013-12-31,2,0,10,84
3,657259,400.0,2013-12-20 08:00:00,2014-03-26 22:25:07,2014-01-01,2013-12-31,2,0,10,84
4,656933,425.0,2013-12-17 08:00:00,2014-02-13 06:10:02,2014-01-01,2013-12-31,2,0,13,43
...,...,...,...,...,...,...,...,...,...,...
302198,989109,2425.0,2015-11-13 08:00:00,2016-01-03 22:20:04,2016-01-01,2015-12-31,2,0,47,2
302199,989143,100.0,2015-11-03 08:00:00,2016-01-05 08:50:02,2016-01-01,2015-12-31,2,0,57,4
302200,989143,100.0,2015-11-03 08:00:00,2016-01-05 08:50:02,2016-01-01,2015-12-31,2,0,57,4
302201,989240,175.0,2015-11-03 08:00:00,2016-01-03 20:50:06,2016-01-01,2015-12-31,2,0,57,2
