# Computer Science Project

In [1]:
import pandas as pd
import numpy as np

#### 1. Normalize the *loan_lenders* table. In the normalized table, each row must have one *loan_id* and one *lender*.

In [2]:
loans_lenders = pd.read_csv("Datasets/loans_lenders.csv", nrows = 50000)

Normalizzare la tabella significa che ogni cella deve contenere una sola informazione. In questo caso la colonna *lender* possiede diversi nomi, quindi è necessario duplicare il valore di *loan_id* tante volte quante sono i lender.

In [8]:
loans_lenders

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."
...,...,...
995,497205,"nick1135, nick1135, marcusoh, scott57469017, c..."
996,590982,"jonathan7845, micah4814, ilia4401, angie6388, ..."
997,369988,"jeroen6107, sibylla8116, nicholas6826, jim5285..."
998,567092,"miikka2096, haleyconner, stephanie8503, obay16..."


In [3]:
loans_lenders["lenders"] = loans_lenders.lenders.apply(lambda x: x.split(","))

In [4]:
loans_lenders = loans_lenders.explode("lenders").reset_index(drop = True)

In [71]:
loans_lenders

Unnamed: 0,loan_id,lenders
0,483693,muc888
1,483693,sam4326
2,483693,camaran3922
3,483693,lachheb1865
4,483693,rebecca3499
...,...,...
31296,495013,joe5749
31297,495013,john9034
31298,495013,byron5129
31299,495013,staceyandgirls6384


#### 2. For each loan, add a column *duration* corresponding to the number of days between the *disburse time* and the *planned expiration time*. If any of those two dates is missing, also the *duration* must be missing.

In [93]:
loans = pd.read_csv("Datasets/loans.csv", nrows = 50000)

In [28]:
loans["disburse_time"] = pd.to_datetime(loans["disburse_time"])
loans["planned_expiration_time"] = pd.to_datetime(loans["planned_expiration_time"])

In [29]:
loans["duration"] = (loans["planned_expiration_time"] - loans["disburse_time"]).dt.days

#### 3. Find the lenders that have funded at least twice.

In [30]:
lenders = pd.read_csv("Datasets/lenders.csv", nrows = 50000)

In [103]:
lenders.describe()

Unnamed: 0,member_since,loan_purchase_num,num_invited
count,1000.0,909.0,1000.0
mean,1461311000.0,2.133113,0.018
std,8115.486,1.743109,0.147301
min,1461300000.0,0.0,0.0
25%,1461306000.0,1.0,0.0
50%,1461310000.0,2.0,0.0
75%,1461315000.0,3.0,0.0
max,1461346000.0,23.0,2.0


In [31]:
lenders.loc[lenders["loan_purchase_num"] >= 2]

Unnamed: 0,permanent_name,display_name,city,state,country_code,member_since,occupation,loan_because,loan_purchase_num,invited_by,num_invited
1,reena6733,Reena,,,,1461300634,,,9.0,,0
3,andrew86079135,Andrew,,,,1461301091,,,5.0,Peter Tan,0
6,rene7585,Rene,,,,1461301636,,,2.0,,0
7,harald2826,Harald,,,,1461301670,,,2.0,,0
11,jennifer4328,Jennifer,,,,1461302712,,,3.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
43264,pastordeborah9746,Pastor Deborah,,,,1366930818,,,2.0,,0
46977,chantal2707,Chantal,,,NL,1464546665,,,3.0,Keli Yen,1
47343,sandra9549,Sandra,,,,1446734718,,,2.0,,0
47813,greg7805,Greg,,,,1446738704,,,2.0,,0


#### 4. For each country, compute how many loans have involved that country as borrowers.

In [32]:
loans.groupby("country_name").count()["loan_id"]

country_name
Afghanistan       33
Albania          136
Armenia          527
Azerbaijan       292
Belize             8
                ... 
United States    241
Vietnam          783
Yemen            182
Zambia            42
Zimbabwe         179
Name: loan_id, Length: 83, dtype: int64

#### 5. For each country, compute the overall amount of money borrowed.

In [33]:
loans.groupby("country_name")["loan_amount"].sum()

country_name
Afghanistan        28025.0
Albania           182550.0
Armenia           887650.0
Azerbaijan        506450.0
Belize              6875.0
                   ...    
United States    1266675.0
Vietnam           943275.0
Yemen             170750.0
Zambia             70650.0
Zimbabwe          177775.0
Name: loan_amount, Length: 83, dtype: float64

#### 6. Like the previous point, but expressed as a percentage of the overall amount lent.

In [34]:
tot_loan_amount = loans["loan_amount"].sum()
tot_loan_amount

41939100.0

In [35]:
loans.groupby("country_name")["loan_amount"].sum() / tot_loan_amount * 100

country_name
Afghanistan      0.066823
Albania          0.435274
Armenia          2.116521
Azerbaijan       1.207584
Belize           0.016393
                   ...   
United States    3.020272
Vietnam          2.249154
Yemen            0.407138
Zambia           0.168459
Zimbabwe         0.423888
Name: loan_amount, Length: 83, dtype: float64

#### 7. Like the three previous points, but split for each year (with respect to *disburse time*).

In [36]:
loans["year"] = loans["disburse_time"].dt.year

In [37]:
loan_country = pd.DataFrame(loans.groupby(["year","country_name"])["loan_amount"].sum())

In [38]:
amount_per_year = pd.DataFrame(loans.groupby("year")["loan_amount"].sum())

In [39]:
res = pd.merge(loan_country, amount_per_year, on = "year", suffixes = ("", "_tot_year")).reset_index()

In [40]:
res["perc_per_year"] = res["loan_amount"]/res["loan_amount_tot_year"] * 100

In [41]:
res

Unnamed: 0,year,loan_amount,loan_amount_tot_year,perc_per_year
0,2006.0,2100.0,37750.0,5.562914
1,2006.0,5500.0,37750.0,14.569536
2,2006.0,2000.0,37750.0,5.298013
3,2006.0,6000.0,37750.0,15.894040
4,2006.0,5350.0,37750.0,14.172185
...,...,...,...,...
590,2018.0,775.0,17200.0,4.505814
591,2018.0,4300.0,17200.0,25.000000
592,2018.0,9525.0,17200.0,55.377907
593,2018.0,550.0,17200.0,3.197674


#### 8. For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.

In [42]:
pd.merge(loans, loans_lenders, on = "loan_id").groupby("lenders")["loan_amount"].sum()

lenders
 0326lsw         250.0
 0819212       12875.0
 0919677        2550.0
 0983056        2225.0
 0li            2100.0
                ...   
zpedro4120       200.0
zsige            800.0
zussman4019      475.0
zuzana7522       600.0
zzaman          1025.0
Name: loan_amount, Length: 147371, dtype: float64

#### 9. For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed. Since the country of the lender is often unknown, you can assume that the true distribution among the countries is the same as the one computed from the rows where the country is known.

Per prima cosa ottengo la distribuzione dei paesi presenti all'interno del dataset lenders.

In [43]:
# Conto le presenze di ogni paese
d1 = pd.DataFrame(lenders.groupby("country_code")["country_code"].count())

In [44]:
d2 = d1.rename(columns = {"country_code": "num"}).reset_index()

In [45]:
# Calcolo la percentuale di presenza di ogni paese
prob = d2["num"] / d2["num"].sum()

In [46]:
# Sostituisco i valori mancanti con valori distribuiti secondo le proporzioni di probabilità ottenute alla riga precedente
lenders["country_code"].fillna(pd.Series(np.random.choice(d2["country_code"], 
                                                      p=prob, size=len(lenders))), inplace = True)

In [47]:
lenders["country_code"]

0        US
1        US
2        US
3        US
4        US
         ..
49995    US
49996    US
49997    US
49998    US
49999    US
Name: country_code, Length: 50000, dtype: object

Adesso calcolo il totale di soldi prestati

In [48]:
df1 = pd.merge(loans_lenders, loans, on = "loan_id")

In [49]:
df2 = pd.merge(df1, lenders, left_on = "lenders", right_on = "permanent_name", suffixes = ("_loaners","_lenders"))

In [50]:
money_borrowed = pd.DataFrame(df2.groupby("country_code_loaners")["loan_amount"].sum()).reset_index()

In [51]:
money_lent = pd.DataFrame(df2.groupby("country_code_lenders")["loan_amount"].sum()).reset_index()

In [52]:
money = pd.merge(money_borrowed, money_lent, left_on = "country_code_loaners", right_on = "country_code_lenders", suffixes = ("_loaners","_lenders"))
money["difference"] = money["loan_amount_loaners"] - money["loan_amount_lenders"]
money = money.rename(columns = {"country_code_loaners":"country_code"}).drop("country_code_lenders", axis = 1)
money

Unnamed: 0,country_code,loan_amount_loaners,loan_amount_lenders,difference
0,US,5000.0,12950.0,-7950.0


Valori positivi indicano che il paese ha ricevuto più di quanto ha prestato, viceversa significa che il paese ha prestato più soldi di quanto ha ricevuto

#### 10. Which country has the highest ratio between the difference computed at the previous point and the population?

In [53]:
country_stats = pd.read_csv("Datasets/country_stats.csv")

In [55]:
country = pd.merge(country_stats, money, on = "country_code")
country["ratio"] = country["difference"]/country["population"]
m = country["ratio"].idxmax()
country.loc[m]

country_name                        United States
country_code                                   US
country_code3                                 USA
continent                                Americas
region                           Northern America
population                              324459463
population_below_poverty_line                15.1
hdi                                      0.919553
life_expectancy                            79.222
expected_years_of_schooling               16.5382
mean_years_of_schooling                   13.2179
gni                                       53245.1
kiva_country_name                   United States
loan_amount_loaners                          5000
loan_amount_lenders                         12950
difference                                  -7950
ratio                                -2.45023e-05
Name: 0, dtype: object

#### 11. Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?

In [56]:
country["ratio2"] = country["difference"]/country["population_below_poverty_line"]
n = country["ratio2"].idxmax()
country.loc[n]

country_name                        United States
country_code                                   US
country_code3                                 USA
continent                                Americas
region                           Northern America
population                              324459463
population_below_poverty_line                15.1
hdi                                      0.919553
life_expectancy                            79.222
expected_years_of_schooling               16.5382
mean_years_of_schooling                   13.2179
gni                                       53245.1
kiva_country_name                   United States
loan_amount_loaners                          5000
loan_amount_lenders                         12950
difference                                  -7950
ratio                                -2.45023e-05
ratio2                                    -526.49
Name: 0, dtype: object

#### 12. For each year, compute the total amount of loans. Each loan that has planned expiration time and disburse time in different years must have its amount distributed proportionally to the number of days in each year. For example, a loan with disburse time December 1st, 2016, planned expiration time January 30th 2018, and amount 5000USD has an amount of 5000USD * 31 / (31+365+30) = 363.85 for 2016, 5000USD * 365 / (31+365+30) = 4284.04 for 2017, and 5000USD * 30 / (31+365+30) = 352.11 for 2018.

In [94]:
df = loans[["loan_amount", "disburse_time", "planned_expiration_time"]]

In [19]:
df["disburse_time_year"] = df["disburse_time"].dt.year
df["planned_year"] = df["planned_expiration_time"].dt.year
df["disburse_time_month"] = df["disburse_time"].dt.month
df["planned_month"] = df["planned_expiration_time"].dt.month
df["disburse_time_day"] = df["disburse_time"].dt.day
df["planned_day"] = df["planned_expiration_time"].dt.day
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,loan_amount,disburse_time,planned_expiration_time,disburse_time_year,planned_year,disburse_time_month,planned_month,disburse_time_day,planned_day
0,125.0,2013-12-22 08:00:00+00:00,2014-02-14 03:30:06+00:00,2013.0,2014.0,12.0,2.0,22.0,14.0
1,400.0,2013-12-20 08:00:00+00:00,2014-03-26 22:25:07+00:00,2013.0,2014.0,12.0,3.0,20.0,26.0
2,400.0,2014-01-09 08:00:00+00:00,2014-02-15 21:10:05+00:00,2014.0,2014.0,1.0,2.0,9.0,15.0
3,625.0,2014-01-17 08:00:00+00:00,2014-02-21 03:10:02+00:00,2014.0,2014.0,1.0,2.0,17.0,21.0
4,425.0,2013-12-17 08:00:00+00:00,2014-02-13 06:10:02+00:00,2013.0,2014.0,12.0,2.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
49995,1300.0,2009-02-27 08:00:00+00:00,NaT,2009.0,,2.0,,27.0,
49996,875.0,2009-02-24 08:00:00+00:00,NaT,2009.0,,2.0,,24.0,
49997,825.0,2009-03-20 07:00:00+00:00,NaT,2009.0,,3.0,,20.0,
49998,625.0,2009-03-19 07:00:00+00:00,NaT,2009.0,,3.0,,19.0,


In [120]:
df = loans[["loan_amount", "disburse_time", "planned_expiration_time"]]

In [115]:
df["planned_expiration_time"] = df["disburse_time"].fillna(df["planned_expiration_time"])
df["disburse_time"] = df["planned_expiration_time"].fillna(df["disburse_time"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [121]:
df["diff"] = pd.date_range(start = df["disburse_time"], end=df["planned_expiration_time"], periods = 3)

TypeError: Cannot convert input [0        2013-12-22 08:00:00.000 +0000
1        2013-12-20 08:00:00.000 +0000
2        2014-01-09 08:00:00.000 +0000
3        2014-01-17 08:00:00.000 +0000
4        2013-12-17 08:00:00.000 +0000
                     ...              
49995    2009-02-27 08:00:00.000 +0000
49996    2009-02-24 08:00:00.000 +0000
49997    2009-03-20 07:00:00.000 +0000
49998    2009-03-19 07:00:00.000 +0000
49999    2009-02-27 08:00:00.000 +0000
Name: disburse_time, Length: 50000, dtype: object] of type <class 'pandas.core.series.Series'> to Timestamp

In [118]:
df["diff"]

0        DatetimeIndex(['2013-12-22 08:00:00+00:00', '2...
1        DatetimeIndex(['2013-12-20 08:00:00+00:00', '2...
2        DatetimeIndex(['2014-01-09 08:00:00+00:00', '2...
3        DatetimeIndex(['2014-01-17 08:00:00+00:00', '2...
4        DatetimeIndex(['2013-12-17 08:00:00+00:00', '2...
                               ...                        
49995    DatetimeIndex(['2009-02-27 08:00:00+00:00', '2...
49996    DatetimeIndex(['2009-02-24 08:00:00+00:00', '2...
49997    DatetimeIndex(['2009-03-20 07:00:00+00:00', '2...
49998    DatetimeIndex(['2009-03-19 07:00:00+00:00', '2...
49999    DatetimeIndex(['2009-02-27 08:00:00+00:00', '2...
Name: diff, Length: 50000, dtype: object