# Computer Science Project

In [1]:
import pandas as pd
import numpy as np

#### 1. Normalize the *loan_lenders* table. In the normalized table, each row must have one *loan_id* and one *lender*.

In [68]:
loans_lenders = pd.read_csv("loans_lenders.csv", nrows = 1000)

Normalizzare la tabella significa che ogni cella deve contenere una sola informazione. In questo caso la colonna *lender* possiede diversi nomi, quindi è necessario duplicare il valore di *loan_id* tante volte quante sono i lender.

In [8]:
loans_lenders

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."
...,...,...
995,497205,"nick1135, nick1135, marcusoh, scott57469017, c..."
996,590982,"jonathan7845, micah4814, ilia4401, angie6388, ..."
997,369988,"jeroen6107, sibylla8116, nicholas6826, jim5285..."
998,567092,"miikka2096, haleyconner, stephanie8503, obay16..."


In [69]:
loans_lenders["lenders"] = loans_lenders.lenders.apply(lambda x: x.split(","))

In [70]:
loans_lenders = loans_lenders.explode("lenders").reset_index(drop = True)

In [71]:
loans_lenders

Unnamed: 0,loan_id,lenders
0,483693,muc888
1,483693,sam4326
2,483693,camaran3922
3,483693,lachheb1865
4,483693,rebecca3499
...,...,...
31296,495013,joe5749
31297,495013,john9034
31298,495013,byron5129
31299,495013,staceyandgirls6384


#### 2. For each loan, add a column *duration* corresponding to the number of days between the *disburse time* and the *planned expiration time*. If any of those two dates is missing, also the *duration* must be missing.

In [3]:
loans = pd.read_csv("loans.csv", nrows = 1000)

In [83]:
loans["disburse_time"] = pd.to_datetime(loans["disburse_time"])
loans["planned_expiration_time"] = pd.to_datetime(loans["planned_expiration_time"])

In [88]:
loans["duration"] = (loans["planned_expiration_time"] - loans["disburse_time"]).dt.days

#### 3. Find the lenders that have funded at least twice.

In [4]:
lenders = pd.read_csv("lenders.csv", nrows = 1000)

In [103]:
lenders.describe()

Unnamed: 0,member_since,loan_purchase_num,num_invited
count,1000.0,909.0,1000.0
mean,1461311000.0,2.133113,0.018
std,8115.486,1.743109,0.147301
min,1461300000.0,0.0,0.0
25%,1461306000.0,1.0,0.0
50%,1461310000.0,2.0,0.0
75%,1461315000.0,3.0,0.0
max,1461346000.0,23.0,2.0


In [105]:
lenders.loc[lenders["loan_purchase_num"] >= 2]

Unnamed: 0,permanent_name,display_name,city,state,country_code,member_since,occupation,loan_because,loan_purchase_num,invited_by,num_invited
1,reena6733,Reena,,,,1461300634,,,9.0,,0
3,andrew86079135,Andrew,,,,1461301091,,,5.0,Peter Tan,0
6,rene7585,Rene,,,,1461301636,,,2.0,,0
7,harald2826,Harald,,,,1461301670,,,2.0,,0
11,jennifer4328,Jennifer,,,,1461302712,,,3.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
990,luzelena5259,Luz Elena,,,,1461342164,,,4.0,,0
995,rosa8774,Rosa,,,,1461345496,,,5.0,,0
997,sridevi7185,SRIDEVI,,,,1461345622,,,2.0,,0
998,anshu8563,Anshu,,,US,1461345777,,,2.0,,0


#### 4. For each country, compute how many loans have involved that country as borrowers.

In [119]:
loans.groupby("country_name").count()["loan_id"]

country_name
Afghanistan       1
Albania           4
Armenia          12
Azerbaijan        4
Benin             5
                 ..
Ukraine           1
United States     6
Vietnam           9
Yemen             2
Zimbabwe          2
Name: loan_id, Length: 65, dtype: int64

#### 5. For each country, compute the overall amount of money borrowed.

In [121]:
loans.groupby("country_name")["loan_amount"].sum()

country_name
Afghanistan       1175.0
Albania           5550.0
Armenia          19225.0
Azerbaijan       10400.0
Benin             3950.0
                  ...   
Ukraine           1200.0
United States    29250.0
Vietnam          17650.0
Yemen             3250.0
Zimbabwe          2300.0
Name: loan_amount, Length: 65, dtype: float64

#### 6. Like the previous point, but expressed as a percentage of the overall amount lent.

In [122]:
tot_loan_amount = loans["loan_amount"].sum()
tot_loan_amount

857975.0

In [124]:
loans.groupby("country_name")["loan_amount"].sum() / tot_loan_amount * 100

country_name
Afghanistan      0.136950
Albania          0.646872
Armenia          2.240741
Azerbaijan       1.212157
Benin            0.460386
                   ...   
Ukraine          0.139864
United States    3.409190
Vietnam          2.057169
Yemen            0.378799
Zimbabwe         0.268073
Name: loan_amount, Length: 65, dtype: float64

#### 7. Like the three previous points, but split for each year (with respect to *disburse time*).

In [126]:
loans["year"] = loans["disburse_time"].dt.year

In [149]:
loan_country = pd.DataFrame(loans.groupby(["year","country_name"])["loan_amount"].sum())

In [135]:
amount_per_year = pd.DataFrame(loans.groupby("year")["loan_amount"].sum())

In [145]:
res = pd.merge(loan_country, amount_per_year, on = "year", suffixes = ("", "_tot_year")).reset_index()

In [147]:
res["perc_per_year"] = res["loan_amount"]/res["loan_amount_tot_year"] * 100

In [148]:
res

Unnamed: 0,year,loan_amount,loan_amount_tot_year,perc_per_year
0,2010,1175.0,41550.0,2.827918
1,2010,525.0,41550.0,1.263538
2,2010,3750.0,41550.0,9.025271
3,2010,4200.0,41550.0,10.108303
4,2010,1225.0,41550.0,2.948255
...,...,...,...,...
211,2017,800.0,21025.0,3.804994
212,2017,900.0,21025.0,4.280618
213,2017,1025.0,21025.0,4.875149
214,2017,4900.0,21025.0,23.305589


#### 8. For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.

In [159]:
pd.merge(loans, loans_lenders, on = "loan_id").groupby("lenders")["loan_amount"].sum()

lenders
 WeWishWeWereInMaui    4625.0
 abi3261               1000.0
 abigail6217           4625.0
 adam6350              4625.0
 adam8512              4625.0
                        ...  
 zoe13765053            575.0
ba3127                 4625.0
catherine1576           650.0
edwin9421               575.0
kennethheal            1000.0
Name: loan_amount, Length: 205, dtype: float64

#### 9. For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed. Since the country of the lender is often unknown, you can assume that the true distribution among the countries is the same as the one computed from the rows where the country is known.

Per prima cosa ottengo la distribuzione dei paesi presenti all'interno del dataset lenders.

In [53]:
# Conto le presenze di ogni paese
d1 = pd.DataFrame(lenders.groupby("country_code")["country_code"].count())

In [54]:
d2 = d1.rename(columns = {"country_code": "num"}).reset_index()

In [55]:
# Calcolo la percentuale di presenza di ogni paese
prob = d2["num"] / d2["num"].sum()

In [49]:
import numpy as np

In [58]:
# Sostituisco i valori mancanti con valori distribuiti secondo le proporzioni di probabilità ottenute alla riga precedente
lenders["country_code"].fillna(pd.Series(np.random.choice(d2["country_code"], 
                                                      p=prob, size=len(lenders))), inplace = True)

In [62]:
lenders["country_code"]

0      IT
1      US
2      GB
3      PL
4      US
       ..
995    GB
996    US
997    US
998    US
999    AT
Name: country_code, Length: 1000, dtype: object

Adesso calcolo il totale di soldi prestati

In [87]:
df1 = pd.merge(loans_lenders, loans, on = "loan_id")

In [88]:
df2 = pd.merge(df1, lenders, left_on = "lenders", right_on = "permanent_name")

In [89]:
df2

Unnamed: 0,loan_id,lenders,loan_name,original_language,description,description_translated,funded_amount,loan_amount,status,activity_name,...,display_name,city,state,country_code_y,member_since,occupation,loan_because,loan_purchase_num,invited_by,num_invited


Which country has the highest ratio between the difference computed at the previous point and the population?
Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?
For each year, compute the total amount of loans. Each loan that has planned expiration time and disburse time in different years must have its amount distributed proportionally to the number of days in each year. For example, a loan with disburse time December 1st, 2016, planned expiration time January 30th 2018, and amount 5000USD has an amount of 5000USD * 31 / (31+365+30) = 363.85 for 2016, 5000USD * 365 / (31+365+30) = 4284.04 for 2017, and 5000USD * 30 / (31+365+30) = 352.11 for 2018.