## 1. Import libraries and data

In [65]:
import pandas as pd

# Load the three per-applicant prediction tables from ./data/exported,
# in the same order as the names on the left-hand side.
predicted_damage_amount, predicted_damage_incident, predicted_revenue = map(
    pd.read_csv,
    (
        "./data/exported/predicted_damage_amount.csv",
        "./data/exported/predicted_damage_incident.csv",
        "./data/exported/predicted_revenue.csv",
    ),
)

## 2. Merge 3 exported dataframes into 1

In [66]:
# Join the three prediction tables into one row per applicant.
# validate="one_to_one" makes pandas raise a MergeError if applicant_id is
# duplicated in either side of a merge, instead of silently multiplying rows.
# The join is still the default inner join: applicants missing from any of the
# three tables are dropped from merged_df.
merged_df = predicted_damage_amount.merge(
    predicted_damage_incident, on="applicant_id", validate="one_to_one"
).merge(
    predicted_revenue, on="applicant_id", validate="one_to_one"
)

### 2.1 Fix mistake in DataFrame

The models I used to predict the `predicted_damage_amount` and `predicted_damage_incident` do not line up completely.

Sometimes the `predicted_damage_incident` == 0, but the `predicted_damage_amount` is not 0 (but it should be). For example here:

In [67]:
# Show the first five rows where no damage incident is predicted.
merged_df.query("predicted_damage_incident == 0").head()

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue
0,0,188.492,0,2006.474841
1,1,159.362,0,2115.739438
3,3,0.0,0,1594.57045
4,4,489.434,0,1470.689027
5,5,0.0,0,1430.706213


#### **Find out how many it got wrong**

Select the rows of `merged_df` where `predicted_damage_incident` is 0, but `predicted_damage_amount` is NOT 0.




In [68]:
# Rows (despite the variable name) where the two models disagree:
# no incident is predicted, yet the predicted damage amount is non-zero.
selected_columns = merged_df.loc[
    merged_df["predicted_damage_incident"].eq(0)
    & merged_df["predicted_damage_amount"].ne(0)
]

In [69]:
# (number of inconsistent rows, number of columns)
selected_columns.shape

(327, 4)

The two models disagree on 327 rows:
- 327 applicants are marked with 0 for `predicted_damage_incident`,
- but they still have a non-zero `predicted_damage_amount`.

#### **Manually fix the mistake**


- If we set `predicted_damage_incident` to 1 wherever `predicted_damage_amount` > \$0.00, we get an incident ratio of approx. 68.4%
  - 0 = 158
  - 1 = 342
  - `342` / (`342`+`158`) = 0.684
- If we set `predicted_damage_amount` to \$0.00 wherever `predicted_damage_incident` == 0, we get an incident ratio of approx. 3%
  - 0 = 485
  - 1 = 15
  - `15` / (`15`+`485`) = 0.03


**Solution**
- 1st approach : 68.4% of applicants **will damage** the room.
- 2nd approach : 3% of applicants **will damage** the room (97% will not)

The training data had a damage-incident ratio of approx. 30%. The 2nd approach (3%) is closer to that than the 1st (68.4%), and we assume that our best applicants will not damage the room, so we use the 2nd approach.

In [70]:
# Where no incident is predicted, overwrite the predicted damage amount with 0
# so the two predictions agree; all other rows keep their amount.
merged_df["predicted_damage_amount"] = merged_df["predicted_damage_amount"].mask(
    merged_df["predicted_damage_incident"].eq(0), 0
)
# Check the result on the first ten zero-incident rows.
merged_df.query("predicted_damage_incident == 0").head(10)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue
0,0,0.0,0,2006.474841
1,1,0.0,0,2115.739438
3,3,0.0,0,1594.57045
4,4,0.0,0,1470.689027
5,5,0.0,0,1430.706213
6,6,0.0,0,1510.264366
8,8,0.0,0,3029.583549
9,9,0.0,0,1590.677155
10,10,0.0,0,1705.867259
11,11,0.0,0,1534.834832


In [71]:
# Count how many applicants fall into each predicted incident class (0 / 1).
merged_df["predicted_damage_incident"].value_counts()

predicted_damage_incident
0    485
1     15
Name: count, dtype: int64

## 3. Calculate predicted profit per guest

`predicted_profit = predicted_revenue - predicted_damage_amount`

In [72]:
# Profit per applicant = predicted revenue minus predicted damage amount.
merged_df["predicted_profit"] = merged_df["predicted_revenue"].sub(
    merged_df["predicted_damage_amount"]
)

In [73]:
# Rank applicants from highest to lowest predicted profit (merged_df itself stays unsorted).
with_profit = merged_df.sort_values(
    "predicted_profit",
    ascending=False,
)

In [74]:
# Top five applicants by predicted profit.
with_profit.head()

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit
100,100,0.0,0,5400.725472,5400.725472
256,256,0.0,0,5256.598754,5256.598754
431,431,0.0,0,4991.07925,4991.07925
199,199,0.0,0,4462.880685,4462.880685
154,154,0.0,0,4250.387703,4250.387703


## 4. Check if my results are even possible

The predicted profits seem very high (several thousands of dollars)

### **Checking the highest `profit_last_am` in train_V2.csv**

In [75]:
# Training data, used here only as a sanity check on the scale of the predictions.
train = pd.read_csv("./data/train_V2.csv")

# Ten largest observed values of profit_last_am.
(
    train[["profit_last_am"]]
    .sort_values("profit_last_am", ascending=False)
    .head(10)
)

Unnamed: 0,profit_last_am
3763,150537.0
866,56086.0
2212,53989.0
1563,39328.0
2493,37055.5
3593,34814.5
4818,27807.0
3984,26764.5
4512,26721.0
1986,25257.0


`.describe()`

In [76]:
# Summary statistics (count, mean, std, quartiles, min/max) of profit_last_am.
train.loc[:, ["profit_last_am"]].describe()

Unnamed: 0,profit_last_am
count,4947.0
mean,696.057712
std,3051.119275
min,0.0
25%,0.0
50%,52.0
75%,810.0
max,150537.0


## 5. Export to CSV

In [25]:
# Write merged_df (not the sorted with_profit frame) to CSV; index=False omits
# the row index from the file.
merged_df.to_csv("./data/exported/merged.csv", index=False)