## 1. Import libraries and data

In [1]:
import pandas as pd

# Load the three exported prediction tables, in the same order as the names
# on the left-hand side.
predicted_damage_amount, predicted_damage_incident, predicted_revenue = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/exported/predicted_damage_amount.csv",
        "./data/exported/predicted_damage_incident.csv",
        "./data/exported/predicted_revenue.csv",
    )
)

## 2. Merge 3 exported dataframes into 1

In [2]:
# Join the three prediction tables on the shared applicant key (inner join,
# as before). validate="one_to_one" makes pandas raise a MergeError when
# applicant_id is duplicated on either side of a merge, instead of silently
# multiplying rows and distorting the per-guest profit computed later.
merged_df = predicted_damage_amount.merge(
    predicted_damage_incident,
    on="applicant_id",
    validate="one_to_one",
).merge(
    predicted_revenue,
    on="applicant_id",
    validate="one_to_one",
)

### 2.1 Fix mistake in DataFrame

The models I used to predict the `predicted_damage_amount` and `predicted_damage_incident` do not line up completely.

Sometimes `predicted_damage_incident` is 0 while `predicted_damage_amount` is not 0 (it should be 0 as well). For example:

In [3]:
# Preview applicants predicted to have no damage incident
# (head() returns the first 5 rows by default).
merged_df.loc[merged_df["predicted_damage_incident"].eq(0)].head()

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue
0,0,188.492,0,2125.775778
1,1,159.362,0,2253.402897
2,2,255.394,0,-22.878184
3,3,0.0,0,1762.829728
4,4,240.656,0,1542.663912


#### **Find out how many it got wrong**

Select the rows of `merged_df` where `predicted_damage_incident` is 0, but `predicted_damage_amount` is NOT 0.




In [4]:
# Rows where the two models disagree: no incident is predicted, yet the
# predicted damage amount is non-zero.
selected_columns = merged_df.loc[
    merged_df["predicted_damage_incident"].eq(0)
    & merged_df["predicted_damage_amount"].ne(0)
]

it got 304 wrong (see the row count in the shape below):
- 304 clients got marked with 0 for `predicted_damage_incident`,
- but they have a non-zero predicted damage amount

In [5]:
# .shape is (rows, columns); the row count is the number of inconsistent predictions.
selected_columns.shape

(304, 4)

#### **Manually fix the mistake**

If a guest's `predicted_damage_incident` == 0, we can set the `predicted_damage_amount` to 0, because that is how it is in the base datasets I was given.

In [6]:
# Applicants with no predicted incident should have no predicted damage,
# so overwrite their amount with 0 (this mutates merged_df in place).
no_incident = merged_df["predicted_damage_incident"].eq(0)
merged_df.loc[no_incident, "predicted_damage_amount"] = 0

# Show the first ten corrected rows to confirm the fix.
merged_df.loc[no_incident].head(10)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue
0,0,0.0,0,2125.775778
1,1,0.0,0,2253.402897
2,2,0.0,0,-22.878184
3,3,0.0,0,1762.829728
4,4,0.0,0,1542.663912
5,5,0.0,0,1619.318553
6,6,0.0,0,1571.448494
7,7,0.0,0,2237.384491
8,8,0.0,0,3345.808522
9,9,0.0,0,1630.836319


## 3. Calculate predicted profit per guest

`predicted_profit = predicted_revenue - predicted_damage_amount`

In [7]:
# Profit per guest = predicted revenue minus predicted damage amount.
merged_df["predicted_profit"] = merged_df["predicted_revenue"].sub(
    merged_df["predicted_damage_amount"]
)

In [8]:
# Rank guests from highest to lowest predicted profit; sort_values returns a
# new frame, so merged_df keeps its original row order.
with_profit = merged_df.sort_values("predicted_profit", ascending=False)

In [9]:
# Show the top rows of the profit-sorted frame (head() defaults to 5 rows).
with_profit.head()

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit
100,100,0.0,0,6485.252989,6485.252989
256,256,0.0,0,5949.154542,5949.154542
431,431,0.0,0,5282.00592,5282.00592
199,199,0.0,0,4735.104834,4735.104834
154,154,0.0,0,4656.904732,4656.904732


## 4. Check if my results are even possible

The predicted profits seem very high (several thousand dollars), so I compare them against the `profit_last_am` values in the training data.

### **Checking the highest `profit_last_am` in train_V2.csv**

In [10]:
train = pd.read_csv("./data/train_V2.csv")

# Ten largest `profit_last_am` values in the training data, as a sanity
# check for the magnitude of the predicted profits.
(
    train[["profit_last_am"]]
    .sort_values(by="profit_last_am", ascending=False)
    .head(10)
)

Unnamed: 0,profit_last_am
3763,150537.0
866,56086.0
2212,53989.0
1563,39328.0
2493,37055.5
3593,34814.5
4818,27807.0
3984,26764.5
4512,26721.0
1986,25257.0


Summary statistics of `profit_last_am` via `.describe()`:

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for profit_last_am.
train[["profit_last_am"]].describe()

Unnamed: 0,profit_last_am
count,4947.0
mean,696.057712
std,3051.119275
min,0.0
25%,0.0
50%,52.0
75%,810.0
max,150537.0


## 5. Export to CSV

In [12]:
# Write the merged predictions to disk; index=False leaves the DataFrame
# index out of the CSV.
merged_df.to_csv("./data/exported/merged.csv", index=False)