## 1. Import libraries and data

In [38]:
import pandas as pd
import numpy as np

# Input files: per-applicant model predictions, and the applicant feature/score table.
predictions_csv, features_csv = "./data/exported/merged.csv", "./data/score.csv"

# `merged` carries applicant_id plus the predicted_* columns; `score` carries the raw applicant data.
merged = pd.read_csv(predictions_csv)
score = pd.read_csv(features_csv)

## 2. Merge the merged df with score.csv

> we need to do this, so we can see which applicant_id matches with which applicant

Checking that both frames have the same number of rows (the column counts are expected to differ)

In [39]:
# Show both shapes, then actually enforce the row-count match: the merge further
# down joins the two frames on their row index, so a different number of rows
# would leave applicants without matching data.
print(merged.shape)
print(score.shape)
assert merged.shape[0] == score.shape[0], "merged and score must have the same number of rows"

(500, 5)
(500, 50)


In [40]:
# Preview the applicant table; it has 50 columns, so the rendered output is truncated.
score.head(3)

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,5660.0,4320.0,8640.0,0.0,0.0,8000.0,0.0,0.0,1.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,3990.0,9.0,3450.0,0.0,0.0,12500.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2,1158.0,82.0,4194.0,408.0,4.0,12000.0,0.0,0.0,0.0,1.0,...,0.009811,0.592842,,,0.252444,0.724693,0.818064,0.387361,,


In [41]:
# Preview the predictions table: applicant_id plus the four predicted_* columns.
merged.head(3)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit
0,0,0.0,0,2112.740231,2112.740231
1,1,0.0,0,2224.849003,2224.849003
2,2,0.0,0,-20.21309,-20.21309


In every model's notebook we do this code ...

``` python
applicants_data["predicted_XXX"] = XXX
applicants_data["applicant_id"] = applicants_data.index
```

... which means that `applicant_id` is the same as the row index, so we can simply merge on the index.

In [42]:
# The merge below pairs rows purely by index, which is only correct if
# applicant_id really equals the row index and both frames share that index.
# Check both assumptions up front: a silent misalignment would attach the
# predictions to the wrong applicant.
assert (merged["applicant_id"] == merged.index).all(), "applicant_id must equal the row index"
assert merged.index.equals(score.index), "merged and score must share the same index"

# validate="one_to_one" makes pandas raise if either index contains duplicates.
merged_with_score = pd.merge(
    merged,
    score,
    left_index=True,
    right_index=True,
    how="left",
    validate="one_to_one",
)

### **Testing if it merged correctly**

In [43]:
# Narrow view for eyeballing the merge: two applicant columns next to two prediction columns.
merged_check = merged_with_score.loc[
    :, ["income_am", "profit_am", "applicant_id", "predicted_profit"]
]
merged_check

Unnamed: 0,income_am,profit_am,applicant_id,predicted_profit
0,5660.0,8640.0,0,2112.740231
1,3990.0,3450.0,1,2224.849003
2,1158.0,4194.0,2,-20.213090
3,2451.0,2119.0,3,1690.207478
4,946.0,2036.0,4,1566.944980
...,...,...,...,...
495,820.0,7794.0,495,2827.322292
496,6092.0,3137.0,496,2738.776238
497,2301.0,2516.0,497,2012.275412
498,492.0,3716.0,498,2197.382155


APPLICANT 100

In [44]:
# Spot check, part 1: the predictions stored at position 100 before merging.
merged.iloc[100]

applicant_id                  100.00000
predicted_damage_amount         0.00000
predicted_damage_incident       0.00000
predicted_revenue            6132.51123
predicted_profit             6132.51123
Name: 100, dtype: float64

In [45]:
# Spot check, part 2: the applicant values at the same position in `score`.
# The whole row is selected first, so the result keeps the row's dtype (shown as object).
score.iloc[100][["income_am", "profit_am"]]

income_am    122993.0
profit_am     42142.0
Name: 100, dtype: object

In [46]:
# Spot check, part 3: the merged row should carry the values from both frames at position 100.
merged_check.iloc[100]

income_am           122993.00000
profit_am            42142.00000
applicant_id           100.00000
predicted_profit      6132.51123
Name: 100, dtype: float64

## 3. Get the scores of all applicants

I considered dropping all applicants with a negative net score (they are weaker applicants, because their negative score outweighs their positive score). However, with those applicants dropped the total predicted profit came to `$387,366.87`, whereas if we do **not** drop them the total predicted profit is `$481,812.19`.

In the best interest of the hotel, I will pursue the higher profit and therefore not drop applicants with negative net scores.

In [80]:
import sys
# Extend the import search path with ./models/ (presumably where calculate_scores.py
# lives - confirm). This has to run before the import below.
sys.path.append("./models/")

# Project-local helper, called in the next cell on the merged frame.
from calculate_scores import calculate_scores

Calculate the scores for each applicant

In [81]:
# Hand calculate_scores a copy, so it cannot modify merged_with_score itself.
merged_calculated_scores = calculate_scores(merged_with_score.copy())

# Net score = combined positive score minus combined negative score.
positive_part = merged_calculated_scores["combined_pos_score"]
negative_part = merged_calculated_scores["combined_neg_score"]
merged_calculated_scores["net_score"] = positive_part - negative_part

~~Only keep applicants with positive `net_score`s~~

In [82]:
# merged_calculated_scores = merged_calculated_scores[merged_calculated_scores["net_score"] >= 0]
# merged_calculated_scores.shape

## 4. Get the 200 guests with the highest predicted profit

In [90]:
# Rank every applicant by predicted profit (best first) and keep the top 200.
highest_profit = (
    merged_calculated_scores
    .sort_values(by="predicted_profit", ascending=False)
    .head(200)
)

# "Unsorted" here means "no longer ordered by profit": the selection is put back
# into applicant_id order.
highest_profit_unsorted = highest_profit.sort_values(by="applicant_id", ascending=True)

highest_profit_unsorted.head(40)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit,income_am,profit_last_am,profit_am,damage_am,damage_inc,...,claims_am,nights_booked,gender,shop_am,shop_use,retired,gold_status,combined_pos_score,combined_neg_score,net_score
0,0,0.0,0,2112.740231,2112.740231,5660.0,4320.0,8640.0,0.0,0.0,...,0.0,52.0,V,4577.729167,1.0,0.0,1.0,0.962161,1.160427,-0.198266
1,1,0.0,0,2224.849003,2224.849003,3990.0,9.0,3450.0,0.0,0.0,...,0.0,4.0,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,0.0,0,2309.345916,2309.345916,2591.0,869.0,2209.0,0.0,0.0,...,0.0,2.0,M,1415.021157,1.0,1.0,0.0,0.678448,5.867314,-5.188866
8,8,0.0,0,3326.684684,3326.684684,6426.0,2018.0,9015.0,2333.0,2.0,...,0.0,4.0,M,0.0,0.0,0.0,0.0,0.576605,0.277479,0.299126
15,15,0.0,0,2426.668743,2426.668743,1190.0,325.0,6688.0,1676.0,2.0,...,0.0,52.0,V,920.210464,1.0,0.0,0.0,0.411386,0.262099,0.149287
19,19,0.0,0,1956.409685,1956.409685,227.0,0.0,1620.0,0.0,0.0,...,0.0,1.0,V,0.0,0.0,0.0,0.0,0.971386,0.679476,0.291909
21,21,0.0,0,1969.733957,1969.733957,2093.0,681.0,2458.0,1219.0,4.0,...,0.0,83.0,M,0.0,0.0,0.0,0.0,0.391853,7.316181,-6.924328
24,24,0.0,0,1973.072259,1973.072259,227.0,0.0,1632.0,0.0,0.0,...,0.0,4.0,M,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25,25,0.0,0,2889.581225,2889.581225,10406.0,645.0,21633.0,0.0,0.0,...,0.0,27.0,V,0.0,0.0,1.0,0.0,1.124675,1.014803,0.109872
26,26,0.0,0,2009.570055,2009.570055,1343.0,374.0,1816.0,0.0,0.0,...,0.0,4.0,V,0.0,0.0,1.0,0.0,0.160819,0.251773,-0.090954


In [97]:
# Deliverable: the applicant id together with the four prediction columns.
final_list = highest_profit_unsorted.loc[
    :,
    [
        "applicant_id",
        "predicted_damage_amount",
        "predicted_damage_incident",
        "predicted_revenue",
        "predicted_profit",
    ],
]

# Second variant whose ids are shifted by 2 so they line up with the row numbers
# of the source CSV (presumably header line + zero-based index - confirm).
final_list_with_matching_row_number = final_list.assign(
    applicant_id=final_list["applicant_id"] + 2
)

# **TESTING**

In [98]:
# Total predicted profit over the selected guests (the no-drop figure quoted in section 3).
highest_profit_unsorted[["predicted_profit"]].sum()

predicted_profit    481812.193118
dtype: float64

# **END**

In [99]:
# Write both variants of the guest list; the row index is left out of the files.
for frame, destination in (
    (final_list, "./guestlist/guestlist.csv"),
    (final_list_with_matching_row_number, "./guestlist/guestlist_with_matching_row_number.csv"),
):
    frame.to_csv(destination, index=False)

## **! The applicant_id and index do not match the original row number in `score.csv`**

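They are off by 2: e.g. applicant 100 in `highest_profit_unsorted` is actually row 102 in `score.csv`. This is most likely because row 1 of the CSV file is the header line and pandas starts counting the data rows at 0, so the data row with index `i` sits on file row `i + 2`.

You can tell because the `income_am` on row 102 of `score.csv` is 122993.0,
and in the `score` DataFrame the `income_am` at index 100 is also 122993.0.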




In [100]:
# Position 100 in `score` is row 102 in `score.csv`, the file `score` was read from.
score.iloc[100]["income_am"]

122993.0

![screenshot that shows that index is off by 2](./public/screenshot.png)

### **HOWEVER**

It doesn't really matter.

If you are going by the row number in `score.csv`, just subtract 2 to get the `applicant_id`.


For example, if row 3 (with `income_am` of 3990.0) should have been approved, it appears here as `applicant_id` 1.

#### **Here are, according to my models, which 200 applicants should be approved**

In [101]:
# The row labels of the final list identify the accepted applicants.
result = np.asarray(final_list.index)

# Disabled export of the plain id list:
# np.savetxt("./guestlist/rows/result.csv", result, delimiter=",", fmt="%d")

result

array([  0,   1,   7,   8,  15,  19,  21,  24,  25,  26,  30,  31,  38,
        43,  45,  48,  50,  52,  53,  55,  58,  62,  64,  66,  67,  68,
        71,  72,  74,  75,  77,  78,  81,  84,  85,  90,  91,  94,  96,
        98, 100, 102, 103, 104, 105, 115, 117, 119, 121, 122, 124, 126,
       132, 139, 140, 143, 146, 147, 149, 150, 152, 153, 154, 155, 158,
       160, 162, 163, 165, 170, 171, 174, 175, 178, 179, 181, 183, 184,
       190, 191, 192, 193, 196, 197, 198, 199, 201, 202, 205, 208, 209,
       210, 213, 215, 217, 218, 222, 223, 229, 230, 236, 237, 240, 242,
       246, 248, 249, 251, 252, 253, 254, 256, 260, 261, 262, 263, 264,
       265, 267, 270, 274, 275, 277, 280, 288, 291, 292, 295, 296, 299,
       303, 304, 306, 307, 315, 317, 323, 328, 329, 334, 335, 338, 340,
       341, 342, 343, 349, 350, 353, 354, 357, 358, 359, 361, 371, 373,
       379, 388, 391, 393, 400, 401, 402, 405, 410, 423, 424, 426, 431,
       433, 436, 437, 439, 443, 448, 449, 450, 453, 456, 458, 45

#### **And here they are, with the number matching the row number from `score.csv`**

In [102]:
# Same labels, shifted by 2 so they correspond to the row numbers of the CSV file.
result_with_matching_row_number = final_list.index.to_numpy() + 2

# Disabled export of the shifted id list:
# np.savetxt("./guestlist/rows/result_with_matching_row_number.csv", result_with_matching_row_number, delimiter=",", fmt="%d")

result_with_matching_row_number

array([  2,   3,   9,  10,  17,  21,  23,  26,  27,  28,  32,  33,  40,
        45,  47,  50,  52,  54,  55,  57,  60,  64,  66,  68,  69,  70,
        73,  74,  76,  77,  79,  80,  83,  86,  87,  92,  93,  96,  98,
       100, 102, 104, 105, 106, 107, 117, 119, 121, 123, 124, 126, 128,
       134, 141, 142, 145, 148, 149, 151, 152, 154, 155, 156, 157, 160,
       162, 164, 165, 167, 172, 173, 176, 177, 180, 181, 183, 185, 186,
       192, 193, 194, 195, 198, 199, 200, 201, 203, 204, 207, 210, 211,
       212, 215, 217, 219, 220, 224, 225, 231, 232, 238, 239, 242, 244,
       248, 250, 251, 253, 254, 255, 256, 258, 262, 263, 264, 265, 266,
       267, 269, 272, 276, 277, 279, 282, 290, 293, 294, 297, 298, 301,
       305, 306, 308, 309, 317, 319, 325, 330, 331, 336, 337, 340, 342,
       343, 344, 345, 351, 352, 355, 356, 359, 360, 361, 363, 373, 375,
       381, 390, 393, 395, 402, 403, 404, 407, 412, 425, 426, 428, 433,
       435, 438, 439, 441, 445, 450, 451, 452, 455, 458, 460, 46

Example:

applicant_id 1 became row 3: the guest with `income_am` of 3990.0

#### **The list of accepted applicants, with the other metrics (the deliverable), can be found in**

`./guestlist/guestlist.csv` and `./guestlist/guestlist_with_matching_row_number.csv`