## 1. Import libraries and data

In [1]:
import pandas as pd
import numpy as np

# `merged` holds the per-applicant model predictions exported earlier;
# `score` holds the raw applicant data the predictions were made for.
merged = pd.read_csv("./data/exported/merged.csv")
score = pd.read_csv("./data/score.csv")

## 2. Merge the merged df with score.csv

> We need to do this so we can see which `applicant_id` corresponds to which applicant's data.

Checking that both DataFrames have the same number of rows

In [2]:
# Show both (rows, columns) shapes, one per line, so the row counts can be compared.
print(merged.shape, score.shape, sep="\n")

(500, 5)
(500, 50)


In [3]:
# Preview the first three rows of the raw applicant data.
score.head(3)

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,5660.0,4320.0,8640.0,0.0,0.0,8000.0,0.0,0.0,1.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,3990.0,9.0,3450.0,0.0,0.0,12500.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2,1158.0,82.0,4194.0,408.0,4.0,12000.0,0.0,0.0,0.0,1.0,...,0.009811,0.592842,,,0.252444,0.724693,0.818064,0.387361,,


In [4]:
# Preview the first three rows of the model predictions.
merged.head(3)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit
0,0,0.0,0,2006.474841,2006.474841
1,1,0.0,0,2115.739438,2115.739438
2,2,328.836,1,95.451656,-233.384344


In every model's notebook we do this code ...

``` python
applicants_data["predicted_XXX"] = XXX
applicants_data["applicant_id"] = applicants_data.index
```

... Which means the applicant_id is the same as the index. So we can just merge on index

In [5]:
# The join below pairs row i of `merged` with row i of `score`. That is only correct
# while both frames have the same number of rows and `applicant_id` still equals the
# row index, so check both explicitly: an index join would otherwise silently attach
# predictions to the wrong applicants.
assert len(merged) == len(score), "merged and score must have the same number of rows"
assert (merged["applicant_id"].to_numpy() == merged.index.to_numpy()).all(), \
    "applicant_id no longer matches the row index of merged"

# validate="one_to_one" makes pandas raise a MergeError if either index has duplicates.
merged_with_score = pd.merge(
    merged,
    score,
    left_index=True,
    right_index=True,
    how="left",
    validate="one_to_one",
)

### **Testing if it merged correctly**

In [6]:
# Put two raw applicant columns next to the id and the prediction so the join
# can be compared against the source frames by eye.
merged_check = merged_with_score.loc[
    :, ["income_am", "profit_am", "applicant_id", "predicted_profit"]
]
merged_check

Unnamed: 0,income_am,profit_am,applicant_id,predicted_profit
0,5660.0,8640.0,0,2006.474841
1,3990.0,3450.0,1,2115.739438
2,1158.0,4194.0,2,-233.384344
3,2451.0,2119.0,3,1594.570450
4,946.0,2036.0,4,1470.689027
...,...,...,...,...
495,820.0,7794.0,495,2706.903545
496,6092.0,3137.0,496,2477.268876
497,2301.0,2516.0,497,1890.432834
498,492.0,3716.0,498,2117.525376


APPLICANT 100

In [7]:
# Spot check, part 1: the predictions stored at position 100 of `merged`.
merged.iloc[100]

applicant_id                  100.000000
predicted_damage_amount         0.000000
predicted_damage_incident       0.000000
predicted_revenue            5400.725472
predicted_profit             5400.725472
Name: 100, dtype: float64

In [8]:
# Spot check, part 2: the raw income and profit stored at position 100 of `score`.
score.iloc[100][["income_am", "profit_am"]]

income_am    122993.0
profit_am     42142.0
Name: 100, dtype: object

In [9]:
# Spot check, part 3: the joined row at position 100 should show the values from both cells above.
merged_check.iloc[100]

income_am           122993.000000
profit_am            42142.000000
applicant_id           100.000000
predicted_profit      5400.725472
Name: 100, dtype: float64

## 3. Get the 200 guests with the highest predicted profit

In [10]:
# Rank all applicants by predicted profit (highest first) and keep the top 200.
highest_profit = (
    merged_with_score
    .sort_values(by="predicted_profit", ascending=False)
    .head(200)
)

# Put the selected applicants back in applicant_id order so they are easy to look up.
highest_profit_unsorted = highest_profit.sort_values(by="applicant_id", ascending=True)

highest_profit_unsorted.head(40)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit,income_am,profit_last_am,profit_am,damage_am,damage_inc,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,0,0.0,0,2006.474841,2006.474841,5660.0,4320.0,8640.0,0.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,1,0.0,0,2115.739438,2115.739438,3990.0,9.0,3450.0,0.0,0.0,...,,,,,,,,,,
7,7,112.164,1,2097.122193,1984.958193,2591.0,869.0,2209.0,0.0,0.0,...,,,,,0.580498,0.585224,,,0.09795,5.28209
8,8,0.0,0,3029.583549,3029.583549,6426.0,2018.0,9015.0,2333.0,2.0,...,,,,,0.576605,0.277479,,,,
15,15,0.0,0,2099.704475,2099.704475,1190.0,325.0,6688.0,1676.0,2.0,...,,,,,0.245555,0.164753,0.165831,0.097346,,
19,19,0.0,0,1938.393301,1938.393301,227.0,0.0,1620.0,0.0,0.0,...,0.971386,0.679476,,,,,,,,
21,21,0.0,0,1924.944795,1924.944795,2093.0,681.0,2458.0,1219.0,4.0,...,,,,,0.061794,0.359742,,,0.330059,6.956439
24,24,0.0,0,1974.962854,1974.962854,227.0,0.0,1632.0,0.0,0.0,...,,,,,,,,,,
25,25,0.0,0,3057.184808,3057.184808,10406.0,645.0,21633.0,0.0,0.0,...,0.293698,0.064613,0.428241,0.649497,0.402736,0.300693,,,,
26,26,0.0,0,1951.803386,1951.803386,1343.0,374.0,1816.0,0.0,0.0,...,,,0.160819,0.251773,,,,,,


In [11]:
# The deliverable only needs the applicant id and the model outputs,
# so the raw score columns are left out.
final_list = highest_profit_unsorted.loc[
    :,
    [
        "applicant_id",
        "predicted_damage_amount",
        "predicted_damage_incident",
        "predicted_revenue",
        "predicted_profit",
    ],
]

# Second variant with every applicant_id shifted by 2. It is built as a new frame,
# so `final_list` itself is left untouched.
final_list_with_matching_row_number = final_list.assign(
    applicant_id=final_list["applicant_id"] + 2
)

In [12]:
# Write both guest-list variants to disk; index=False leaves the DataFrame index out of the files.
final_list.to_csv(path_or_buf="./guestlist/guestlist.csv", index=False)
final_list_with_matching_row_number.to_csv(
    path_or_buf="./guestlist/guestlist_with_matching_row_number.csv",
    index=False,
)

## **! The applicant_id and index do not match the original row_number in the score.csv**

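They are 2 off because pandas numbers rows from 0 and does not count the header line, while the row numbers of the CSV file (as shown in a spreadsheet or text editor) start at 1 and include the header. E.g. applicant 100 in `highest_profit_unsorted` is actually row 102 in `score.csv`.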

You can tell because the `income_am` of row 102 in `score.csv` is 122993.0,
and the `income_am` at index 100 of the `score` DataFrame is also 122993.0.




In [13]:
# Position 100 in the DataFrame is row 102 of `score.csv`:
# the header occupies row 1 and file rows are counted from 1, not 0.
score.iloc[100]["income_am"]

122993.0

![screenshot that shows that index is off by 2](./public/screenshot.png)

### **HOWEVER**

It doesn't really matter.

If you are going by the row numbers in `score.csv`, just subtract 2 from the row number to get the `applicant_id`.


If row 3 (with `income_am` of 3990.0) should have been approved, it appears here as `applicant_id` 1.

#### **Here are the 200 applicants that, according to my models, should be approved**

In [14]:
# The index labels of `final_list` identify the selected rows.
result = final_list.index.to_numpy()

# Optional export of the selected row labels (currently disabled):
# np.savetxt("./guestlist/rows/result.csv", result, delimiter=",", fmt="%d")

result

array([  0,   1,   7,   8,  15,  19,  21,  24,  25,  26,  31,  43,  45,
        48,  50,  52,  53,  54,  55,  58,  62,  64,  66,  67,  68,  69,
        71,  72,  74,  75,  77,  78,  79,  81,  84,  85,  90,  91,  94,
        96,  98, 100, 102, 103, 104, 105, 117, 119, 121, 122, 124, 126,
       132, 140, 142, 146, 147, 149, 150, 152, 153, 154, 155, 158, 160,
       162, 163, 170, 171, 172, 174, 175, 176, 178, 179, 182, 183, 184,
       190, 191, 192, 193, 196, 197, 198, 199, 201, 202, 205, 208, 209,
       210, 217, 221, 222, 223, 230, 236, 237, 240, 245, 246, 251, 252,
       253, 254, 256, 259, 260, 261, 262, 264, 265, 267, 270, 274, 277,
       280, 282, 284, 288, 291, 292, 295, 299, 303, 304, 306, 307, 308,
       315, 317, 323, 326, 327, 328, 329, 334, 335, 338, 341, 342, 343,
       349, 350, 353, 354, 357, 358, 361, 371, 373, 379, 388, 391, 393,
       394, 398, 400, 401, 402, 404, 405, 407, 410, 414, 424, 426, 431,
       433, 436, 437, 439, 443, 448, 449, 450, 451, 453, 455, 45

#### **And here they are, with the number matching the row number from `score.csv`**

In [15]:
# Shift every selected index label by 2 so it lines up with the row number in the CSV file.
result_with_matching_row_number = final_list.index.to_numpy() + 2

# Optional export of the shifted row numbers (currently disabled):
# np.savetxt("./guestlist/rows/result_with_matching_row_number.csv", result_with_matching_row_number, delimiter=",", fmt="%d")

result_with_matching_row_number

array([  2,   3,   9,  10,  17,  21,  23,  26,  27,  28,  33,  45,  47,
        50,  52,  54,  55,  56,  57,  60,  64,  66,  68,  69,  70,  71,
        73,  74,  76,  77,  79,  80,  81,  83,  86,  87,  92,  93,  96,
        98, 100, 102, 104, 105, 106, 107, 119, 121, 123, 124, 126, 128,
       134, 142, 144, 148, 149, 151, 152, 154, 155, 156, 157, 160, 162,
       164, 165, 172, 173, 174, 176, 177, 178, 180, 181, 184, 185, 186,
       192, 193, 194, 195, 198, 199, 200, 201, 203, 204, 207, 210, 211,
       212, 219, 223, 224, 225, 232, 238, 239, 242, 247, 248, 253, 254,
       255, 256, 258, 261, 262, 263, 264, 266, 267, 269, 272, 276, 279,
       282, 284, 286, 290, 293, 294, 297, 301, 305, 306, 308, 309, 310,
       317, 319, 325, 328, 329, 330, 331, 336, 337, 340, 343, 344, 345,
       351, 352, 355, 356, 359, 360, 363, 373, 375, 381, 390, 393, 395,
       396, 400, 402, 403, 404, 406, 407, 409, 412, 416, 426, 428, 433,
       435, 438, 439, 441, 445, 450, 451, 452, 453, 455, 457, 45

Example:

applicant_id 1 became row 3: the guest with `income_am` of 3990.0

#### **The list of accepted applicants, with the other metrics (the deliverable), can be found in**

`./guestlist/guestlist.csv` and `./guestlist/guestlist_with_matching_row_number.csv`