## 1. Import libraries and data

In [1]:
import pandas as pd
import numpy as np

# Model predictions: one row per applicant_id with the predicted_* columns.
predictions_csv = "./data/exported/merged.csv"
# Applicant features and score columns.
score_csv = "./data/score.csv"

merged = pd.read_csv(predictions_csv)
score = pd.read_csv(score_csv)

## 2. Merge the merged df with score.csv

> we need to do this, so we can see which applicant_id matches with which applicant

Checking that both DataFrames have the same number of rows

In [2]:
# Show both shapes so the reader can compare row and column counts.
print(merged.shape)
print(score.shape)

# Enforce the check instead of relying on eyeballing the printed shapes:
# the merge further down pairs prediction rows with score rows one-to-one.
assert len(merged) == len(score), (
    f"row count mismatch: merged has {len(merged)} rows, score has {len(score)}"
)

(500, 5)
(500, 50)


In [3]:
# Preview the first rows to see which feature and score columns are available.
score.head(n=3)

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,5660.0,4320.0,8640.0,0.0,0.0,8000.0,0.0,0.0,1.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,3990.0,9.0,3450.0,0.0,0.0,12500.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2,1158.0,82.0,4194.0,408.0,4.0,12000.0,0.0,0.0,0.0,1.0,...,0.009811,0.592842,,,0.252444,0.724693,0.818064,0.387361,,


In [4]:
# Preview the prediction columns that will be joined to the score data.
merged.head(n=3)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit
0,0,0.0,0,2125.775778,2125.775778
1,1,0.0,0,2253.402897,2253.402897
2,2,0.0,0,-22.878184,-22.878184


In every model's notebook we do this code ...

``` python
applicants_data["predicted_XXX"] = XXX
applicants_data["applicant_id"] = applicants_data.index
```

... which means `applicant_id` is the same as the row index of the applicant data, so each prediction can be matched to its row in `score` through that index.

In [5]:
# Attach each applicant's features/scores to its prediction row.
# applicant_id was created from the applicant data's row index, so match it
# against the index of `score` explicitly rather than pairing rows by position;
# this stays correct even if merged.csv is ever re-sorted or filtered.
# how="left" keeps every prediction row and the left frame's index.
# validate="one_to_one" raises a MergeError if an applicant_id (or a score
# index label) is duplicated, instead of silently multiplying rows.
merged_with_score = pd.merge(
    merged,
    score,
    left_on="applicant_id",
    right_index=True,
    how="left",
    validate="one_to_one",
)

### **Testing if it merged correctly**

In [6]:
# Two columns from score plus two from the predictions are enough to verify
# by eye that rows from both sources were paired correctly.
check_columns = ["income_am", "profit_am", "applicant_id", "predicted_profit"]
merged_check = merged_with_score[check_columns]
merged_check

Unnamed: 0,income_am,profit_am,applicant_id,predicted_profit
0,5660.0,8640.0,0,2125.775778
1,3990.0,3450.0,1,2253.402897
2,1158.0,4194.0,2,-22.878184
3,2451.0,2119.0,3,1762.829728
4,946.0,2036.0,4,1542.663912
...,...,...,...,...
495,820.0,7794.0,495,2871.408578
496,6092.0,3137.0,496,2628.631360
497,2301.0,2516.0,497,2031.616643
498,492.0,3716.0,498,2191.590690


APPLICANT 100

In [7]:
# Prediction row at position 100, taken from the frame before the merge.
merged.iloc[100]

applicant_id                  100.000000
predicted_damage_amount         0.000000
predicted_damage_incident       0.000000
predicted_revenue            6485.252989
predicted_profit             6485.252989
Name: 100, dtype: float64

In [8]:
# Same position in the score data; these two values should reappear in the merged row.
score.iloc[100][["income_am", "profit_am"]]

income_am    122993.0
profit_am     42142.0
Name: 100, dtype: object

In [9]:
# Merged row at position 100: it should combine the values shown by the two cells above.
merged_check.iloc[100]

income_am           122993.000000
profit_am            42142.000000
applicant_id           100.000000
predicted_profit      6485.252989
Name: 100, dtype: float64

## 3. Get the 150 guests with the highest predicted profit

In [10]:
# Rank all applicants by predicted profit (best first) and keep the top 150.
highest_profit = (
    merged_with_score
    .sort_values("predicted_profit", ascending=False)
    .head(150)
)

# Despite the "_unsorted" name, this frame is ordered by applicant_id, i.e.
# the profit ranking is dropped and the original applicant order is restored.
highest_profit_unsorted = highest_profit.sort_values("applicant_id")

highest_profit_unsorted.head(40)

Unnamed: 0,applicant_id,predicted_damage_amount,predicted_damage_incident,predicted_revenue,predicted_profit,income_am,profit_last_am,profit_am,damage_am,damage_inc,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,0,0.0,0,2125.775778,2125.775778,5660.0,4320.0,8640.0,0.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,1,0.0,0,2253.402897,2253.402897,3990.0,9.0,3450.0,0.0,0.0,...,,,,,,,,,,
7,7,0.0,0,2237.384491,2237.384491,2591.0,869.0,2209.0,0.0,0.0,...,,,,,0.580498,0.585224,,,0.09795,5.28209
8,8,0.0,0,3345.808522,3345.808522,6426.0,2018.0,9015.0,2333.0,2.0,...,,,,,0.576605,0.277479,,,,
15,15,0.0,0,2361.134628,2361.134628,1190.0,325.0,6688.0,1676.0,2.0,...,,,,,0.245555,0.164753,0.165831,0.097346,,
25,25,0.0,0,3191.459446,3191.459446,10406.0,645.0,21633.0,0.0,0.0,...,0.293698,0.064613,0.428241,0.649497,0.402736,0.300693,,,,
31,31,0.0,0,2792.73182,2792.73182,14556.0,3440.0,4013.0,0.0,0.0,...,,,,,0.700267,0.399948,,,,
38,38,0.0,0,2092.046612,2092.046612,1909.0,671.0,4239.0,1864.0,3.0,...,,,,,,,0.833463,0.975494,0.472664,8.995981
45,45,0.0,0,4573.05313,4573.05313,41694.0,8178.0,21818.0,236.0,2.0,...,,,,,0.84747,0.847966,0.638914,0.659615,,
48,48,0.0,0,3669.262192,3669.262192,23268.0,543.0,20942.0,0.0,0.0,...,0.35543,0.559121,,,0.317268,0.267638,,,,


## **! The applicant_id and index do not match the original row number in `score.csv`**

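They are off by 2 because line 1 of `score.csv` is the header row and pandas numbers the data rows from 0, while a file viewer numbers lines from 1. E.g. applicant 100 in `highest_profit_unsorted` is actually row 102 in `score.csv`.

You can tell because the `income_am` of row 102 in `score.csv` is 122993.0
and in `score` the `income_am` of index 100 is 122993.0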




In [11]:
# Index 100 in `score` is row 102 in `score.csv`:
# row 1 of the file is the header and pandas numbers the data rows from 0.
score.iloc[100]["income_am"]

122993.0

![screenshot that shows that index is off by 2](./public/screenshot.png)

### **HOWEVER**

It doesn't really matter.

If you are going by which row number in `score.csv` got approved, then you can just subtract 2 to get the `applicant_id`


If row 3 (with `income_am` of 3990.0) should have been approved, then that corresponds to `applicant_id` 1

#### **Here are, according to my models, which 150 applicants should be approved**

In [26]:
# Export the applicant ids themselves rather than the DataFrame's row labels:
# the two only coincide while the frame still carries its original index.
result = highest_profit_unsorted["applicant_id"].to_numpy()

# One integer applicant_id per line.
np.savetxt("./guestlist/result.csv", result, delimiter=",", fmt="%d")

result

array([  0,   1,   7,   8,  15,  25,  31,  38,  45,  48,  50,  53,  54,
        55,  58,  62,  64,  66,  67,  68,  71,  74,  75,  77,  78,  81,
        84,  85,  91,  96,  98, 100, 102, 104, 105, 117, 119, 121, 122,
       124, 132, 139, 140, 146, 149, 150, 152, 153, 154, 155, 158, 160,
       162, 163, 170, 171, 174, 175, 178, 179, 183, 184, 190, 191, 192,
       196, 198, 199, 201, 205, 208, 209, 210, 215, 217, 218, 223, 237,
       240, 246, 251, 253, 254, 256, 260, 262, 264, 265, 267, 270, 274,
       277, 288, 291, 295, 299, 303, 304, 306, 307, 317, 323, 334, 335,
       338, 341, 342, 349, 350, 353, 357, 358, 361, 368, 371, 373, 379,
       388, 391, 400, 402, 405, 424, 426, 431, 433, 436, 437, 443, 448,
       449, 456, 458, 459, 460, 463, 464, 465, 472, 473, 475, 476, 481,
       484, 490, 491, 494, 495, 496, 498], dtype=int64)

#### **And here they are, with the number matching the row number from `score.csv`**

In [27]:
# Row number in score.csv = applicant_id + 2: row 1 of the file is the header
# and pandas numbers the data rows from 0, while the file is counted from 1.
CSV_ROW_OFFSET = 2

# Start from the applicant_id column (not the row labels) so the row numbers
# stay correct even if the frame's index no longer equals applicant_id.
result_with_matching_row_number = (
    highest_profit_unsorted["applicant_id"].to_numpy() + CSV_ROW_OFFSET
)

# One integer row number per line.
np.savetxt("./guestlist/result_with_matching_row_number.csv", result_with_matching_row_number, delimiter=",", fmt="%d")

result_with_matching_row_number

array([  2,   3,   9,  10,  17,  27,  33,  40,  47,  50,  52,  55,  56,
        57,  60,  64,  66,  68,  69,  70,  73,  76,  77,  79,  80,  83,
        86,  87,  93,  98, 100, 102, 104, 106, 107, 119, 121, 123, 124,
       126, 134, 141, 142, 148, 151, 152, 154, 155, 156, 157, 160, 162,
       164, 165, 172, 173, 176, 177, 180, 181, 185, 186, 192, 193, 194,
       198, 200, 201, 203, 207, 210, 211, 212, 217, 219, 220, 225, 239,
       242, 248, 253, 255, 256, 258, 262, 264, 266, 267, 269, 272, 276,
       279, 290, 293, 297, 301, 305, 306, 308, 309, 319, 325, 336, 337,
       340, 343, 344, 351, 352, 355, 359, 360, 363, 370, 373, 375, 381,
       390, 393, 402, 404, 407, 426, 428, 433, 435, 438, 439, 445, 450,
       451, 458, 460, 461, 462, 465, 466, 467, 474, 475, 477, 478, 483,
       486, 492, 493, 496, 497, 498, 500], dtype=int64)

Example:

applicant_id 1 became row 3: the guest with `income_am` of 3990.0

#### **The list of accepted applicants can be found in**

`./guestlist/result.csv` and `./guestlist/result_with_matching_row_number.csv`