## 0. Configs

In [1]:
import sys
sys.path.append("../../util/")

from calculate_scores import calculate_scores

In [2]:
import locale
locale.setlocale(locale.LC_ALL, '')

def as_currency(amount):
    return locale.currency(amount, grouping=True)

In [3]:
import warnings

warnings.filterwarnings('ignore')

## 1. Import libraries and data

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("../../data/cleaned/train_V2_cleaned.csv")
df.head()

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,shop_use,retired,gold_status,outcome_profit,outcome_damage_inc,outcome_damage_amount,gender_M,gender_V,combined_pos_score,combined_neg_score
0,268.0,16.0,1682.0,0.0,0.0,750.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1672.78,1,829.66,1,0,0.0,7.955259
1,283.0,23.0,1673.0,0.0,0.0,750.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1001.4,0,0.0,1,0,0.33433,1.842549
2,227.0,0.0,1685.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1785.59,0,0.0,0,1,0.0,0.889793
3,227.0,0.0,1620.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.151873,0.0,0.0,1127.05,0,0.0,0,1,0.0,0.0
4,229.0,810.0,1620.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1640.31,0,0.0,1,0,0.0,6.894609


In [5]:
df.columns

Index(['income_am', 'profit_last_am', 'profit_am', 'damage_am', 'damage_inc',
       'crd_lim_rec', 'credit_use_ic', 'gluten_ic', 'lactose_ic',
       'insurance_ic', 'spa_ic', 'empl_ic', 'cab_requests', 'married_cd',
       'bar_no', 'sport_ic', 'neighbor_income', 'age', 'marketing_permit',
       'urban_ic', 'dining_ic', 'presidential', 'client_segment', 'sect_empl',
       'prev_stay', 'prev_all_in_stay', 'divorce', 'fam_adult_size',
       'children_no', 'tenure_mts', 'tenure_yrs', 'company_ic', 'claims_no',
       'claims_am', 'nights_booked', 'shop_am', 'shop_use', 'retired',
       'gold_status', 'outcome_profit', 'outcome_damage_inc',
       'outcome_damage_amount', 'gender_M', 'gender_V', 'combined_pos_score',
       'combined_neg_score'],
      dtype='object')

## 2. Preparation for model training

### **X, y split**

Split the columns which I will need to train a model. 

In [6]:
X = df.drop(['outcome_damage_amount', 'outcome_profit', 'outcome_damage_inc'], axis=1)
y = df['outcome_damage_amount']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

## 3. Train both models

In [8]:
from sklearn.model_selection import GridSearchCV

### **1. Lasso**

Get best parameters using GridSearchCV

In [9]:
from sklearn.linear_model import Lasso

lasso = Lasso()
lasso_param_grid = {'alpha': np.linspace(0.01, 10.0, num=100)}
lasso_grid_search = GridSearchCV(lasso, param_grid=lasso_param_grid, cv=5)
lasso_grid_search.fit(X, y)
print("Best Lasso Parameters:", lasso_grid_search.best_params_)

Best Lasso Parameters: {'alpha': 3.037272727272727}


Make prediction on test set

In [10]:
lasso_pred = lasso_grid_search.predict(X_test)

### **2. Ridge**

In [11]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge_param_grid = {'alpha': np.linspace(0.01, 10.0, num=100)}
ridge_grid_search = GridSearchCV(ridge, param_grid=ridge_param_grid, cv=5)
ridge_grid_search.fit(X, y)
print("Best Ridge Parameters:", ridge_grid_search.best_params_)

Best Ridge Parameters: {'alpha': 10.0}


Make prediction on test set

In [12]:
ridge_pred = ridge_grid_search.predict(X_test)

### **Compare**

In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [14]:
# Metrics for RIDGE
mse_ridge = np.sqrt(mean_squared_error(y_test, ridge_pred))
mae_ridge = mean_absolute_error(y_test, ridge_pred)
r2_ridge = r2_score(y_test, ridge_pred)

# Metrics for LASSO
mse_lasso = np.sqrt(mean_squared_error(y_test, lasso_pred))
mae_lasso = mean_absolute_error(y_test, lasso_pred)
r2_lasso = r2_score(y_test, lasso_pred)

In [15]:
all_MSEs = [mse_ridge, mse_lasso]
all_MAEs = [mae_ridge, mae_lasso]
all_R2s  = [ r2_ridge,  r2_lasso]

models = pd.DataFrame({ 'Model': ['Ridge','Lasso'],
                        'Root MSE': all_MSEs,
                        'MAE': all_MAEs,
                        'R2 Score': all_R2s,
                        })

# FE = Feature Engineering
FE = "Root MSE + MAE"
models[FE] = ((models["Root MSE"] + models["MAE"]))  

models.sort_values(by = FE, ascending = True, ignore_index = True)

Unnamed: 0,Model,Root MSE,MAE,R2 Score,Root MSE + MAE
0,Ridge,286.914007,206.954705,0.089215,493.868711
1,Lasso,287.694755,207.970853,0.084252,495.665609


## 4. Make prediction on `score.csv` using best model

### **Import applicant data CSV**, and make adjustments

In [16]:
applicants_data = pd.read_csv("../../data/score.csv")

"`get_dummies`" the same columns as we did for the training data

In [17]:
applicants_data = pd.get_dummies(applicants_data, columns=['gender'])

# execute `calculate_scores` on the DF, to calculate the scores
applicants_data = calculate_scores(applicants_data.copy())

applicants_data.fillna(0, inplace=True)

### **Make prediction on the test set**

In [18]:
predicted_damage_amount = ridge_grid_search.predict(applicants_data)

## 5. Score applicants

I'm not actually scoring applicants (read `IMPORTANT`)

### **Make new DataFrame with `predicted_revenue`**

In [19]:
applicants_data["predicted_damage_amount"] = predicted_damage_amount
applicants_data["applicant_id"] = applicants_data.index

applicants_with_predicted_damage_amount = applicants_data[["applicant_id", 'predicted_damage_amount']]
# applicants_with_predicted_damage_amount = applicants_with_predicted_damage_amount.sort_values(by='predicted_damage_amount', ascending=False)

In [20]:
applicants_with_predicted_damage_amount

Unnamed: 0,applicant_id,predicted_damage_amount
0,0,173.082543
1,1,169.469756
2,2,385.169337
3,3,199.216735
4,4,10.005540
...,...,...
495,495,165.396002
496,496,214.415460
497,497,161.060831
498,498,62.136630


### **Export dataframe**

In [21]:
applicants_with_predicted_damage_amount.to_csv("../../data/exported/predicted_damage_amount.csv", index=False)

# **IMPORTANT**

in the future I will merge all 3 dataframes with each other, and then calculate the outcome_profit and select the top 200