In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# PREPROCESSING

In [2]:
train = pd.read_excel("train.xlsx")
train.drop_duplicates(inplace = True)
train["Experience"] = train.Experience.apply(lambda x : ''.join(x.split(' ')[:1]))
train['Experience'] = pd.to_numeric(train['Experience'])
train["Rating"] = train.Rating.apply(lambda x: ''.join(str(x).split('%')[:1]) if pd.notnull(x) else x)
train['Rating'] = pd.to_numeric(train['Rating'], errors = 'coerce')
train['Rating'] = train['Rating'].fillna(0)
train['Rating'] = train['Rating'].astype('int64')
train['Place'].fillna('missing', inplace=True)
train["Area"] = train.Place.apply(lambda x: ''.join(str(x).split(',')[:1]) if pd.notnull(x) else x)
train["City"] = train.Place.apply(lambda x: ''.join(str(x).split(',')[1:]) if pd.notnull(x) else x)
train["City"] = train.City.apply(lambda x: ''.join(str(x).split(' ')[1:]) if pd.notnull(x) else x)
train['Miscellaneous_Info'].fillna('missing', inplace = True)
train["Feedbacks"] = train.Miscellaneous_Info.apply(lambda x: ''.join(str(x).split('%')[1:]) if pd.notnull(x) else x)
train["Feedbacks"] = train.Feedbacks.apply(lambda x: ''.join(str(x).split('F')[:1]) if pd.notnull(x) else x)
train["Feedbacks"] = train.Feedbacks.apply(lambda x: ''.join(str(x).split(' ')[1:]) if pd.notnull(x) else x)
train.loc[train['Feedbacks'] == '', 'Feedbacks'] = '0'
train['Feedbacks'] = pd.to_numeric(train['Feedbacks'], errors = 'coerce')
train['Feedbacks'].fillna(0, inplace = True)
train['Feedbacks'] = train['Feedbacks'].astype('int64')
train["Misc_Fees"] = train.Miscellaneous_Info.apply(lambda x: ''.join(str(x).split('₹')[1:]) if pd.notnull(x) else x)
train['Misc_Fees'] = train['Misc_Fees'].str.replace(',', '')
train["Misc_Fees"] = train.Misc_Fees.apply(lambda x: ''.join(str(x).split(' ')[:1]) if pd.notnull(x) else x)
train.loc[train['Misc_Fees'] == '', 'Misc_Fees'] = '0'
train['Misc_Fees'] = pd.to_numeric(train['Misc_Fees'], errors = 'coerce')
train['Misc_Fees'].fillna(0, inplace = True)
train['Misc_Fees'] = train['Misc_Fees'].astype('int64')
train.drop_duplicates(inplace = True)
train.reset_index(drop = True, inplace = True)

## WINSORIZING OUTLIERS

In [3]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.rcParams['figure.figsize'] = (15,7)
# f,(ax1, ax2) = plt.subplots(1,2)
# sns.boxplot(y = 'Experience', data = train, ax = ax1, palette = 'coolwarm')
# sns.boxplot(y = 'Rating', data = train, ax = ax2)
# f.tight_layout()

In [4]:
# for column in ['Experience', 'Rating']:
#     IQR = train[column].quantile(0.75) - train[column].quantile(0.25)
#     Lower_fence = train[column].quantile(0.25) - (IQR * 1.5)
#     Upper_fence = train[column].quantile(0.75) + (IQR * 1.5)
#     print(f'{column} outliers are values < {round(Lower_fence,2)} or > {round(Upper_fence,2)}')

In [5]:
# train['Experience'] = np.where(train['Experience'] > 44.0, 44.0, train['Experience'])
# train['Rating'] = np.where(train['Rating'] < 91.0, 91.0, train['Rating'])
# train['Rating'] = np.where(train['Rating'] > 99.0, 99.0, train['Rating'])

## DUMMIFICATION, MIN-MAX SCALING

In [3]:
X1 = pd.get_dummies(train[['Profile', 'City']], drop_first = True)

# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# scaled1 = scaler.fit_transform(train[['Experience', 'Rating']])
# scaled1 = pd.DataFrame(scaled1, columns = ['Experience', 'Rating'])

X1 = pd.concat([train[['Experience', 'Rating', 'Misc_Fees', 'Feedbacks']], X1], axis = 1)

## XGB RANDOM FOREST REGRESSOR

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X1, train['Fees'], test_size = 0.3, random_state = 1234)

from xgboost import XGBRFRegressor
model = XGBRFRegressor(random_state = 1234)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
result_train = 1 - np.sqrt(np.square(np.log10(y_pred_train +1) - np.log10(y_train +1)).mean())
y_pred_test = model.predict(X_test)
result_test = 1 - np.sqrt(np.square(np.log10(y_pred_test +1) - np.log10(y_test +1)).mean())
print("Train Result:", result_train)
print("Test Result:", result_test)

Train Result: 0.7260061827421758
Test Result: 0.7268551032763886


## RFECV

In [5]:
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator = model, step = 1, cv = 5, scoring = 'r2')
rfecv = rfecv.fit(X_train, y_train)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", X_train.columns[rfecv.support_])

The optimal number of features: 16
Best features: Index(['Experience', 'Rating', 'Misc_Fees', 'Feedbacks', 'Profile_Dentist',
       'Profile_Dermatologists', 'Profile_ENT Specialist',
       'Profile_General Medicine', 'Profile_Homeopath', 'City_Bangalore',
       'City_Chennai', 'City_Coimbatore', 'City_Delhi', 'City_Ernakulam',
       'City_Mumbai', 'City_Thiruvananthapuram'],
      dtype='object')


In [6]:
X_train_selected = rfecv.transform(X_train)
X_test_selected = rfecv.transform(X_test)

model = XGBRFRegressor(random_state = 1234)
model.fit(X_train_selected, y_train)
y_pred_train = model.predict(X_train_selected)
result_train = 1 - np.sqrt(np.square(np.log10(y_pred_train +1) - np.log10(y_train +1)).mean())
y_pred_test = model.predict(X_test_selected)
result_test = 1 - np.sqrt(np.square(np.log10(y_pred_test +1) - np.log10(y_test +1)).mean())
print("Train Result:", result_train)
print("Test Result:", result_test)

Train Result: 0.7256591763365267
Test Result: 0.7266961484398649


In [7]:
import optuna

# def custom_metric(y_true, y_pred):
#     return 1 - np.sqrt(np.square(np.log10(y_pred_test +1) - np.log10(y_test +1)).mean())

def objective(trial):
    param_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
        #'objective': 'reg:msle',
        #'eval_metric': custom_metric
    }
    
    model = XGBRFRegressor(random_state = 1234, **param_space)
    model.fit(X_train_selected, y_train)
    y_pred_test = model.predict(X_test_selected)
    result_test = 1 - np.sqrt(np.square(np.log10(y_pred_test +1) - np.log10(y_test +1)).mean())
    return result_test

study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 20, show_progress_bar = True)

[I 2023-06-20 16:05:56,672] A new study created in memory with name: no-name-7784f8ff-a00f-453b-a726-b4eaa42706f6


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2023-06-20 16:05:57,409] Trial 0 finished with value: -0.02691558977087305 and parameters: {'n_estimators': 223, 'max_depth': 7, 'learning_rate': 0.08225016588110448, 'subsample': 0.6566240518508436, 'colsample_bytree': 0.8421330375094473, 'reg_alpha': 0.9791637022628278, 'reg_lambda': 0.8181412397082952}. Best is trial 0 with value: -0.02691558977087305.
[I 2023-06-20 16:05:58,970] Trial 1 finished with value: -0.13507724045917535 and parameters: {'n_estimators': 476, 'max_depth': 8, 'learning_rate': 0.06266092649114956, 'subsample': 0.552698767421453, 'colsample_bytree': 0.9129394771480208, 'reg_alpha': 0.48281094353963727, 'reg_lambda': 0.22941959679234036}. Best is trial 0 with value: -0.02691558977087305.
[I 2023-06-20 16:06:00,439] Trial 2 finished with value: -0.8874083385848384 and parameters: {'n_estimators': 794, 'max_depth': 5, 'learning_rate': 0.0064614754232904725, 'subsample': 0.8106757209436106, 'colsample_bytree': 0.8999733096458864, 'reg_alpha': 0.6747171994340752, 

In [9]:
model = XGBRFRegressor(random_state = 1234, **trial.params)
model.fit(X_train_selected, y_train)
y_pred_train = model.predict(X_train_selected)
result_train = 1 - np.sqrt(np.square(np.log10(y_pred_train +1) - np.log10(y_train +1)).mean())
y_pred_test = model.predict(X_test_selected)
result_test = 1 - np.sqrt(np.square(np.log10(y_pred_test +1) - np.log10(y_test +1)).mean())
print("Train Result:", result_train)
print("Test Result:", result_test)

NameError: name 'trial' is not defined

# SUBMISSION

## PREPROCESSING

In [56]:
test = pd.read_excel("test.xlsx")
test["Experience"] = test.Experience.apply(lambda x : ''.join(x.split(' ')[:1]))
test['Experience'] = pd.to_numeric(test['Experience'])
test["Rating"] = test.Rating.apply(lambda x: ''.join(str(x).split('%')[:1]) if pd.notnull(x) else x)
test['Rating'] = pd.to_numeric(test['Rating'], errors = 'coerce')
test['Rating'] = test['Rating'].fillna(0)
test['Rating'] = test['Rating'].astype('int64')
test['Place'].fillna('missing', inplace = True)
test["Area"] = test.Place.apply(lambda x: ''.join(str(x).split(',')[:1]) if pd.notnull(x) else x)
test["City"] = test.Place.apply(lambda x: ''.join(str(x).split(',')[1:]) if pd.notnull(x) else x)
test["City"] = test.City.apply(lambda x: ''.join(str(x).split(' ')[1:]) if pd.notnull(x) else x)
test['Miscellaneous_Info'].fillna('missing', inplace = True)
test["Feedbacks"] = test.Miscellaneous_Info.apply(lambda x: ''.join(str(x).split('%')[1:]) if pd.notnull(x) else x)
test["Feedbacks"] = test.Feedbacks.apply(lambda x: ''.join(str(x).split('F')[:1]) if pd.notnull(x) else x)
test["Feedbacks"] = test.Feedbacks.apply(lambda x: ''.join(str(x).split(' ')[1:]) if pd.notnull(x) else x)
test.loc[test['Feedbacks'] == '', 'Feedbacks'] = '0'
test['Feedbacks'] = pd.to_numeric(test['Feedbacks'], errors = 'coerce')
test['Feedbacks'].fillna(0, inplace = True)
test['Feedbacks'] = test['Feedbacks'].astype('int64')
test["Misc_Fees"] = test.Miscellaneous_Info.apply(lambda x: ''.join(str(x).split('₹')[1:]) if pd.notnull(x) else x)
test['Misc_Fees'] = test['Misc_Fees'].str.replace(',', '')
test["Misc_Fees"] = test.Misc_Fees.apply(lambda x: ''.join(str(x).split(' ')[:1]) if pd.notnull(x) else x)
test.loc[test['Misc_Fees'] == '', 'Misc_Fees'] = '0'
test['Misc_Fees'] = pd.to_numeric(test['Misc_Fees'], errors = 'coerce')
test['Misc_Fees'].fillna(0, inplace = True)
test['Misc_Fees'] = test['Misc_Fees'].astype('int64')

## WINSORIZING OUTLIERS

In [9]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.rcParams['figure.figsize'] = (15,7)
# f,(ax1, ax2) = plt.subplots(1,2)
# sns.boxplot(y = 'Experience', data = test, ax = ax1, palette = 'coolwarm')
# sns.boxplot(y = 'Rating', data = test, ax = ax2)
# f.tight_layout()

In [10]:
# for column in ['Experience', 'Rating']:
#     IQR = test[column].quantile(0.75) - test[column].quantile(0.25)
#     Lower_fence = test[column].quantile(0.25) - (IQR * 1.5)
#     Upper_fence = test[column].quantile(0.75) + (IQR * 1.5)
#     print(f'{column} outliers are values < {round(Lower_fence,2)} or > {round(Upper_fence,2)}')

In [11]:
# test['Experience'] = np.where(test['Experience'] > 46.5, 46.5, test['Experience'])
# test['Rating'] = np.where(test['Rating'] < 91.0, 91.0, test['Rating'])
# test['Rating'] = np.where(test['Rating'] > 99.0, 99.0, test['Rating'])

## DUMMIFICATION, MIN-MAX SCALING

In [59]:
X2 = pd.get_dummies(test[['Profile', 'City']], drop_first = True)

# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# scaled2 = scaler.fit_transform(test[['Experience', 'Rating']])
# scaled_df2 = pd.DataFrame(scaled2, columns = ['Experience', 'Rating'])

X2 = pd.concat([test[['Experience', 'Rating', 'Misc_Fees', 'Feedbacks']], X2], axis = 1)

X2['City_Sector5Delhi'] = 0
column = X2.pop('City_Sector5Delhi')
X2.insert(16, 'City_Sector5Delhi', column)

## WRITE INTO SUBMISSION FILE

In [60]:
# X2 = rfecv.transform(X2)
test['Fees'] = model.predict(X2)
test['Fees'].to_excel('submission.xlsx', index = False)

In [None]:
# XGB WITH OPTUNA - 0.71910
# XGB RANDOM FOREST REGRESSOR - 0.71483
# LINEAR REGRESSION - 0.71378
# XGB RANDOM FOREST REGRESSOR WITH FEATURE IMPORTANCE - 0.71284
# DECISION TREE REGRESSOR - 0.71226