In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score

In [7]:
df_targets = pd.read_parquet('train_targets.parquet')
df_train = pd.read_parquet('train_with_feats_v4.parquet')

df_all = pd.merge(df_train, df_targets, on=['customer_id'])
df_all_dropna = df_all.dropna()
df_train_feats_non_churn_v4 = df_all_dropna.drop(columns=['buy_post', 'customer_id', 'date_diff_post'])
df_targets_non_churn_v4 = df_all_dropna.date_diff_post

In [None]:
model500 = GradientBoostingRegressor(n_estimators=500)
model500.fit(df_train_feats_non_churn_v4, df_targets_non_churn_v4)

In [None]:
selected_features = ['receipt_count', 'sum_delta_min', 'sum_delta_max', 'sum_delta_std',
 'abs_sum_delta_mean', 'revenue_sum_max_7d', 'revenue_sum_mean_7d',
 'revenue_sum_max_14d', 'revenue_sum_mean_14d', 'revenue_sum_max_28d'
 'revenue_sum_mean_28d' 'minutes_mean' 'minutes_std' 'minutes_sem'
 'dom_median' 'first_half_mean' 'first_half_max' 'first_half_min'
 'first_half_std' 'second_half_max' 'second_half_min' 'second_half_std']

model500_sf = GradientBoostingRegressor(n_estimators=500)
model500_sf.fit(df_train_feats_non_churn_v4[selected_features], df_targets_non_churn_v4)

In [8]:
X_train_v4, X_val_v4, y_train_v4, y_val_v4 = train_test_split(df_train_feats_non_churn_v4, df_targets_non_churn_v4, test_size=0.2,
                                                  random_state=42)

In [None]:
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV()
lasso_cv.fit(X_train_v4, y_train_v4)

lasso_coef = np.abs(lasso_cv.coef_)

plt.bar(df_train_feats_non_churn_v4.columns, lasso_coef)
plt.xticks(rotation=90)
plt.grid()
plt.title("Feature Selection Based on Lasso")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.show()

NameError: name 'plt' is not defined

In [10]:
selected_features =np.array(df_train_feats_non_churn_v4.columns)[lasso_coef>0.001]
selected_features

array(['receipt_count', 'sum_delta_min', 'sum_delta_max', 'sum_delta_std',
       'abs_sum_delta_mean', 'revenue_sum_max_7d', 'revenue_sum_mean_7d',
       'revenue_sum_max_14d', 'revenue_sum_mean_14d',
       'revenue_sum_max_28d', 'revenue_sum_mean_28d', 'minutes_mean',
       'minutes_std', 'minutes_sem', 'dom_median', 'first_half_mean',
       'first_half_max', 'first_half_min', 'first_half_std',
       'second_half_max', 'second_half_min', 'second_half_std'],
      dtype=object)

In [12]:
model = GradientBoostingRegressor()
scores = cross_val_score(model, X_train_v4, y_train_v4, scoring='neg_mean_squared_error', cv=5)
scores

KeyboardInterrupt: 

In [13]:
model500 = GradientBoostingRegressor(n_estimators=500)
scores500 = cross_val_score(model, X_train_v4, y_train_v4, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
scores500

array([-175.35330142, -179.49944418, -177.25305834, -179.72122793,
       -176.35783389])

In [28]:
model500.fit(X_train_v4, y_train_v4)

In [29]:
model500

In [14]:
np.sqrt(np.abs(scores500.mean()))

13.328052113955835

In [30]:
df_test = pd.read_parquet('test_with_feats_v4.parquet')

In [35]:
CLASSIFICATION_FILE = 'top37_by_imp_feats_lgbm_52cut.csv'
df_classification_results = pd.read_csv(CLASSIFICATION_FILE, sep=';')
df_test_non_churn = pd.merge(df_classification_results, df_test, on=['customer_id'])
df_test_non_churn = df_test_non_churn.fillna(0).drop(columns=['buy_post', 'customer_id'])

In [36]:
predict_result = model500.predict(df_test_non_churn)

In [39]:
df_overall_result = df_classification_results.copy()

In [40]:
df_overall_result['regression_predict'] = predict_result

In [42]:
df_overall_result['date_diff_post'] = df_overall_result['regression_predict'] * df_overall_result['buy_post']

In [61]:
df_overall_result.drop(columns=['regression_predict']).to_csv("final_submission.csv", sep=';', index=False, decimal='.')

In [62]:
predict_result

array([29.21234441, 23.81325228, 10.42466116, ..., 21.91439298,
        5.66713813, 23.15969718])

In [63]:
df_rrrr = pd.read_csv('test_pred_submission.csv', sep=';')

In [64]:
df_rrrr

Unnamed: 0,customer_id,buy_post,date_diff_post
0,52341,0,0.000000
1,69175,1,23.813252
2,73427,1,10.424661
3,134577,0,0.000000
4,156357,0,0.000000
...,...,...,...
112329,46540272,0,0.000000
112330,46569275,0,0.000000
112331,46601088,0,0.000000
112332,46632765,0,0.000000


In [59]:
df_rrrr.date_diff_post - df_overall_result.date_diff_post

0         0.000000e+00
1         0.000000e+00
2         1.776357e-15
3         0.000000e+00
4         0.000000e+00
              ...     
112329    0.000000e+00
112330    0.000000e+00
112331    0.000000e+00
112332    0.000000e+00
112333    0.000000e+00
Name: date_diff_post, Length: 112334, dtype: float64