In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from itertools import product


In [73]:
from sklearn.datasets import load_diabetes

data = load_diabetes(as_frame=True)
print(data.DESCR)
df = data.frame
df.head()

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [74]:
X = df.drop(columns='target')
y = df['target']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [76]:
l_reg = LinearRegression()
l_reg.fit(X_train, y_train)
y_tr = l_reg.predict(X_train)
y_te = l_reg.predict(X_test)

In [77]:
l_reg.score(X_train, y_train)

0.5244132008226972

In [78]:
l_reg.score(X_test, y_test)

0.47729201741573335

In [79]:
mean_squared_error(y_te, y_test)

2821.738559584376

In [80]:
mean_squared_error(y_tr, y_train)

2924.041678907289

In [81]:
r2_score(y_test, y_te)

0.47729201741573335

In [82]:
r2_score(y_train, y_tr)

0.5244132008226972

In [83]:
cross_val_score(l_reg, X_train, y_train, scoring='r2').mean()

0.4522836364503492

In [84]:
from sklearn.linear_model import Lasso

In [85]:
alph = 1.1
for eps in range(11):
    alph = alph - 0.1 * eps
    y_pred = Lasso(alpha=alph).fit(X_train, y_train).predict(X)
    print(alph)
    alph = 1.1
    print(mean_squared_error(y, y_pred))

1.1
3953.5594461959577
1.0
3803.729431282657
0.9000000000000001
3668.489719384121
0.8
3534.4750206822105
0.7000000000000001
3409.6842606482232
0.6000000000000001
3303.193597297873
0.5
3215.000711314314
0.4
3145.0979456838922
0.30000000000000004
3070.07203087414
0.20000000000000007
2991.4778839696187
0.10000000000000009
2926.515232954724


Итого best_estimator_ = 0.1 , best_score_ = 2926

In [86]:
best_estimator_ = 0.1

In [87]:
model_01 = Lasso(alpha=best_estimator_).fit(X_train, y_train)
predict_test_01 = model_01.predict(X_test)
r2_score(y_test, predict_test_01)

0.48592037238119035

In [88]:
from sklearn.preprocessing import StandardScaler
poly = PolynomialFeatures(2)
X_pol = poly.fit_transform(X)
X_train_pol, X_test_pol, y_train_pol, y_test_pol = train_test_split(X_pol, y, test_size=0.3, random_state=42)
l_reg.fit(X_train_pol, y_train_pol)
cross_val_score(l_reg, X_pol, y, scoring='r2')

y_tr_pol = l_reg.predict(X_train_pol)
y_te_pol = l_reg.predict(X_test_pol)

In [89]:
mean_squared_error(y_test_pol, y_te_pol)

3174.7138607210736

In [90]:
mean_squared_error(y_train_pol, y_tr_pol)

2409.2503292079177

In [91]:
r2_score(y_test_pol, y_te_pol)

0.41190573032242683

In [92]:
r2_score(y_train_pol, y_tr_pol)

0.6081425033199104

In [105]:
from sklearn.linear_model import Ridge
model_01 = Ridge(alpha=0.105).fit(X_train, y_train)
predict_test_01 = model_01.predict(X_test)
r2_score(y_test, predict_test_01)

0.4803536402368126

неплохо получилось я считаю ИМХО

In [141]:
sales = pd.read_csv('sales_train.csv.gz')
sales.columns = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'target']
sales

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,target
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,10.10.2015,33,25,7409,299.00,1.0
2935845,09.10.2015,33,25,7460,299.00,1.0
2935846,14.10.2015,33,25,7459,349.00,1.0
2935847,22.10.2015,33,25,7440,299.00,1.0


In [142]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'target':'sum'})

#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

In [233]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle = False)
all_data['item_target_enc'] = 0
for i, (train_index, test_index) in enumerate(kf.split(all_data)):
    item_id_target_mean = all_data.iloc[train_index].groupby('item_id').target.mean()
    all_data['item_target_enc'].iloc[test_index] = all_data['item_id'].iloc[test_index].map(item_id_target_mean)
all_data.item_target_enc.fillna(all_data.target.mean(), inplace=True)
    

encoded_feature_1 = all_data.item_target_enc.values

In [234]:
# You will need to compute correlation like that
corr = np.corrcoef(all_data['target'].values, encoded_feature_1)[0][1]
print(corr)

0.41645904885335416


LoO

In [195]:
item_id_target_sum = all_data.groupby('item_id')['target'].sum()
all_data.item_target_enc = 0
nrows_data = all_data.groupby('item_id')['target'].count()
all_data['item_target_enc'] = (all_data['item_id'].map(item_id_target_sum) - all_data.target) / (all_data['item_id'].map(nrows_data) - 1)
all_data.item_target_enc.fillna(all_data.target.mean(), inplace=True)

encoded_feature = all_data.item_target_enc.values

In [196]:
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.48038483112924935


Smoothing

In [200]:
nrows_data = all_data.groupby('item_id')['target'].count().rename({'target': 'cnt'})
means_data = all_data.groupby('item_id')['target'].mean().rename({'target': 'mean'})
all_data['nrows'] = all_data['item_id'].map(nrows_data)
all_data['means_data'] = all_data['item_id'].map(means_data)
    
all_data['item_target_enc'] = (all_data.means_data * all_data.nrows + all_data.target.mean() * 100) / (all_data.nrows + 100)
encoded_feature_2 = all_data.item_target_enc.values

In [201]:
corr = np.corrcoef(all_data['target'].values, encoded_feature_2)[0][1]
print(corr)

0.481819889226897


Exp Mean

In [204]:
cum_sum = all_data.groupby('item_id')["target"].cumsum() - all_data["target"]
cum_count = all_data.groupby('item_id').cumcount()
    
all_data['item_target_enc'] = cum_sum / cum_count
all_data.item_target_enc.fillna(all_data.target.mean(), inplace=True)

encoded_feature_3 = all_data.item_target_enc.values

In [205]:
corr = np.corrcoef(all_data['target'].values, encoded_feature_3)[0][1]
print(corr)

0.5025245189907205
