In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
os.chdir("/home/multifaceted/Documents/Machine_Learning/Data_Challenge2/")

In [6]:
raw_data_train = pd.read_csv("train.csv").iloc[:, 1:]

y_raw_data_train = raw_data_train.iloc[:, 0]
X_raw_data_train = raw_data_train.iloc[:, 1:]

X_raw_data_train.head()

In [7]:
# X_raw_data_train[ ["tariff.plan", "activ.area", "activ.chan"] ] = X_raw_data_train[ ["tariff.plan", "activ.area", "activ.chan"] ].astype(str)

In [11]:
X_raw_data_train_dummies = pd.get_dummies(X_raw_data_train)

zero_or_not = [X_raw_data_train.filter(regex = "q0" + str(month)).sum(axis = 1) == 0 for month in range(1, 10)]

zeros = pd.DataFrame(zero_or_not).T.sum(axis = 1)

X_raw_data_train_dummies_zeros = pd.concat([X_raw_data_train_dummies, zeros], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X_raw_data_train_dummies_zeros, y_raw_data_train, test_size=0.33, random_state=42)

In [222]:
X_raw_data_test = pd.read_csv("test.csv")
X_raw_data_test_dummies = pd.get_dummies(X_raw_data_test)
zero_or_not_test = [X_raw_data_test.filter(regex = "q0" + str(month)).sum(axis = 1) == 0 for month in range(1, 10)]
zeros_test = pd.DataFrame(zero_or_not_test).T.sum(axis = 1)
X_raw_data_test_dummies_zeros = pd.concat([X_raw_data_test_dummies, zeros_test], axis = 1)

In [17]:
parameter_result = []

In [93]:
def lgb_score(space):
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import GradientBoostingRegressor
    
        
    X_train, X_test, y_train, y_test = train_test_split(X_raw_data_train_dummies_zeros, y_raw_data_train, test_size=0.33)
    
    gb = GradientBoostingRegressor(n_estimators=space["n_estimators"],
                                   max_features=space["max_features"],
                                   learning_rate=space["learning_rate"],
                                   subsample=space["subsample"],
                                   max_depth=space["max_depth"])
    gbcv = RFECV(gb, n_jobs = -1, cv = 5)
    gbcv.fit(X_train.filter(regex = r"^(?!vas2)"), y_train)
    
    mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)")), y_test)
    parameter_result.append((space["n_estimators"], space["max_features"], space["learning_rate"], space["subsample"], space["max_depth"], mse))
    
    if len(parameter_result) % 5 == 0:
        pd.DataFrame(parameter_result, columns = ["n_estimators", "max_features", "learning_rate", "subsample", "max_depth", "mse"]).to_csv("result.csv")
    
    return mse

In [94]:
def gbm_hyperopt(max_evals = 10000):
    from hyperopt import fmin, tpe, space_eval, hp
    import pandas as pd
      
    space = {
        "learning_rate": hp.uniform("learning_rate", .08, .12),
        "n_estimators": hp.randint("n_estimators - 100", 100) + 100,
        "max_features": hp.uniform("max_features", .25, .3),
        "subsample": hp.uniform("subsample", .8, 1),
        "max_depth": hp.randint("max_depth - 2", 4) + 2
    }
    best = fmin(neigh_score, space, algo = tpe.suggest, max_evals = max_evals)
    return best

In [96]:
result = pd.DataFrame(parameter_result, columns = ["n_estimators", "max_features", "learning_rate", "subsample", "max_depth", "mse"])

In [105]:
(n_estimators, max_features, learning_rate, subsample, max_depth) = result.head().sort_values(by = "mse").iloc[0, :-1]

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X_raw_data_train_dummies_zeros, y_raw_data_train, test_size=0.33)

gb = GradientBoostingRegressor(n_estimators=int(n_estimators) + 100,
                               max_features=max_features,
                                learning_rate=learning_rate,
                               subsample=subsample,
                               max_depth=int(max_depth) + 2)
gbcv = RFECV(gb, n_jobs = -1, cv = 5)
gbcv.fit(X_train.filter(regex = r"^(?!vas2)"), y_train)

mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)")), y_test)
print(mse)

1.2663902012457124


In [139]:
parameter_result2 = []

In [186]:
def rf_score(space):
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor
    
        
    X_train, X_test, y_train, y_test = train_test_split(X_raw_data_train_dummies_zeros, y_raw_data_train, test_size=0.2)
    
    rf = RandomForestRegressor(n_estimators=space["n_estimators"],
                                   max_features=space["max_features"],
                                   max_leaf_nodes=space["max_leaf_nodes"],
                                n_jobs = -1)
    rfcv = RFECV(rf, n_jobs = -1, cv = 5)
    rfcv.fit(X_train.filter(regex = r"^(?!vas2)"), y_train)
    
    mse = mean_squared_error(rfcv.predict(X_test.filter(regex = r"^(?!vas2)")), y_test)
    parameter_result2.append((space["n_estimators"], space["max_features"], space["max_leaf_nodes"], mse))
    
    if len(parameter_result2) % 5 == 0:
        pd.DataFrame(parameter_result2, columns = ["n_estimators", "max_features", "max_leaf_nodes", "mse"]).to_csv("result_rf.csv")
    
    return mse

In [187]:
def rf_hyperopt(max_evals = 10000):
    from hyperopt import fmin, tpe, space_eval, hp
    import pandas as pd
      
    space = {
        "n_estimators": hp.randint("n_estimators - 100", 100) + 100,
        "max_features": hp.uniform("max_features", .2, .5),
        "max_leaf_nodes": hp.randint("max_leaf_nodes - 80", 40) + 80
    }
    best = fmin(rf_score, space, algo = tpe.suggest, max_evals = max_evals)
    return best

In [181]:
result_rf = pd.DataFrame(parameter_result2, columns = ["n_estimators", "max_features", "max_leaf_nodes", "mse"])

In [None]:
result_rf.

In [160]:
(n_estimators, max_features, max_leaf_nodes) = result_rf.sort_values(by = "mse").iloc[0, :-1]

In [164]:
rf = RandomForestRegressor(n_estimators=int(n_estimators) + 100,
                               max_features=max_features,
                               max_leaf_nodes=int(max_leaf_nodes),
                            n_jobs = -1)
rfcv = RFECV(rf, n_jobs = -1, cv = 5)
rfcv.fit(X_train.filter(regex = r"^(?!vas2)"), y_train)

RFECV(cv=5,
   estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.27879721610656155, max_leaf_nodes=100,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=279, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
   min_features_to_select=1, n_jobs=-1, scoring=None, step=1, verbose=0)

In [165]:
mse = mean_squared_error(rfcv.predict(X_test.filter(regex = r"^(?!vas2)")), y_test)
mse

1.2270403452120056

In [168]:
max_leaf_nodes

100.0

In [193]:
attempt = result_rf.sort_values(by = "mse", ascending = True).head(10)

In [234]:
predictions = []

In [249]:
attempt.head()

Unnamed: 0,n_estimators,max_features,max_leaf_nodes,mse
133,179,0.278797,100.0,1.082332
228,127,0.320506,116.0,1.100051
240,182,0.290792,109.0,1.106774
16,105,0.227808,100.0,1.126145
100,149,0.285495,100.0,1.128334


In [235]:
for i in range(attempt.shape[0]):
    rf = RandomForestRegressor(n_estimators=int(attempt.iloc[i, 0]) + 100,
                               max_features=attempt.iloc[i, 1],
                               max_leaf_nodes=int(attempt.iloc[i, 2]) + 80,
                            n_jobs = -1)
    rfcv = RFECV(rf, n_jobs = -1, cv = 5)
    rfcv.fit(X_raw_data_train_dummies_zeros.filter(regex = r"^(?!vas2)"), y_raw_data_train)
    predictions.append(rfcv.predict(X_raw_data_test_dummies_zeros.filter(regex = r"^(?!vas2)")))
    print(i)

0
1
2
3
4
5
6
7
8
9


In [243]:
import numpy as np 

In [251]:
str(attempt.iloc[0, 3])

'1.0823321586403427'

In [253]:
for i in range(len(predictions)):
    np.savetxt("result" +  str(i) + ".csv", predictions[i], delimiter =",")

In [192]:
result_rf.to_csv("final.csv")

In [191]:
result_rf = pd.DataFrame(parameter_result2, columns = ["n_estimators", "max_features", "max_leaf_nodes", "mse"])
result_rf.sort_values(by = "mse", ascending = True).head(10)

Unnamed: 0,n_estimators,max_features,max_leaf_nodes,mse
133,179,0.278797,100.0,1.082332
228,127,0.320506,116.0,1.100051
240,182,0.290792,109.0,1.106774
16,105,0.227808,100.0,1.126145
100,149,0.285495,100.0,1.128334
220,110,0.390355,94.0,1.128870
200,135,0.417377,106.0,1.129718
237,142,0.343084,118.0,1.134274
210,125,0.489592,95.0,1.141759
161,141,0.311274,80.0,1.143822


In [188]:
rf_hyperopt(100000)

  0%|          | 62/100000 [5:30:36<7363:14:37, 265.24s/it, best loss: 1.100051489661925]


KeyboardInterrupt: 

In [None]:
pd.Dataparameter_result2

In [81]:
rf_full = RandomForestRegressor(n_estimators = 100, n_jobs = -1, bootstrap = True, max_depth=10)


108.0

In [83]:
result.head().sort_values(by = "mse")

Unnamed: 0,n_estimators,max_features,learning_rate,subsample,max_depth,mse
1,108,0.266369,0.089341,0.85502,4,1.136239
2,158,0.276264,0.09053,0.959466,4,1.164145
3,169,0.295335,0.087816,0.895625,3,1.195023
0,124,0.286254,0.096384,0.88435,2,1.272737
4,147,0.262383,0.090296,0.861005,2,1.289643


In [79]:
n_estimators

108.0

In [62]:
result.to_csv("result.csv")

In [92]:
gbm_hyperopt(10000)

  0%|          | 2/10000 [20:38<1327:10:01, 477.88s/it, best loss: 1.2193007488030403]


KeyboardInterrupt: 

In [31]:
11 % 10

1

In [14]:
gb = GradientBoostingRegressor(n_estimators = 100, max_features = .26, learning_rate = .1)
                          #max_depth = 10, max_leaf_nodes=100)
gbcv = RFECV(gb, n_jobs = -1, cv = 5)
gbcv.fit(X_train.filter(regex = r"^(?!vas2)"), y_train)

mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)")), y_test)

1.1962552971858815

In [104]:
rf = ExtraTreesRegressor(n_estimators = 100, max_features = "sqrt", n_jobs = -1 , bootstrap = True)
                           #max_depth = 10, max_leaf_nodes=100)
rfcv = RFECV(rf, n_jobs = -1, cv = 10)
rfcv.fit(X_train.filter(regex = r"^(?!vas2)"), y_train)

RFECV(cv=10,
   estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
          oob_score=False, random_state=None, verbose=0, warm_start=False),
   min_features_to_select=1, n_jobs=-1, scoring=None, step=1, verbose=0)

In [15]:
mse = mean_squared_error(rfcv.predict(X_test.filter(regex = r"^(?!vas2)")), y_test)
mse

NameError: name 'rfcv' is not defined

In [14]:
X_test.shape

(1980, 105)

In [27]:
import numpy as np

In [29]:
max_features = np.linspace(.01, .1, 20)

In [30]:
max_features

array([0.01      , 0.01473684, 0.01947368, 0.02421053, 0.02894737,
       0.03368421, 0.03842105, 0.04315789, 0.04789474, 0.05263158,
       0.05736842, 0.06210526, 0.06684211, 0.07157895, 0.07631579,
       0.08105263, 0.08578947, 0.09052632, 0.09526316, 0.1       ])

In [32]:
mses = []
for max_feature in max_features:
    gb = GradientBoostingRegressor(n_estimators = 100, max_features = max_feature, learning_rate = .1)
                               #max_depth = 10, max_leaf_nodes=100)
    gbcv = RFECV(gb, n_jobs = -1, cv = 5)
    gbcv.fit(X_train.filter(regex = r"^(?!vas2)(?!.*sms)"), y_train)
    
    mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)(?!.*sms)")), y_test)
    print(mse)
    mses.append(mse)

1.2368939221901474
1.2527636147774288
1.2487717984691873
1.2366091535483539
1.280060988472895
1.246428426353427
1.2789006363337767
1.257283164352521
1.2391139019756658
1.2465785296003757
1.2475795785732338
1.2366872958220938
1.2637478561961923
1.2384594916161151
1.2474044446017956
1.2343920428226187
1.229299789190729
1.2308959371931028
1.2308612734253426
1.2401087852136992


In [33]:
max_features = np.linspace(.1, .2, 20)

In [35]:
mses2 = []
for max_feature in max_features:
    gb = GradientBoostingRegressor(n_estimators = 100, max_features = max_feature, learning_rate = .1)
                               #max_depth = 10, max_leaf_nodes=100)
    gbcv = RFECV(gb, n_jobs = -1, cv = 5)
    gbcv.fit(X_train.filter(regex = r"^(?!vas2)(?!.*sms)"), y_train)
    
    mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)(?!.*sms)")), y_test)
    print(mse)
    mses2.append(mse)

1.2227539896477064
1.2388667092459051
1.2232423127360246
1.2365214226681585
1.3313689610328838
1.2302547424539456
1.215339152229868
1.2226463370503418
1.2042441901995338
1.230512130698362
1.2281196581262785
1.2291422124571392
1.216040678226161
1.2025085396825332
1.22375167612537
1.2144854586676717
1.2169820260138118
1.2125139018454523
1.2066291486257013
1.193594076507231


In [44]:
max_features = np.linspace(.2, .3, 20)

In [38]:
mses3 = []
for max_feature in max_features:
    gb = GradientBoostingRegressor(n_estimators = 100, max_features = .26, learning_rate = .1)
                              #max_depth = 10, max_leaf_nodes=100)
    gbcv = RFECV(gb, n_jobs = -1, cv = 5)
    gbcv.fit(X_train.filter(regex = r"^(?!vas2)(?!.*sms)"), y_train)
    
    mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)(?!.*sms)")), y_test)
    print(mse)
    mses3.append(mse)

1.2060366809506275
1.2225998435060432
1.21625730769809
1.2220675714825542
1.2081726792909646
1.2030566770162825
1.1846192296196707
1.2256008349782601
1.2066184431804077
1.191458900164761
1.2066484142863
1.19689187482369
1.1905797073554258
1.1949630099947128
1.200152039860209
1.3178397561255017
1.1967586236888215
1.181332964527593
1.2090281026592902
1.1926151534740217


In [45]:
max_features

array([0.2       , 0.20526316, 0.21052632, 0.21578947, 0.22105263,
       0.22631579, 0.23157895, 0.23684211, 0.24210526, 0.24736842,
       0.25263158, 0.25789474, 0.26315789, 0.26842105, 0.27368421,
       0.27894737, 0.28421053, 0.28947368, 0.29473684, 0.3       ])

In [42]:

gb = GradientBoostingRegressor(n_estimators = 200, max_features = 1.0, learning_rate = .2)
                           #max_depth = 10, max_leaf_nodes=100)
gbcv = RFECV(gb, n_jobs = -1, cv = 5)
gbcv.fit(X_train.filter(regex = r"^(?!vas2)(?!.*sms)"), y_train)

mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)(?!.*sms)")), y_test)
print(mse)

1.2530954618601224


In [22]:
gb = GradientBoostingRegressor(n_estimators = 100, max_features = "sqrt", learning_rate = .1)
                           #max_depth = 10, max_leaf_nodes=100)
gbcv = RFECV(gb, n_jobs = -1, cv = 5)
gbcv.fit(X_train.filter(regex = r"^(?!vas2)(?!.*sms)"), y_train)

RFECV(cv=5,
   estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.15, loss='ls', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_sa...       subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False),
   min_features_to_select=1, n_jobs=-1, scoring=None, step=1, verbose=0)

In [24]:
mse = mean_squared_error(gbcv.predict(X_test.filter(regex = r"^(?!vas2)(?!.*sms)")), y_test)
mse

1.2136688290925985

In [23]:
rfcv.ranking_

array([ 1,  1, 18, 21, 49, 29, 42, 70, 75, 80, 62, 69, 93, 89, 50, 64, 44,
       57, 83, 86, 73, 60, 90, 85, 39, 26, 19, 72, 56, 71, 59, 61, 91, 82,
       36,  8, 17, 65, 63, 79, 46, 52, 92, 78, 25,  4, 14, 13, 34, 55, 32,
       43, 87, 84, 11,  1,  9, 41, 48, 38, 27, 40, 77, 76,  6,  1,  1, 24,
       23, 28, 22, 30, 68, 67, 10,  3,  1,  2,  7, 12, 16, 15, 53, 54,  1,
        1,  1,  1,  1,  1,  5,  1, 37, 45, 47, 33, 66, 31, 74, 51, 35, 58,
       81, 88, 20])

In [24]:
X_train.shape

(4020, 105)

In [37]:
c.tail(50)

Unnamed: 0,name,ranking
16,q02.out.val.peak,44
93,q09.ch.cc,45
40,q04.in.ch.tot,46
94,payment.method_bank account,47
58,q06.out.dur.offpeak,48
4,q01.out.ch.peak,49
14,q02.out.ch.peak,50
99,gender_M,51
41,q04.in.dur.tot,52
82,q08.ch.sms,53


In [84]:
list(X_train.filter(regex = r"^(?!vas2)(?!.*sms)").columns)

['tariff.plan',
 'age',
 'activ.area',
 'activ.chan',
 'q01.out.ch.peak',
 'q01.out.dur.peak',
 'q01.out.val.peak',
 'q01.out.ch.offpeak',
 'q01.out.dur.offpeak',
 'q01.out.val.offpeak',
 'q01.in.ch.tot',
 'q01.in.dur.tot',
 'q01.ch.cc',
 'q02.out.ch.peak',
 'q02.out.dur.peak',
 'q02.out.val.peak',
 'q02.out.ch.offpeak',
 'q02.out.dur.offpeak',
 'q02.out.val.offpeak',
 'q02.in.ch.tot',
 'q02.in.dur.tot',
 'q02.ch.cc',
 'q03.out.ch.peak',
 'q03.out.dur.peak',
 'q03.out.val.peak',
 'q03.out.ch.offpeak',
 'q03.out.dur.offpeak',
 'q03.out.val.offpeak',
 'q03.in.ch.tot',
 'q03.in.dur.tot',
 'q03.ch.cc',
 'q04.out.ch.peak',
 'q04.out.dur.peak',
 'q04.out.val.peak',
 'q04.out.ch.offpeak',
 'q04.out.dur.offpeak',
 'q04.out.val.offpeak',
 'q04.in.ch.tot',
 'q04.in.dur.tot',
 'q04.ch.cc',
 'q05.out.ch.peak',
 'q05.out.dur.peak',
 'q05.out.val.peak',
 'q05.out.ch.offpeak',
 'q05.out.dur.offpeak',
 'q05.out.val.offpeak',
 'q05.in.ch.tot',
 'q05.in.dur.tot',
 'q05.ch.cc',
 'q06.out.ch.peak',
 'q06.

In [75]:
list(X_train.filter(regex = r"[(sms)]").columns)

['q01.ch.sms',
 'q02.ch.sms',
 'q03.ch.sms',
 'q04.ch.sms',
 'q05.ch.sms',
 'q06.ch.sms',
 'q07.ch.sms',
 'q08.ch.sms',
 'q09.ch.sms',
 'payment.method_bank account',
 'payment.method_credit card',
 'payment.method_post account',
 'vas1_N',
 'vas1_Y',
 'vas2_N',
 'vas2_Y']

In [27]:
c = pd.DataFrame({"name": X_train.columns, "ranking": rfcv.ranking_}).sort_values(by = "ranking")

In [28]:
c["name"].str.contains("sms")

0      False
1      False
55     False
91     False
65     False
89     False
88     False
87     False
86     False
85     False
84     False
66     False
76     False
77     False
75     False
45     False
90     False
64     False
78     False
35     False
56     False
74     False
54     False
79     False
47     False
46     False
81     False
80     False
36     False
2      False
       ...  
15     False
37     False
96     False
73     False
72      True
11     False
7      False
29     False
27     False
20     False
98     False
8      False
63     False
62      True
43     False
39     False
9      False
102    False
33     False
18     False
53     False
23     False
19     False
52      True
103    False
13     False
22      True
32      True
42      True
12      True
Name: name, Length: 105, dtype: object