Сначала загрузим все, что нужно и подготовим выборки для обучения моделей.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from vecstack import stacking
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
%matplotlib inline
lr = LogisticRegression(random_state=42)
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42,n_jobs=-1)
cv = CountVectorizer()
train = pd.read_csv('data/train.csv',sep='\t')
train['text'] = train['name']+' '+train['description']
texts_without_tags = []
for t in train['text']:
    texts_without_tags.append(
        BeautifulSoup(t, 'lxml').text
    )
m = cv.fit_transform(texts_without_tags)
test_size=0.3
X_train, X_test, y_train, y_test = train_test_split(m, train['target'], test_size=0.3, random_state=42)



In [2]:
print('X_train: ',X_train.shape)
print('X_test: ',X_test.shape)
print('y_train: ',y_train.shape)
print('y_test: ',y_test.shape)

X_train:  (140000, 208984)
X_test:  (60000, 208984)
y_train:  (140000,)
y_test:  (60000,)


Теперь подготовим данные для предсказания:

In [3]:
test = pd.read_csv('data/test.csv',sep='\t')
test['text'] = test['name']+' '+test['description']
texts_without_tags_1 = []
for t in test['text']:
    texts_without_tags_1.append(
        BeautifulSoup(t, 'lxml').text
    )
X = cv.transform(texts_without_tags_1)

In [4]:
models = [
    RandomForestClassifier(n_estimators=500, max_depth=5, random_state=123,),
    RandomForestClassifier(n_estimators=500, max_depth=10, random_state=123, max_features='log2'),
    LogisticRegression(random_state=42,penalty='l2',C = 0.1),
    DecisionTreeClassifier(random_state=42),   
    GradientBoostingClassifier(n_estimators=300, max_depth=3,learning_rate=0.02,random_state=123),
    ]

In [5]:
def roc_auc_score_cust(y_true, y_hat):
    return roc_auc_score(y_true, y_hat[:,1])

In [7]:
S_train, S_valid = stacking(models,
                              # X_train, y_train, X_test,
                               m,train['target'],X,
                               regression=False,
                               mode='oof_pred_bag', 
                               needs_proba=True,
                               metric=roc_auc_score_cust,
                               n_folds=5,                
                               stratified=True,          
                               shuffle=True,             
                               random_state=123,         
                               verbose=2) 
last_model = GradientBoostingClassifier(n_estimators=300, max_depth=3,
                                       learning_rate=0.01, 
                                       random_state=123)
#last_model.fit(S_train, y_train)
last_model.fit(S_train, train['target'])
y_hat = last_model.predict_proba(S_valid)
#roc_auc_score(y_test, y_hat[:, 1])

task:       [classification]
n_classes:  [2]
metric:     [roc_auc_score_cust]
mode:       [oof_pred_bag]
n_models:   [5]

model 0:    [RandomForestClassifier]
    fold 0: [0.96782805]
    fold 1: [0.96589542]
    fold 2: [0.96960600]
    fold 3: [0.96598574]
    fold 4: [0.97037849]
    ----
    MEAN:   [0.96793874] + [0.00182941]
    FULL:   [0.96780683]

model 1:    [RandomForestClassifier]
    fold 0: [0.96104957]
    fold 1: [0.95962370]
    fold 2: [0.96547977]
    fold 3: [0.95615592]
    fold 4: [0.96337086]
    ----
    MEAN:   [0.96113596] + [0.00319245]
    FULL:   [0.96093471]

model 2:    [LogisticRegression]
    fold 0: [0.99082913]
    fold 1: [0.99042751]
    fold 2: [0.99147016]
    fold 3: [0.99091176]
    fold 4: [0.99124611]
    ----
    MEAN:   [0.99097693] + [0.00035880]
    FULL:   [0.99097706]

model 3:    [DecisionTreeClassifier]
    fold 0: [0.94699026]
    fold 1: [0.94768906]
    fold 2: [0.94808755]
    fold 3: [0.94778695]
    fold 4: [0.94788110]
    ----


In [8]:
submission = test[['id','description']]
#submission['target'] = fin_predict  
submission['target'] = y_hat[:,0:1]
submission = submission.drop('description', 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
submission.to_csv("submission.csv",index=False)

In [10]:
zz = pd.read_csv('submission.csv',sep=';')
zz.head()

Unnamed: 0,"id,target"
0,2000000.0643999248
1,2000010.0353601936
2,2000020.0353601936
3,2000030.0353601936
4,2000040.9717401776
