In [0]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.metrics import *
from nltk.corpus import stopwords
from string import punctuation
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier,  BaggingClassifier, BaggingRegressor, RandomTreesEmbedding,GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from nltk import word_tokenize
import nltk
import gensim
from sklearn.pipeline import FeatureUnion
import itertools

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
noise = stopwords.words('english') + list(punctuation)

In [0]:
data_url = 'https://raw.githubusercontent.com/TatianaShavrina/hse_ml_m1/master/ensembles/complaints.csv'
data = pd.read_csv(data_url, sep='\t')
data.head()
y = data["PRODUCT_ID"]
X = data["cleaned_text"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Voting Classifier и FeatureUnion + препроцессинг

In [0]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import FunctionTransformer

In [6]:
sno = nltk.stem.SnowballStemmer('english')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def stem_tokenizer(text):
    return [sno.stem(t) for t in word_tokenize(text)]

In [8]:
model = gensim.models.Word2Vec(X_train, size=100)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 100

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

  


In [0]:
clf1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

voting = Pipeline([
    ('feats', FeatureUnion([
        ('tfidf', TfidfVectorizer(ngram_range=(1,3), analyzer='word', max_features=200)), 
        ('tfidf2', TfidfVectorizer(tokenizer=stem_tokenizer)),
        ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
        ('vect', CountVectorizer(tokenizer=stem_tokenizer, analyzer='word', max_features=200)),
        ])),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
    ('clf', eclf),
    ])

voting = voting.fit(X_train, y_train)
predictions = voting.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision:   0.69
Recall:   0.67
F1-measure:   0.67
Accuracy:   0.67


### Bagging

In [0]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [0]:
count_vect = CountVectorizer(tokenizer=stem_tokenizer, max_features=1000, stop_words=noise, min_df=0.01, max_df=0.5) 

In [12]:
X = count_vect.fit_transform(X)

  'stop_words.' % sorted(inconsistent))


In [0]:
clf1 = RandomForestClassifier(n_estimators=50, random_state=1)  
clf2 = GradientBoostingClassifier(n_estimators=50, random_state=1)
clf3 = ExtraTreesClassifier(n_estimators=50, random_state=1)

In [0]:
bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf2, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging3 = BaggingClassifier(base_estimator=clf3, n_estimators=10, max_samples=0.8, max_features=0.8)

In [0]:
label = ['RandomForestClassifier', 'GradientBoostingClassifier', 'ExtraTreesClassifier', 'Bagging RF', 'Bagging GradientBoosting', 'Bagging ExtraTrees']
clf_list = [clf1, clf2, clf3, bagging1, bagging2, bagging3]

In [21]:
for clf, label in zip(clf_list, label):      
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print ("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))

Accuracy: 0.60 (+/- 0.01) [RandomForestClassifier]
Accuracy: 0.62 (+/- 0.01) [GradientBoostingClassifier]
Accuracy: 0.60 (+/- 0.01) [ExtraTreesClassifier]
Accuracy: 0.63 (+/- 0.02) [Bagging RF]
Accuracy: 0.62 (+/- 0.01) [Bagging GradientBoosting]
Accuracy: 0.63 (+/- 0.01) [Bagging ExtraTrees]


### Stacking

In [9]:
from mlxtend.classifier import StackingClassifier



In [0]:
clf1 = RandomForestClassifier(n_estimators=50, random_state=1)  
clf2 = SGDClassifier()
clf3 = ExtraTreesClassifier(n_estimators=50, random_state=1)
gradboost = GradientBoostingClassifier(n_estimators=50, random_state=1)
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=gradboost)

In [0]:
label = ['RandomForest', 'SGD', 'Extra Trees', 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]

In [15]:
clf_cv_mean = []
clf_cv_std = []
for clf, label in zip(clf_list, label):
        
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print ("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())

Accuracy: 0.60 (+/- 0.01) [RandomForest]
Accuracy: 0.57 (+/- 0.02) [SGD]
Accuracy: 0.60 (+/- 0.01) [Extra Trees]
Accuracy: 0.61 (+/- 0.01) [Stacking Classifier]
