In [69]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/grigoriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Based on <br>
https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelinesm <br>
http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

In [50]:
SEED = 42

In [51]:
# https://scikit-learn.org/0.18/auto_examples/hetero_feature_union.html

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

### Data downloading

In [52]:
df = pd.read_csv('./Data/train.csv', index_col = 'id')
print(df.isna().sum())

text      0
author    0
dtype: int64


In [53]:
df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


### Base text data preparations

In [57]:
stopWords = set(stopwords.words('english'))

def text_processing(df):
    #lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

    #numerical feature engineering
    df['length'] = df['processed'].apply(lambda x: len(x))
    df['words_count'] = df['processed'].apply(lambda x: len(x.split(' ')))
    df['words_non_stpw_count'] = df['processed'].apply(lambda x: len([word for word in x.split(' ') 
                                                                           if word not in stopWords ]))
    df['avg_words_non_stpw_length'] = df['processed'].apply(lambda x: np.mean([len(word) for word in x.split(' ') 
                                                                              if word not in stopWords ]) 
                                                                      if len(set(x.split(' ')) - stopWords)!= 0 else 0) # "len(set(x.split(' ')) - stopWords)!= 0" control that string has at least one not stop word 
    df['commas_count'] = df['text'].apply(lambda x: x.count(','))
    return df

df = text_processing(df)
df.head(3)

Unnamed: 0_level_0,text,author,processed,length,words_count,words_non_stpw_count,avg_words_non_stpw_length,commas_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4


 ### Setting up Pipeline

In [58]:
features = [column_name for column_name in df.columns if column_name not in ['id', 'text', 'author']]
num_features_names = [column_name for column_name in df.columns if column_name not in ['id', 'text', 
                                                                                 'author', 'processed']]

In [59]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df['author'], 
                                                    test_size = 0.3, random_state = SEED)

In [60]:
print(X_train.shape, X_test.shape)

(13705, 6) (5874, 6)


**Build up main parts of pipeline**

In [61]:
text_features = Pipeline([
    ('selector', TextSelector(key = 'processed')),
    ('tfidf', TfidfVectorizer(stop_words = 'english'))
])

text_features.fit_transform(X_train)

<13705x21869 sparse matrix of type '<class 'numpy.float64'>'
	with 154700 stored elements in Compressed Sparse Row format>

In [62]:
num_features_names

['length',
 'words_count',
 'words_non_stpw_count',
 'avg_words_non_stpw_length',
 'commas_count']

In [63]:
length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])
words =  Pipeline([
                ('selector', NumberSelector(key='words_count')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_non_stpw_count')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_words_non_stpw_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas_count')),
                ('standard', StandardScaler()),
            ])

# С этим явно нужно что-то делать

In [64]:
print(np.isnan(length.fit_transform(X_train)).sum())
print(np.isnan(words.fit_transform(X_train)).sum())
print(np.isnan(words_not_stopword.fit_transform(X_train)).sum())
print(np.isnan(avg_word_length.fit_transform(X_train)).sum())
print(np.isnan(commas.fit_transform(X_train)).sum())

0
0
0
0
0


In [65]:
feats = FeatureUnion([('text', text_features), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

<13705x21874 sparse matrix of type '<class 'numpy.float64'>'
	with 223225 stored elements in Compressed Sparse Row format>

**Result**

In [68]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = SEED)),
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
np.mean(preds == y_test) #accuracy



0.6426625808648281

**GridSearchCV**

In [76]:
print(list(pipeline.get_params().keys())[:10])
print('Awerall parameters amount:', len(list(pipeline.get_params().keys())))

['memory', 'steps', 'verbose', 'features', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text']
Awerall parameters amount: 104


In [78]:
hyperparameters = { 'features__text__tfidf__max_df': [0.9, 0.95],
                    'features__text__tfidf__ngram_range': [(1,1), (1,2)],
                   'classifier__max_depth': [50, 70],
                    'classifier__min_samples_leaf': [1,2]
                  }
clf = GridSearchCV(pipeline, hyperparameters, cv=5, verbose = 0)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('text',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('selector',
                                                                                         TextSelector(key='processed')),
                                                                                        ('tfidf',
                                                                                         TfidfVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                  

In [80]:
clf.refit

tune_preds = clf.predict(X_test)
np.mean(tune_preds == y_test)

0.6355124276472591

### Predictions

In [89]:
sub_example = pd.read_csv('./Data/sample_submission.csv')
sub_example.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


In [86]:
test_data = pd.read_csv('./Data/test.csv')

In [87]:
#preprocessing
submission = text_processing(test_data)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([test_data['id'], preds], axis=1)
result.set_index('id', inplace = True)
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.258578,0.273729,0.467693
id24541,0.636242,0.155624,0.208134
id00134,0.357282,0.482661,0.160057
id27757,0.567779,0.210418,0.221803
id04081,0.598634,0.172221,0.229145
