# 创建自定义转换器

In [1]:
import nltk
# Fetch the NLTK resources used later in this notebook: 'punkt' for
# sent_tokenize/word_tokenize, 'wordnet' for WordNetLemmatizer, and
# 'averaged_perceptron_tagger' for nltk.pos_tag.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [26]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Pattern matching http/https URLs. Written as a raw string so that `\(` and
# `\)` reach the regex engine verbatim instead of being parsed as (invalid)
# string escape sequences, which newer Python versions warn about.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

### Implement the StartingVerbExtractor class

In [39]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean feature per message.

    The feature is True when at least one sentence of the message starts
    with a base-form/present-tense verb (POS tag 'VB' or 'VBP') or with the
    token 'RT' (retweet marker), and False otherwise.
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or 'RT'.

        Parameters
        ----------
        text : str
            A single message.

        Returns
        -------
        bool
            True as soon as one sentence qualifies; False when none does,
            including when ``text`` contains no sentences at all.
        """
        for sentence in nltk.sent_tokenize(text):
            # tokenize the sentence into words and tag part of speech
            pos_tags = nltk.pos_tag(word_tokenize(sentence))

            # a sentence without tokens has no first word to inspect
            if not pos_tags:
                continue

            first_word, first_tag = pos_tags[0]

            # an appropriate verb, or RT for retweet
            if first_tag in ('VB', 'VBP') or first_word == 'RT':
                return True

        # Only reached after every sentence was checked; returning inside the
        # loop would ignore all sentences after the first one.
        return False

    def fit(self, x, y=None):
        """No-op fit; this transformer has nothing to learn. Returns self."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every message in ``X``.

        Parameters
        ----------
        X : iterable of str
            The messages to tag (not a single string).

        Returns
        -------
        pandas.DataFrame
            A single-column frame with one boolean row per message.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [8]:
# NOTE(review): load_data() is defined in the "Run program to test" cell
# (In [40]) further down; on a fresh kernel that cell must run before this one.
X, y = load_data()

In [45]:
# NOTE(review): transform() is called once per message here, so every call
# wraps a single string and yields a one-row DataFrame. `prueba` is only
# instantiated in a later cell (In [41]).
[prueba.transform(x) for x in X]

[       0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,       0
 0  True,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,       0
 0  True,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,       0
 0  True,       0
 0  True,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,        0
 0  False,       0
 0  True,       0
 0  True,        0
 0  False,        0
 0

In [47]:
# NOTE(review): applies transform() to each message separately, so every
# element of the resulting Series is itself a one-row DataFrame.
pd.Series(X).apply(prueba.transform)

0              0
0  False
1              0
0  False
2              0
0  False
3              0
0  False
4              0
0  False
5              0
0  False
6              0
0  False
7              0
0  False
8              0
0  False
9              0
0  False
10             0
0  False
11             0
0  False
12             0
0  False
13             0
0  False
14             0
0  False
15             0
0  False
16             0
0  False
17             0
0  False
18             0
0  False
19             0
0  False
20             0
0  False
21             0
0  False
22              0
0  True
23             0
0  False
24             0
0  False
25             0
0  False
26             0
0  False
27             0
0  False
28             0
0  False
29             0
0  False
              ...        
2373           0
0  False
2374            0
0  True
2375            0
0  True
2376           0
0  False
2377           0
0  False
2378           0
0  False
2379           0
0  False
2380        

In [41]:
# Instantiate the extractor and transform the whole message array in a single
# call, which gives one boolean row per message.
prueba = StartingVerbExtractor()
prueba.transform(X)

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,False


In [22]:
# Split the first message into sentences, then POS-tag its words.
# NOTE(review): str(a) stringifies the whole sentence list, so the list
# brackets and quote characters get tokenized and tagged as well.
a = sent_tokenize(X[0])
pos_tags = nltk.pos_tag(word_tokenize(str(a)))

In [23]:
pos_tags

[('[', 'NN'),
 ("'Barclays", 'VBZ'),
 ('CEO', 'NNP'),
 ('stresses', 'VBZ'),
 ('the', 'DT'),
 ('importance', 'NN'),
 ('of', 'IN'),
 ('regulatory', 'JJ'),
 ('and', 'CC'),
 ('cultural', 'JJ'),
 ('reform', 'NN'),
 ('in', 'IN'),
 ('financial', 'JJ'),
 ('services', 'NNS'),
 ('at', 'IN'),
 ('Brussels', 'NNP'),
 ('conference', 'NN'),
 ('http', 'NN'),
 (':', ':'),
 ('//t.co/Ge9Lp7hpyG', 'NN'),
 ("'", "''"),
 (']', 'NN')]

### Run program to test

In [40]:
def load_data():
    """Read the corporate messaging CSV and return ``(texts, categories)``.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept. Both return values are arrays taken from
    the ``text`` and ``category`` columns of the filtered frame.
    """
    messages = pd.read_csv('corporate_messaging.csv', encoding='latin-1')

    fully_confident = messages["category:confidence"] == 1
    not_excluded = messages['category'] != 'Exclude'
    kept = messages[fully_confident & not_excluded]

    return kept["text"].values, kept["category"].values


def tokenize(text):
    """Turn a message into a list of normalized tokens.

    Every URL matched by ``url_regex`` is replaced with the literal
    ``urlplaceholder``; the text is then word-tokenized and each token is
    lemmatized, lower-cased and stripped of surrounding whitespace.
    """
    for found_url in re.findall(url_regex, text):
        text = text.replace(found_url, "urlplaceholder")

    wordnet = WordNetLemmatizer()
    return [
        wordnet.lemmatize(word).lower().strip()
        for word in word_tokenize(text)
    ]


def model_pipeline():
    """Assemble the classification pipeline.

    Features are the union of a bag-of-words/TF-IDF branch (using
    ``tokenize``) and the ``StartingVerbExtractor`` flag; they feed a
    ``RandomForestClassifier`` registered under the step name ``'clf'``.
    """
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    combined_features = FeatureUnion([
        ('text_pipeline', text_branch),
        ('starting_verb', StartingVerbExtractor()),
    ])

    return Pipeline([
        ('features', combined_features),
        ('clf', RandomForestClassifier()),
    ])


def display_results(y_test, y_pred):
    """Print the labels, confusion matrix and accuracy of a prediction run.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y_test``.
    """
    # Coerce to arrays so the elementwise comparison below also works when
    # plain lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use the labels seen in either array: taking them from y_pred alone
    # would drop any true class the model never predicted from the matrix.
    labels = np.unique(np.concatenate([y_test, y_pred]))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main(random_state=None):
    """Load the data, train the pipeline and print evaluation results.

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the train/test split and for the pipeline's ``'clf'`` step.
        With the default ``None`` the run is unseeded, exactly as when the
        function is called without arguments.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    model = model_pipeline()
    if random_state is not None:
        # Seed the classifier as well so the whole run is repeatable.
        model.set_params(clf__random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)

main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 93   1  16]
 [  0  18   5]
 [ 13   0 455]]
Accuracy: 0.941763727121
