In [1]:
import pandas as pd
import numpy as np
import sklearn
import nltk

pd.set_option('display.max_colwidth', 1000)

In [2]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/data.csv", index_col=0)
y = pd.read_csv("data/labels.csv", index_col=0)

# Drop NAs
df.dropna(how='all', inplace=True)
y.dropna(how='all', inplace=True)

# The labeled data
X = df.loc[y.index]

# The unlabeled data
unlabeled = df.loc[~df.index.isin(y.index)]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [3]:
print(f"{X_train.shape[0]} training examples and {X_test.shape[0]} validation examples.")

108 training examples and 37 validation examples.


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

import string

import scipy 
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms input data by using NLTK tokenization, lemmatization, and
    other normalization and filtering techniques.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor, which make load corpora, models, or do
        other time-intenstive NLTK data loading.
        """
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """
        Fit simply returns self, no other information is needed.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation
        """
        return X

    def transform(self, X):
        """
        Actually runs the preprocessing on each document.
        """
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)



In [5]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

#
# Tokenization and stemming
#

stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Note: punctuation is completely ignored and always treated as a token separator by CountVectorizer
class StemmedCountVectorizer(CountVectorizer): 
    """Source: building Machine Learning Systems with Python, 2nd ed."""

    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD

stemmedVectorizer = StemmedCountVectorizer(lowercase=True,
                                           stop_words='english',
                                           analyzer='word',
                                           # tokenizer=,
                                           ngram_range=(2, 2),
                                           )

def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg

pipeline = Pipeline([
    # Use ColumnTransformer to combine the features from subject and body
    ('union', ColumnTransformer(
        [
            # budget column
            ('budget', StandardScaler(), ['budget']),

            # ('word_count', word_counter, ['snippet']),

            # snippet column
            ('snippet_vec', Pipeline([
                #('stemVec', stemmedVectorizer),
                ('preprocessor', NLTKPreprocessor()),
                ('tfidf', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, use_idf=True)),
                #('best', TruncatedSVD(n_components=50)),
            ]), 'snippet'),
        ]
    )),

    # Classifier
    ('svc', LinearSVC(dual=False)),
], verbose=True)

text_clf = pipeline.fit(X_train, y_train.values.ravel())

[Pipeline] ............. (step 1 of 2) Processing union, total=   1.2s
[Pipeline] ............... (step 2 of 2) Processing svc, total=   0.0s


In [12]:
from sklearn.metrics import classification_report

y_pred = text_clf.predict(X_test)
target_names = ['class 0', 'class 1', 'class 2']
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).round(2).T

Unnamed: 0,precision,recall,f1-score,support
Bad,0.76,0.72,0.74,18.0
Good,0.47,0.69,0.56,13.0
Maybe,0.0,0.0,0.0,6.0
accuracy,0.59,0.59,0.59,0.59
macro avg,0.41,0.47,0.44,37.0
weighted avg,0.54,0.59,0.56,37.0


In [8]:
#
# Predict new jobs
#

unlabeled['predicted'] = text_clf.predict(unlabeled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
unlabeled['predicted'].value_counts()

Bad      1962
Good      633
Maybe      18
Name: predicted, dtype: int64

In [11]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)