In [1]:
import pandas as pd
import numpy as np
import sklearn
pd.set_option('display.max_colwidth', 1000)

In [2]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/data.csv", index_col=0)
y = pd.read_csv("data/labels.csv", index_col=0)

# Drop NAs
df.dropna(how='all', inplace=True)
y.dropna(how='all', inplace=True)

# The labeled data
X = df.loc[y.index]

# The unlabeled data
unlabeled = df.loc[~df.index.isin(y.index)]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [3]:
print(f"{X_train.shape[0]} training examples and {X_test.shape[0]} validation examples.")

104 training examples and 35 validation examples.


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.stem.snowball import SnowballStemmer

#nltk.download()

stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """Source: building Machine Learning Systems with Python, 2nd ed."""
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD

stemmedVectorizer = StemmedCountVectorizer(lowercase=True, stop_words='english', analyzer='word', ngram_range=(2, 2))
transformer = TfidfTransformer(use_idf=True)


pipeline = Pipeline([
    # Use ColumnTransformer to combine the features from subject and body
    ('union', ColumnTransformer(
        [
            # budget column
            ('budget', StandardScaler(), ['budget']),

            # snippet column
            ('snippet_vec', Pipeline([
                ('stemVec', stemmedVectorizer),
                ('tfidf', transformer),
                ('best', TruncatedSVD(n_components=50)),
            ]), 'snippet'),
        ]
    )),

    # Classifier
    ('svc', LinearSVC(dual=False)),
], verbose=True)

text_clf = pipeline.fit(X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing union, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing svc, total=   0.0s


In [6]:
from sklearn.metrics import classification_report
y_pred = text_clf.predict(X_test)
target_names = ['class 0', 'class 1', 'class 2']
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).round(2).T

Unnamed: 0,precision,recall,f1-score,support
Bad,0.62,0.94,0.74,17.0
Good,0.67,0.46,0.55,13.0
Maybe,0.0,0.0,0.0,5.0
accuracy,0.63,0.63,0.63,0.63
macro avg,0.43,0.47,0.43,35.0
weighted avg,0.55,0.63,0.56,35.0


In [7]:
#
# Predict new jobs
#

unlabeled['predicted'] = text_clf.predict(unlabeled)

In [8]:
unlabeled['predicted'].value_counts()

Bad      2165
Good      296
Maybe       7
Name: predicted, dtype: int64

In [9]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)