In [None]:
import pandas as pd
import numpy as np
import sklearn
# Raise pandas' per-cell display width limit to 1000 characters.
pd.options.display.max_colwidth = 1000

In [None]:
from sklearn.model_selection import train_test_split

# Load the job postings (first CSV column is the index) and discard rows
# in which every column is missing.
df = pd.read_csv("data/data-out.csv", index_col=0).dropna(how='all')

# Rows with a value in "class" are labelled; the rest still need a prediction.
has_label = df['class'].notna()

classified = df[has_label]

# Keep jobs with no class and ignore the "class" column (it's empty).
# The column is removed by name so this does not depend on column order, and
# drop() returns an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
unclassified = df[~has_label].drop(columns='class')

# Features / target for the labelled rows, selected by name for the same
# reason: the label column is identified as "class", not "whatever is last".
X = classified.drop(columns='class')
y = classified['class']

# Stratified hold-out split (33% test) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Candidate feature columns (not referenced by the split above).
features = [
    'budget',
    'category2',
    'subcategory2',
    'job_type',
    'duration',
    'date_created',
    'skills',
    'client.feedback',
    'client.reviews_count',
    'client.jobs_posted',
    'client.payment_verification_status',
    'client.past_hires',
    'client.country',
]

In [None]:
# Number of labelled jobs per class (shows how balanced the labels are).
class_counts = classified["class"].value_counts()
class_counts

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.stem.snowball import SnowballStemmer

#nltk.download()

# Module-level English Snowball stemmer shared by every analyzer built below.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems each token it produces.

    Source: building Machine Learning Systems with Python, 2nd ed.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # Run the stock analyzer first, then stem every resulting token.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD

# Text vectorizer: lower-cased word bigrams with English stop words removed,
# each token stemmed by StemmedCountVectorizer.
stemmedVectorizer = StemmedCountVectorizer(
    lowercase=True,
    stop_words='english',
    analyzer='word',
    ngram_range=(2, 2),
)
transformer = TfidfTransformer(use_idf=True)


pipeline = Pipeline([
    # Use ColumnTransformer to combine the scaled numeric "budget" column with
    # the text features derived from the "snippet" column. Only these two
    # columns are listed, so they are the only inputs to the classifier.
    ('union', ColumnTransformer(
        [
            # budget column: standardised numeric feature
            ('budget', StandardScaler(), ['budget']),

            # snippet column: stemmed bigram counts -> TF-IDF -> 50 SVD components
            ('snippet_vec', Pipeline([
                ('stemVec', stemmedVectorizer),
                ('tfidf', transformer),
                # TruncatedSVD's default solver is randomized; pin the seed so
                # repeated fits give the same components (and the same scores),
                # matching the seeded train/test split.
                ('best', TruncatedSVD(n_components=50, random_state=42)),
            ]), 'snippet'),
        ]
    )),

    # Classifier
    ('svc', LinearSVC(dual=False)),
], verbose=True)

# Pipeline.fit returns the fitted pipeline itself, so `text_clf` and
# `pipeline` refer to the same fitted object.
text_clf = pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Predict the held-out split.
y_pred = text_clf.predict(X_test)

# Display labels; note they are not passed to classification_report below.
target_names = ['class 0', 'class 1', 'class 2']

# Per-class metrics as a table: one row per class/average, rounded to 2 decimals.
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).round(2).transpose()

In [None]:
#
# Predict new jobs
#

# Attach the predicted class to every unlabelled job.
# assign() builds a new frame instead of writing a column into `unclassified`
# in place; `unclassified` was produced by slicing `df`, so in-place column
# assignment can raise pandas' SettingWithCopyWarning.
unclassified = unclassified.assign(predicted=text_clf.predict(unclassified))

In [None]:
# Number of unlabelled jobs assigned to each predicted class.
predicted_counts = unclassified["predicted"].value_counts()
predicted_counts

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for `pipeline`.
#
# Keys must spell out the path of step names down to the estimator being tuned:
#   'union' (ColumnTransformer) -> 'snippet_vec' (text sub-pipeline) -> 'stemVec' / 'tfidf'
#   'svc'   (LinearSVC classifier)
# The previous 'vect__*' / 'clf__*' keys named steps that do not exist in this
# pipeline, and 'alpha' / 'elasticnet' are SGDClassifier options that LinearSVC
# does not accept, so fitting the search would fail with an invalid-parameter
# error. The classifier entries below tune LinearSVC's own regularisation
# settings instead (C and the penalty type; both penalties work with dual=False).
parameters = {
    'union__snippet_vec__stemVec__max_df': (0.5, 0.75, 1.0),
    # 'union__snippet_vec__stemVec__max_features': (None, 5000, 10000, 50000),
    'union__snippet_vec__stemVec__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    # 'union__snippet_vec__tfidf__use_idf': (True, False),
    # 'union__snippet_vec__tfidf__norm': ('l1', 'l2'),
    'svc__C': (0.1, 1.0, 10.0),
    'svc__penalty': ('l1', 'l2'),
    # 'svc__max_iter': (1000, 5000),
}

# Exhaustive search over the grid using all available cores.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)