In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import preprocessing as pps



In [2]:
df = pd.read_csv("../../data/All_Data.csv", index_col=0)
df.shape

(1984, 8)

In [3]:
titles = pd.DataFrame(df['jobtitle'].value_counts())
titles = titles.sort_values(by='jobtitle', ascending = False)
titles

Unnamed: 0,jobtitle
Business Analyst,401
Product Manager,352
Data Analyst,282
Data Scientist,276
Database Administrator,273
Data Engineer,205
Data Architect,171
Machine Learning,24


In [None]:
titles.to_csv("../metrics/title_distribution.csv")

In [4]:
df = df[df["jobtitle"] != 'Machine Learning']
#df = df[df["jobtitle"] != 'Data Architect']
#df = df[df["jobtitle"] != 'Data Analyst']
#df = df[df["jobtitle"] != 'Data Engineer']

In [5]:
list(set(df['jobtitle']))

['Product Manager',
 'Data Engineer',
 'Data Analyst',
 'Data Scientist',
 'Data Architect',
 'Database Administrator',
 'Business Analyst']

In [22]:
cleaned = pps.raw_cleaning(df['snippet'], noun = False)
vectorizer = TfidfVectorizer(max_df = 0.99, min_df = 0.01, ngram_range=(1,2))
X = vectorizer.fit_transform(cleaned).toarray()

In [26]:
y = df['jobtitle']

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1, max_depth = None, n_estimators = 200, min_impurity_split=1e-07)
clf3 = GaussianNB()
clf4 = DecisionTreeClassifier(max_depth=4)
clf5 = KNeighborsClassifier(n_neighbors=7)
clf6 = SVC(kernel='rbf', probability=True)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('knn', clf4)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, clf4,clf5, clf6, eclf], ['Logistic Regerssion','Random Forest', 'naive Bayes', 'K neighbours',' Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.66 (+/- 0.01) [Logistic Regerssion]
Accuracy: 0.65 (+/- 0.02) [Random Forest]
Accuracy: 0.39 (+/- 0.02) [naive Bayes]
Accuracy: 0.42 (+/- 0.02) [K neighbours]
Accuracy: 0.58 (+/- 0.01) [ Ensemble]


In [31]:
clf2.fit(X, y)
words = list(vectorizer.vocabulary_.keys())
features = pd.DataFrame(words, columns=['word'])
features['importance'] = clf2.feature_importances_
features.sort_values(by='importance', ascending=False, inplace=True)
features.head()

Unnamed: 0,word,importance
169,consulting,0.037538
550,optimize,0.0362
190,growing,0.030633
85,role,0.0224
32,improvement,0.010254


In [32]:
features.to_csv("../metrics/feature_importance.csv")

In [33]:
features

Unnamed: 0,word,importance
169,consulting,0.037538
550,optimize,0.036200
190,growing,0.030633
85,role,0.022400
32,improvement,0.010254
48,disability,0.010174
672,innovative,0.010088
625,internal,0.009942
27,object,0.009793
267,technical product,0.008679


In [None]:
#Try topic modeling to get features (one for each topic)
#corpus is a two dimensional list of tuples (wordtoken, count).
def text2corpus(texts, method= 'count'):
    documents = pps.raw_cleaning(texts, False)
    texts = [[word for word in document.lower().split()] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    if method == 'tfidf':
        tfidf = models.TfidfModel(corpus)
        return tfidf[corpus]
    return corpus, dictionary
#result = text2corpus(df['snippet'])
corpus, dictionary = text2corpus(df['snippet'], 'count')
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
#get a feature table from the topic model
topic_features = pd.DataFrame()
for i in range(len(corpus)):
    one_row = {str(n):0 for n in range(100)}
    features = model[corpus[i]]
    for j in range(len(features)):
        one_row[str(features[j][0])] = features[j][1]
    topic_features = topic_features.append(pd.Series(one_row, name=str(i)))
topic_features.head()
topic_features.to_csv("../metrics/topic_features.csv")

In [None]:
X = topic_features.iloc[:,:100]
y = topic_features['jobtitle']

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1, n_estimators = 100)
clf3 = GaussianNB()
#clf4 = DecisionTreeClassifier(max_depth=4)
clf4 = KNeighborsClassifier(n_neighbors=7)
#clf6 = SVC(kernel='rbf', probability=True)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('knn', clf4)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, clf4, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'K neighbours',' Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))