In [130]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [131]:
from preprocessing_NLP import pipeline

resume_df = pipeline('UpdatedResumeDataSet_T1_7.csv', feature_name='Resume')
resume_df = resume_df.reset_index(drop=True)

In [132]:
resume_df

Unnamed: 0,Category,Resume
0,Data Science,qwtnrvduof education detail may 2013 may 2017 ...
1,Data Science,qwtnrvduof area interest deep learn control sy...
2,Data Science,skill r python sap hana tableau sap hana sql s...
3,Data Science,education detail mca ymcaust faridabad haryana...
4,Data Science,skill c basic iot python matlab data science m...
...,...,...
183,Testing,skill set o window xp 7 8 8bntgbqlmkk1 10 data...
184,Testing,good logical analytical skill positive attitud...
185,Testing,personal skill quick learner eagerness learn n...
186,DevOps Engineer,core skill project program management agile sc...


In [133]:
X = resume_df['Resume']
Y = resume_df['Category']

In [134]:
encoder = LabelEncoder()
tfidf_vectorizer = TfidfVectorizer()

In [135]:
y = encoder.fit_transform(Y)
x = tfidf_vectorizer.fit_transform(X)

In [136]:
y

array([ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  1,  1,  1,  1,  1,  1, 24, 24, 24, 24, 24, 16, 16, 16, 16, 16,
       22, 22, 22, 22, 22, 22, 14, 14, 14, 14, 14, 14, 14,  5,  5,  5,  5,
        5,  5,  5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
        4,  4,  4,  4,  4,  4,  4, 21, 21, 21, 21, 21, 21, 21,  2,  2,  2,
        2,  2,  2,  2, 11, 11, 11, 11, 11, 11, 18, 18, 18, 18, 20, 20, 20,
       20, 20, 20, 20,  8,  8,  8,  8,  8,  8,  8, 17, 17, 17, 17, 17, 17,
       19, 19, 19, 19,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, 13, 13,
       13, 13, 13, 13, 13, 13, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,
        9,  9,  9,  3,  3,  3,  3,  3, 23, 23, 23, 23, 23, 23, 23, 23,  8,
        4])

In [137]:
x.shape

(188, 7106)

In [138]:
x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y,
    shuffle=True, 
    test_size=0.3, 
    random_state=42
)

In [139]:
x_train.shape

(131, 7106)

In [140]:
bayes = BernoulliNB()

In [141]:
bayes.fit(x_train, y_train)

In [142]:
y_pred = bayes.predict(x_test)
accuracy_score(y_test, y_pred)

0.08771929824561403

In [153]:
train_pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', BernoulliNB())
])

param_grid = {
    'vect__max_features': (50, 100, 1000, 2000),
    'vect__ngram_range': ((1, 1), (1, 2), (1,3), (1, 4), (1, 5)),
    'vect__norm': ('l1', 'l2', None),
}

In [154]:
grid_search = GridSearchCV(train_pipeline, param_grid=param_grid, verbose=3, scoring='accuracy', cv=5)
grid_search.fit(X, Y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l1;, score=0.211 total time=   0.0s
[CV 2/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l1;, score=0.237 total time=   0.0s
[CV 3/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l1;, score=0.368 total time=   0.0s
[CV 4/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l1;, score=0.378 total time=   0.0s
[CV 5/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l1;, score=0.351 total time=   0.0s
[CV 1/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l2;, score=0.211 total time=   0.0s
[CV 2/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l2;, score=0.237 total time=   0.0s
[CV 3/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l2;, score=0.368 total time=   0.0s




[CV 4/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l2;, score=0.378 total time=   0.0s
[CV 5/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=l2;, score=0.351 total time=   0.0s
[CV 1/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=None;, score=0.211 total time=   0.0s
[CV 2/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=None;, score=0.237 total time=   0.0s
[CV 3/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=None;, score=0.368 total time=   0.0s
[CV 4/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=None;, score=0.378 total time=   0.0s
[CV 5/5] END vect__max_features=50, vect__ngram_range=(1, 1), vect__norm=None;, score=0.351 total time=   0.0s
[CV 1/5] END vect__max_features=50, vect__ngram_range=(1, 2), vect__norm=l1;, score=0.211 total time=   0.0s
[CV 2/5] END vect__max_features=50, vect__ngram_range=(1, 2), vect__norm=l1;, score=0.211 total time=   0.0s
[CV 3/5] 

In [155]:
print("Accuracy", grid_search.best_score_, "\nParams", grid_search.best_params_)

Accuracy 0.42574679943101 
Params {'vect__max_features': 1000, 'vect__ngram_range': (1, 3), 'vect__norm': 'l1'}
