In [8]:
import spacy #using spacy module for vectorization
nlp= spacy.load('en_core_web_lg') #en_core_web_lg is the pre-trained model by spacy
#referred from "https://spacy.io/usage/spacy-101/"

In [9]:
import pandas as pd
import numpy as np
import re
import string
import unidecode

In [10]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mogit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
df=pd.read_csv("UpdatedResumeDataSet.csv") #dataset downloaded from the kaggle "https://www.kaggle.com/datasets/jillanisofttech/updated-resume-dataset"

In [13]:
df.shape

(962, 2)

In [14]:
df.head()


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [15]:
df.Category.value_counts()

Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: Category, dtype: int64

In [12]:
#we need to pre-process and clean the data
def clean_words(text):
    """Basic cleaning of texts"""
    
    # remove html
    text=re.sub("(<.*?>)","",text)
    
    #remove non-ascii and digits
    text=re.sub("(\\W|\\d)"," ",text)
    
    #remove whitespace
    text=text.strip()
    
    #removing single charcters pattern
    text=re.sub(pattern='\s+[a-zA-Z]\s+' ,repl=" ", string=text)
    
    #remove accented characters
    text=unidecode.unidecode(text) #we have accented characters like a^ etc, so to remove that we are performing 
    
    #to make words into lowercase
    text=text.lower()
    
    #removing stop words from the paragraph
    words = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    text = " ".join(words)
    
    #here we are avoiding tokenization since the spacy's model takes string or doc as input not list of words
    # also we are not performing stemming and lemmatization since it will change the context of skills and other words in resume text
    
    
    return text

In [16]:
df['cleaned_text']=df.Resume.map(lambda x: clean_words(x))

In [17]:
df.head()

Unnamed: 0,Category,Resume,cleaned_text
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may may e uit rgpv data scie...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills python sap hana tableau sap hana sql sa...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


Now that we have cleaned the text it is ready to vectorize

In [18]:
df['vectorized_data']=df.cleaned_text.apply(lambda text: nlp(text).vector)

In [19]:
df.head()

Unnamed: 0,Category,Resume,cleaned_text,vectorized_data
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,"[0.00019259728, -0.063411176, 0.022081006, 0.5..."
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may may e uit rgpv data scie...,"[-0.38565177, 0.41584927, -0.22689901, 0.05342..."
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...,"[-0.34571993, 0.064293616, -0.47398934, 0.1207..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills python sap hana tableau sap hana sql sa...,"[-0.06141665, -0.009636918, -0.60103905, 0.345..."
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...,"[0.102590054, -0.23031569, -0.4390396, -0.2980..."


The pre-processing and vectoriztion part is done, Now we will proceed with fitting a best model for the dataset and check if it accurately predicts the results

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [21]:
x=df['vectorized_data']
y=df['Category']

In [22]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1049
)

In [23]:
knn=KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean') #using 5 as K 

In [24]:
#note that the vectorized array was embeded as numpy array so inorder to make it ready for analysis we need to flatten it out
x_train=np.stack(x_train)
x_test=np.stack(x_test)

In [25]:
knn.fit(x_train,y_train)
y_predtest=knn.predict(x_test)

In [26]:
print(classification_report(y_test, y_predtest))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         6
                     Arts       0.80      1.00      0.89         8
       Automation Testing       1.00      0.50      0.67         4
               Blockchain       1.00      1.00      1.00        10
         Business Analyst       0.57      0.80      0.67         5
           Civil Engineer       1.00      0.50      0.67        10
             Data Science       1.00      1.00      1.00        15
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       0.77      0.91      0.83        11
         DotNet Developer       1.00      0.82      0.90        11
            ETL Developer       0.79      1.00      0.88        11
   Electrical Engineering       0.75      1.00      0.86        12
                       HR       1.00      0.47      0.64        15
                   Hadoop       1.00      1.00      1.00     

#Now we will check fitting the data with an ensemble classification model Gradient boosting classifier and check the accuracy and F1 score

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

In [28]:
gbc=GradientBoostingClassifier(learning_rate=0.1,n_estimators=100)

In [29]:
gbc.fit(x_train,y_train)
y_predtestgbc=gbc.predict(x_test)

In [30]:
print(classification_report(y_test, y_predtestgbc))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         6
                     Arts       1.00      1.00      1.00         8
       Automation Testing       1.00      1.00      1.00         4
               Blockchain       1.00      1.00      1.00        10
         Business Analyst       1.00      1.00      1.00         5
           Civil Engineer       1.00      1.00      1.00        10
             Data Science       1.00      0.73      0.85        15
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      1.00      1.00        11
         DotNet Developer       1.00      1.00      1.00        11
            ETL Developer       1.00      1.00      1.00        11
   Electrical Engineering       1.00      1.00      1.00        12
                       HR       1.00      1.00      1.00        15
                   Hadoop       1.00      1.00      1.00     

#wow ! the results were great when compared to KNN

In [43]:
with open ("resumedata.txt", 'r') as file: #resumedata.txt is an output file from the "resume parser driver code" program
    data=file.read()

In [44]:
data

"{'name': 'ACCENTURE Application', 'email': 'mogithmogi3011@gmail.com', 'mobile_number': '8825619388', 'skills': ['Operations', 'Python', 'Pharmaceutical', 'Windows', 'Schedule', 'Engineering', 'Assembly', 'Programming', 'Os', 'C', 'System', 'Sql', 'Cloud', 'Servers', 'Itil', 'Word', 'Twitter', 'P', 'Electronics', 'Analysis', 'Automation', 'Aws', 'Improvement', 'Compliance', 'R', 'Process', 'Business stakeholders'], 'college_name': None, 'degree': ['Diploma in Mechanical Engineering', 'B.E. Mechanical Engineering'], 'designation': ['MongoDB', 'cloud computing', 'DELTA ELECTRONICS', 'Application Development Associate', 'Machine learning'], 'experience': [], 'company_names': ['Amazon'], 'no_of_pages': 1, 'total_experience': 0.0}"

We have string file, now we'll do the cleaning and pre-processing and make it ready for prediction.

In [45]:
vect_data=nlp(clean_words(data)).vector

In [46]:
vect_data

array([ 0.0806507 , -0.24738899,  0.83444464,  0.7565758 ,  1.7888505 ,
       -0.21318665,  0.59077734,  2.607889  , -2.3126903 , -0.39089495,
        4.1885962 ,  2.132107  , -3.8840594 ,  2.3537402 , -0.5531936 ,
        0.9311784 ,  2.540635  ,  1.8878503 , -1.9018221 , -0.11651053,
        0.2858188 ,  1.4764196 , -1.6714324 ,  1.0770444 , -1.5059736 ,
       -1.8305569 , -0.9204829 , -1.5873944 , -0.44437414,  0.43179452,
        0.07591999,  0.747539  , -1.1723514 ,  0.0474946 ,  1.0745327 ,
       -0.16827236,  0.9081156 ,  0.21104454,  1.2436647 ,  0.39408708,
        0.5805148 , -0.14812328, -0.16652891,  0.6432002 , -0.9368017 ,
        1.0201644 ,  1.2296575 , -2.1319969 , -0.19794177, -1.4271594 ,
       -0.28339338,  1.3461612 , -0.59750926, -2.2782116 , -1.2462511 ,
        0.71236134, -1.0881042 ,  1.2533371 ,  0.13487093, -1.361728  ,
        2.0396159 ,  1.594589  , -2.2090814 , -1.004091  ,  1.0161525 ,
        2.0857205 , -1.6362866 , -2.9381897 , -0.18468104,  1.74

Note that our output array is of 1D numpy array, but we need to feed 2D array as input, because model will consider only 2D. 
You can ignore this if you are gonna predict for a number of CV's. In our case we doing it for just one. so we are reshaping it
with .reshape(1,-1).

In [47]:
sample_pred=gbc.predict(vect_data.reshape(1, -1))

In [48]:
sample_pred

array(['Blockchain'], dtype=object)

! voila it got predicted as blockchain ....we will keep working on tuning model and the data as well...