In [150]:
FILE_PATH='UpdatedResumeDataSet.csv'

# Import Libraries & Load Dataset

In [149]:
import pandas as pd
import numpy as np 
import plotly
import plotly.express as px
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import pickle

# -- Settings Plotly template

In [151]:
template_style = "plotly_white"

### Load DataFrame

In [152]:
df=pd.read_csv(FILE_PATH)
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [153]:
df.isnull().sum()

Category    0
Resume      0
dtype: int64

# Explore Dataset

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 7.6+ KB


In [175]:
df['Resume'].sample(1).iloc[0]

'TECHNICAL STRENGTHS Computer Language Java J2EE Swift HTML Shell script MySQL Databases MySQL Tools SVN Jenkins Hudson Weblogic12c Software Android Studio Eclipse Oracle Xcode Operating Systems Win 10 Mac High Sierra Education Details June 2016 B E Information Technology Goregaon MAHARASHTRA IN Vidyalankar Institute of Technology May 2013 Mumbai Maharashtra Thakur Polytechnic May 2010 Mumbai Maharashtra St John s Universal School Java developer Java developer Tech Mahindra Skill Details JAVA Exprience 21 months MYSQL Exprience 21 months DATABASES Exprience 17 months J2EE Exprience 17 months ANDROID Exprience 6 monthsCompany Details company Tech Mahindra description Team Size 5 Environment Java Mysql Shell script Webserver Jenkins Description OR Formatter is an application which takes the input file as Geneva Modified File GMF from Geneva server and reads the data to generate Bill backup and Bill Invoices for Client customers of BT These invoices would be sent across to all the clients

### Cleaning text

In [156]:
def cleaning_text(text): 
    cleantext=re.sub('http\S+\s',' ', text)
    cleantext = re.sub('RT|cc', ' ',cleantext)
    cleantext=re.sub('#\S+\s',' ',cleantext)
    cleantext = re.sub('@\S+', '  ', cleantext)
    cleantext = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleantext)
    cleantext = re.sub(r'[^\x00-\x7f]', ' ', cleantext) 
    cleantext = re.sub('\s+', ' ', cleantext) 
    
    return cleantext

In [157]:
df['Resume']=df['Resume'].apply(lambda x:cleaning_text(x))

In [176]:
df['Resume'].sample(1).iloc[0]

'Software Proficiency Languages Basics of C SQL PL SQL JAVA JAVAEE Javascript HTML CSS jquery mysql Spring Hibernate Software Tools Xillinx Modelsim Matlab Multisim Operating Systems Windows XP Vista 07 08 Ubuntu Project Profile B E Project FPGA Implementation of Team Size 4 Role Programmer AES Algorithm AES is Advanced Encryption Standard which is used in cryptography by which we can protect our data It encrypted by a Secret Key T E project Sorting Robot Team Size 3 Role Mechanism designer The TCS 230 sensor sorts the RGB color balls a ording to their color Diploma Project RFID Based Student Team Size 4 Role Interface Attendance System Using GSM In this student show RFID card of his own and then message send via GSM to their parent that his ward is present Education Details May 2016 B E Savitribai Phule Pune Maharashtra Pune University March 2010 S S C Maharashtra Board DevOps Engineer Skill Details C Exprience 6 months C Exprience 6 months Sql Exprience 6 months Pl Sql Exprience 6 mo

In [159]:
df_categort=df['Category'].value_counts().reset_index().sort_values('Category',ascending=False)

### Visualization

In [160]:
# Create Chart
fig = px.bar(df_categort,
              x='index',
              y='Category',
              template = template_style,
              title= '<b>Number of jobs</b>')

# Display Plot
fig.show()

#### Get a view of unique values in column, e.g. 'Category'

In [161]:
df['Category'].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

### convert categort to Numbers

In [162]:
le = LabelEncoder()
le.fit(df['Category'])
df['Category'] = le.transform(df['Category'])

In [163]:
df['Category'].unique()

array([ 6, 12,  0,  1, 24, 16, 22, 14,  5, 15,  4, 21,  2, 11, 18, 20,  8,
       17, 19,  7, 13, 10,  9,  3, 23])

In [164]:
# Map category ID to category name
category_mapping = {
    15: "Java Developer",
    23: "Testing",
    8: "DevOps Engineer",
    20: "Python Developer",
    24: "Web Designing",
    12: "HR",
    13: "Hadoop",
    3: "Blockchain",
    10: "ETL Developer",
    18: "Operations Manager",
    6: "Data Science",
    22: "Sales",
    16: "Mechanical Engineer",
    1: "Arts",
    7: "Database",
    11: "Electrical Engineering",
    14: "Health and fitness",
    19: "PMO",
    4: "Business Analyst",
    9: "DotNet Developer",
    2: "Automation Testing",
    17: "Network Security Engineer",
    21: "SAP Developer",
    5: "Civil Engineer",
    0: "Advocate",
}

In [165]:
category_mapping.values()

dict_values(['Java Developer', 'Testing', 'DevOps Engineer', 'Python Developer', 'Web Designing', 'HR', 'Hadoop', 'Blockchain', 'ETL Developer', 'Operations Manager', 'Data Science', 'Sales', 'Mechanical Engineer', 'Arts', 'Database', 'Electrical Engineering', 'Health and fitness', 'PMO', 'Business Analyst', 'DotNet Developer', 'Automation Testing', 'Network Security Engineer', 'SAP Developer', 'Civil Engineer', 'Advocate'])

### convert Resume text to TFIDF

In [177]:

tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(df['Resume'])
requredTaxt  = tfidf.transform(df['Resume'])

## spliting Data

In [178]:
X_train, X_test, y_train, y_test = train_test_split(requredTaxt, df['Category'], test_size=0.2, random_state=42)

### Create Model for Predictaion

In [179]:
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

### show results predict

In [180]:
d_f=[]
for i in y_pred:
    d_f.append(category_mapping.get(i, "Unknown"))
    
d_f[:10]

['Java Developer',
 'Java Developer',
 'Java Developer',
 'Hadoop',
 'Health and fitness',
 'Network Security Engineer',
 'Mechanical Engineer',
 'Automation Testing',
 'Advocate',
 'Health and fitness']

In [181]:
#accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
print(classification_report(y_test,y_pred))
              

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         4
           5       1.00      1.00      1.00         9
           6       1.00      0.60      0.75         5
           7       1.00      1.00      1.00         8
           8       1.00      0.93      0.96        14
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         7
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00        12
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00         7
          15       1.00      1.00      1.00        15
          16       1.00      1.00      1.00         8
          17       1.00    

### save model and TFIDF

In [171]:
pickle.dump(tfidf,open('tfidf.pkl','wb'))
pickle.dump(clf, open('clf.pkl', 'wb'))

#### open file resume

In [172]:
with open('mahmoudibrahim.TXT','r')as f:
    data=f.read()
data    

"I am a data scientist specializing in machine\nlearning, deep learning, and computer vision. With\na strong background in mathematics, statistics,\nand programming, I am passionate about\nuncovering hidden patterns and insights in data.\nI have extensive experience in developing\npredictive models, implementing deep learning\nalgorithms, and designing computer vision\nsystems. My technical skills include proficiency in\nPython, Sklearn, TensorFlow, and PyTorch.\nWhat sets me apart is my ability to effectively\ncommunicate complex concepts to diverse\naudiences. I excel in translating technical insights\ninto actionable recommendations that drive\ninformed decision-making.\nIf you're looking for a dedicated and versatile data\nscientist to collaborate on impactful projects, I am\neager to contribute my expertise. Let's harness the\npower of data together to unlock new possibilities\nand shape a better future.\nContact & Sources\nEmail: mahmoudibrahim@gmail.com\nPhone: 01149973327\nGith

In [173]:
def Resume(cv):
    for i in range(len(cv)):
        cleaned_resume = cleaning_text(cv)
        # Transform the cleaned resume using the trained TfidfVectorizer
        input_features = tfidf.transform([cleaned_resume])
        # Make the prediction using the loaded classifier
        prediction_id = clf.predict(input_features)[0]
        
        category_name = category_mapping.get(prediction_id, "Unknown")
        
        print("Predicted Category:", category_name)
        predictions = clf.predict_proba(input_features)
        DF=pd.DataFrame(predictions,columns=sorted(category_mapping.values()))
        
        return DF
        
        
        
        
    

In [182]:
Resume(data)

Predicted Category: Data Science


Unnamed: 0,Advocate,Arts,Automation Testing,Blockchain,Business Analyst,Civil Engineer,Data Science,Database,DevOps Engineer,DotNet Developer,...,Java Developer,Mechanical Engineer,Network Security Engineer,Operations Manager,PMO,Python Developer,SAP Developer,Sales,Testing,Web Designing
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
