In [1]:
import pandas as pd
import re
import sklearn
import pickle

# **Cleaning data:**

Such as URLs, hashtags, mentions, special characters, punctuations

In [2]:
df=pd.read_csv('Final_Categorized.csv')
df['Resume'][0]

'Education Details \r\nAugust 2010 to May 2017 BE Electronics & Communication Jabalpur, Madhya Pradesh Takshshila institute of technology\r\nJava developer \r\n\r\n\r\nSkill Details \r\nJava, Javascript,- Exprience - 6 monthsCompany Details \r\ncompany - Wab It Softwere Pvt.  Ltd.\r\ndescription - Jr. Java Developer'

In [3]:
def clean_text(text):
    clean_txt = text
    # 2. Remove backlash since experience includes date in dd\mm\yy
    clean_txt = re.sub(r'\\', ' ', clean_txt)
    # 3. Remove common metadata/headers (RT, cc, etc.)
    clean_txt = re.sub(r'RT|cc|CC|rt', '', clean_txt)
    # 4. Remove URLs
    clean_txt = re.sub(r'http\S+', '', clean_txt)
    # 5. Remove mentions
    clean_txt = re.sub(r'@[A-Za-z0-9]+', '', clean_txt)
    # 6. Remove hashtags
    clean_txt = re.sub(r'#', '', clean_txt)
    # 7. Remove special characters and punctuation (use on clean_txt)
    clean_txt = re.sub(r'[^A-Za-z0-9\s]+', '', clean_txt)
    # 8. Remove extra spaces
    clean_txt = re.sub(r'\s+', ' ', clean_txt).strip() # .strip() removes leading/trailing space
    # 9 I found NaN in some resumes
    clean_txt = re.sub(r'NaN', ' ', clean_txt).strip() # .strip() removes leading/trailing space
    return clean_txt


In [4]:
df['Resume']=df['Resume'].apply(lambda x: clean_text(x))
df['Resume'][0]

#before converting into numbers
df['Category'].unique()

array(['Java Developer', 'Business Analyst', 'Enterprise Solutions & ERP',
       'Python Developer', 'Data Warehouse & BI Developer',
       'Database Developer', 'Database Administrator',
       'Systems Administrator', 'Web Developer', 'Cybersecurity Analyst',
       'Data Scientist & Analytics', 'HR & Administration',
       'Consulting & Professional Services', 'IT Security Engineer',
       'Freelance & Independent Contractor', 'Network Administrator',
       'Full Stack Developer', 'Mobile Developer', 'Network Engineer',
       'Software Engineer', 'IT Director & CIO', 'UI/UX Designer',
       'Information Security Analyst', 'IT Program/Portfolio Manager',
       'Frontend Developer', 'IT Project Manager'], dtype=object)

# **Converting Categories into numbers**

In [5]:
le=sklearn.preprocessing.LabelEncoder()
df['Category']=le.fit_transform(df['Category'])

#after converting into numbers
df['Category'].unique()



array([17,  0,  7, 21,  4,  6,  5, 23, 25,  2,  3, 11,  1, 15,  8, 19, 10,
       18, 20, 22, 12, 24, 16, 13,  9, 14])

# **Vectorization**

In [6]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english')
resume_tfidf = tfidf.fit_transform(df['Resume'])
print(resume_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1518062 stored elements and shape (17110, 146645)>
  Coords	Values
  (0, 52563)	0.12998499333347374
  (0, 46571)	0.46830469508700306
  (0, 26103)	0.20707938773585757
  (0, 10170)	0.1484181282722205
  (0, 11134)	0.15338609728058528
  (0, 52978)	0.12332785476322786
  (0, 35855)	0.09916783350921812
  (0, 71840)	0.26015032079858474
  (0, 78925)	0.23586138470615473
  (0, 99233)	0.1901560215487237
  (0, 131890)	0.27113792184665725
  (0, 69156)	0.10604191412507374
  (0, 133323)	0.06927114882053073
  (0, 71922)	0.25323252886723174
  (0, 46680)	0.10580754120668603
  (0, 122685)	0.156738945900587
  (0, 72634)	0.11537701782391677
  (0, 57300)	0.17341984443073769
  (0, 87307)	0.17341984443073769
  (0, 36177)	0.10643779865352977
  (0, 143199)	0.27113792184665725
  (0, 123932)	0.2520680208887404
  (0, 104781)	0.12806447309935784
  (0, 45042)	0.16199267859066932
  (0, 73698)	0.19212300721015754
  :	:
  (17109, 143269)	0.09119767540483527
 

# **Splitting**


In [7]:
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(resume_tfidf, df['Category'], test_size=0.2, random_state=42)

# **Building by using ML**

In [8]:
#Model#1

clf1 = sklearn.svm.SVC(kernel='linear',probability=True)
clf1.fit(x_train, y_train)

ypred1 = clf1.predict(x_test)

print(ypred1)
print(sklearn.metrics.accuracy_score(y_test, ypred1))

[13 13  9 ...  2  9  6]
0.9845119812974868


In [9]:
#Model#2

clf2=sklearn.linear_model.LogisticRegression()
clf2.fit(x_train, y_train)
ypred2=clf2.predict(x_test)
print(ypred2)
print(sklearn.metrics.accuracy_score(y_test, ypred2))

[13 14  9 ...  2  9  6]
0.8763880771478667


# **Saving Model**

In [10]:
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))
pickle.dump(clf1, open('clf1.pkl', 'wb'))
pickle.dump(clf2, open('clf2.pkl', 'wb'))
pickle.dump(le, open('le.pkl', 'wb'))
pickle.dump(x_test, open('x_test.pkl', 'wb'))
pickle.dump(y_test, open('y_test.pkl', 'wb'))

sample_resume= "Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), R, Sql, Spark, Scala. * Machine learning: Deep Learning, CNN, RNN, Transformers, Regression, SVM, Random Forest, Ensemble Methods, Natural Language processing, Dimensionality reduction, Cluster Analysis, Time Series Analysis. * Database Visualizations: Mysql, PostgresSQL, MongoDB, Tableau, PowerBI, D3.js. * Others: Regular Expression, HTML, CSS, Git, Docker, Kubernetes, AWS, GCP, computer vision - OpenCV. Education Details Master of Science in Data Science, Stanford University, 2018. Bachelor of Technology in Computer Science, IIT Delhi, 2016. Professional Experience Senior Data Scientist - Tech Innovations Co. (2018 - Present) Skill Details PYTHON- Exprience - 60 months R- Exprience - 36 months SPARK- Exprience - 24 months Company Details company - Tech Innovations Co. description - Led the development of a state-of-the-art recommendation engine using collaborative filtering and deep learning, resulting in a 25% increase in user engagement. Developed and deployed NLP models for sentiment analysis on customer feedback, improving product iteration speed by 40%. Managed a cloud-based data pipeline (AWS/GCP) processing over 1TB of data daily. Mentor junior data scientists on best practices for model building, validation, and deployment."


# **Prediction System**

In [11]:
cleaned_resume = clean_text(sample_resume)
vectorized_resume = tfidf.transform([cleaned_resume])

prediction1 = clf1.predict(vectorized_resume)[0]
prediction2 = clf2.predict(vectorized_resume)[0]
category_mapping = dict(enumerate(le.classes_))

result1 = category_mapping.get(prediction1, 'Unknown')
result2 = category_mapping.get(prediction1, 'Unknown')
print(f"\nPredicted Job by using ML (Model#1): {result1}")
result1 = category_mapping.get(prediction1, 'Unknown')
print(f"\nPredicted Job by using ML (Model#2): {result2}")


Predicted Job by using ML (Model#1): Data Scientist & Analytics

Predicted Job by using ML (Model#2): Data Scientist & Analytics
