In [1]:
import pandas as pd
import re
import sklearn
import pickle

# **Cleaning data:**

Such as URLs, hashtags, mentions, special characters, punctuations

In [2]:
df=pd.read_csv("/home/izen-abbas/venv/LSTMs/Final_Categorized.csv")
df['Resume'][0]

'Education Details \r\nAugust 2010 to May 2017 BE Electronics & Communication Jabalpur, Madhya Pradesh Takshshila institute of technology\r\nJava developer \r\n\r\n\r\nSkill Details \r\nJava, Javascript,- Exprience - 6 monthsCompany Details \r\ncompany - Wab It Softwere Pvt.  Ltd.\r\ndescription - Jr. Java Developer'

In [3]:
def clean_text(text):
    if pd.isna(text):
        return ""
    clean_txt = str(text)
    clean_txt = re.sub(r'\S+@\S+', ' ', clean_txt)
    clean_txt = re.sub(r'http\S+', ' ', clean_txt)
    replacements = {
        "C++": "CPLUSPLUS", "c++": "CPLUSPLUS",
        "C#": "CSHARP", "c#": "CSHARP",
        ".NET": "DOTNET", ".net": "DOTNET",
        "Node.js": "NODEJS", "node.js": "NODEJS"
    }
    for k, v in replacements.items():
        clean_txt = clean_txt.replace(k, v)
    clean_txt = re.sub(r'[^A-Za-z0-9+\#\./\s]', ' ', clean_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt).strip()
    inv = {v: k.lower() for k, v in replacements.items()}
    for k, v in inv.items():
        clean_txt = clean_txt.replace(k, v)
    return clean_txt.lower().strip()


In [4]:
df['Resume']=df['Resume'].apply(lambda x: clean_text(x))
df['Resume'][0]

#before converting into numbers
df['Category'].unique()

array(['Java Developer', 'Business Analyst', 'Enterprise Solutions & ERP',
       'Python Developer', 'Data Warehouse & BI Developer',
       'Database Developer', 'Database Administrator',
       'Systems Administrator', 'Web Developer', 'Cybersecurity Analyst',
       'Data Scientist & Analytics', 'HR & Administration',
       'Consulting & Professional Services', 'IT Security Engineer',
       'Freelance & Independent Contractor', 'Network Administrator',
       'Full Stack Developer', 'Mobile Developer', 'Network Engineer',
       'Software Engineer', 'IT Director & CIO', 'UI/UX Designer',
       'Information Security Analyst', 'IT Program/Portfolio Manager',
       'Frontend Developer', 'IT Project Manager'], dtype=object)

# **Converting Categories into numbers**

In [5]:
le=sklearn.preprocessing.LabelEncoder()
df['Category']=le.fit_transform(df['Category'])

#after converting into numbers
df['Category'].unique()



array([17,  0,  7, 21,  4,  6,  5, 23, 25,  2,  3, 11,  1, 15,  8, 19, 10,
       18, 20, 22, 12, 24, 16, 13,  9, 14])

# **Vectorization**

In [6]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english')
resume_tfidf = tfidf.fit_transform(df['Resume'])
print(resume_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1866796 stored elements and shape (17110, 29922)>
  Coords	Values
  (0, 8720)	0.1309402842154917
  (0, 7601)	0.4831087842281492
  (0, 2706)	0.21862538495564318
  (0, 206)	0.09213379705841457
  (0, 215)	0.07338619559739704
  (0, 8860)	0.12755674906340367
  (0, 5819)	0.08613881681198299
  (0, 14011)	0.2746553611770998
  (0, 15970)	0.2490121619121687
  (0, 20623)	0.19880645212209477
  (0, 26039)	0.28625559109442017
  (0, 13373)	0.10895014649616216
  (0, 26264)	0.06826891055072465
  (0, 14088)	0.21706337769709697
  (0, 7632)	0.10944134221866895
  (0, 24351)	0.1613341864331558
  (0, 14105)	0.0632660252008347
  (0, 9737)	0.18308910733297792
  (0, 17355)	0.18308910733297792
  (0, 5837)	0.10785287240006153
  (0, 28687)	0.28625559109442017
  (0, 24627)	0.26612242147490894
  (0, 21268)	0.13376305365906602
  (0, 7553)	0.17062410183438148
  (0, 14377)	0.14522316256733567
  :	:
  (17109, 26021)	0.07043359448339516
  (17109, 18049)	0.0561

# **Splitting**


In [7]:
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(resume_tfidf, df['Category'], test_size=0.2, random_state=42)

# **Building by using ML**

In [8]:
#Model#1

clf1 = sklearn.svm.SVC(kernel='linear',probability=True)
clf1.fit(x_train, y_train)

ypred1 = clf1.predict(x_test)

print(ypred1)
print(sklearn.metrics.accuracy_score(y_test, ypred1))

[13 13  9 ...  2  9  6]
0.9374634716540035


In [9]:
#Model#2

clf2=sklearn.linear_model.LogisticRegression(max_iter=500)
clf2.fit(x_train, y_train)
ypred2=clf2.predict(x_test)
print("Logistic Regression Accuracy:")
print(sklearn.metrics.accuracy_score(y_test, ypred2))

Logistic Regression Accuracy:
0.8296317942723553


In [10]:
#Model#3 - Random Forest

from sklearn.ensemble import RandomForestClassifier

clf3 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf3.fit(x_train, y_train)
ypred3 = clf3.predict(x_test)
print("\nRandom Forest Accuracy:")
print(sklearn.metrics.accuracy_score(y_test, ypred3))


Random Forest Accuracy:
0.988895382817066


In [None]:
#Model Comparison

print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)

models = {
    'SVM (Linear)': sklearn.metrics.accuracy_score(y_test, ypred1),
    'Logistic Regression': sklearn.metrics.accuracy_score(y_test, ypred2),
    'Random Forest': sklearn.metrics.accuracy_score(y_test, ypred3),
}

for model_name, accuracy in sorted(models.items(), key=lambda x: x[1], reverse=True):
    print(f"{model_name:25} : {accuracy:.4f} ({accuracy*100:.2f}%)")

best_model_name = max(models, key=models.get)
print(f"\n✓ Best Model: {best_model_name} with accuracy {models[best_model_name]:.4f}")


MODEL COMPARISON
XGBoost                   : 0.9895 (98.95%)
Random Forest             : 0.9889 (98.89%)
SVM (Linear)              : 0.9375 (93.75%)
Logistic Regression       : 0.8296 (82.96%)

✓ Best Model: XGBoost with accuracy 0.9895


In [None]:
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))
pickle.dump(clf1, open('clf1.pkl', 'wb'))
pickle.dump(clf2, open('clf2.pkl', 'wb'))
pickle.dump(clf3, open('clf3_rf.pkl', 'wb'))
pickle.dump(le, open('le.pkl', 'wb'))
pickle.dump(x_test, open('x_test.pkl', 'wb'))
pickle.dump(y_test, open('y_test.pkl', 'wb'))

print("\n✓ All models saved successfully!")

sample_resume= "Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), R, Sql, Spark, Scala. * Machine learning: Deep Learning, CNN, RNN, Transformers, Regression, SVM, Random Forest, Ensemble Methods, Natural Language processing, Dimensionality reduction, Cluster Analysis, Time Series Analysis. * Database Visualizations: Mysql, PostgresSQL, MongoDB, Tableau, PowerBI, D3.js. * Others: Regular Expression, HTML, CSS, Git, Docker, Kubernetes, AWS, GCP, computer vision - OpenCV. Education Details Master of Science in Data Science, Stanford University, 2018. Bachelor of Technology in Computer Science, IIT Delhi, 2016. Professional Experience Senior Data Scientist - Tech Innovations Co. (2018 - Present) Skill Details PYTHON- Exprience - 60 months R- Exprience - 36 months SPARK- Exprience - 24 months Company Details company - Tech Innovations Co. description - Led the development of a state-of-the-art recommendation engine using collaborative filtering and deep learning, resulting in a 25% increase in user engagement. Developed and deployed NLP models for sentiment analysis on customer feedback, improving product iteration speed by 40%. Managed a cloud-based data pipeline (AWS/GCP) processing over 1TB of data daily. Mentor junior data scientists on best practices for model building, validation, and deployment."


✓ All models saved successfully!


# **Prediction System**

In [None]:
cleaned_resume = clean_text(sample_resume)
vectorized_resume = tfidf.transform([cleaned_resume])

# Get predictions from all models
prediction1 = clf1.predict(vectorized_resume)[0]
prediction2 = clf2.predict(vectorized_resume)[0]
prediction3 = clf3.predict(vectorized_resume)[0]

category_mapping = dict(enumerate(le.classes_))

print("\n" + "="*50)
print("SAMPLE RESUME PREDICTIONS")
print("="*50)
print(f"\nModel#1 (SVM Linear):             {category_mapping.get(prediction1, 'Unknown')}")
print(f"Model#2 (Logistic Regression):    {category_mapping.get(prediction2, 'Unknown')}")
print(f"Model#3 (Random Forest):          {category_mapping.get(prediction3, 'Unknown')}")


SAMPLE RESUME PREDICTIONS

Model#1 (SVM Linear):             Data Scientist & Analytics
Model#2 (Logistic Regression):    Python Developer
Model#3 (Random Forest):          Python Developer
Model#4 (XGBoost):                Data Scientist & Analytics
