In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the raw jobs dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=r"E:\jobs_dataset_with_features.csv")

In [3]:
# Preview the first rows of the raw dataset to see its columns.
df.head()

Unnamed: 0,Role,Features
0,Social Media Manager,5 to 15 Years Digital Marketing Specialist M.T...
1,Frontend Web Developer,"2 to 12 Years Web Developer BCA HTML, CSS, Jav..."
2,Quality Control Manager,0 to 12 Years Operations Manager PhD Quality c...
3,Wireless Network Engineer,4 to 11 Years Network Engineer PhD Wireless ne...
4,Conference Manager,1 to 12 Years Event Manager MBA Event planning...


In [4]:
# Size of the raw dataset as a (rows, columns) tuple.
df.shape

(1615940, 2)

In [7]:
# Discard rare roles: every class with fewer than `min_count` rows is removed,
# and the surviving rows get a fresh 0..n-1 index.
min_count = 6500
role_counts = df['Role'].value_counts()
dropped_classes = role_counts.loc[role_counts.lt(min_count)].index
filtered_df = df.loc[~df['Role'].isin(dropped_classes)].reset_index(drop=True)

# Show the updated per-role counts that remain after filtering.
filtered_df['Role'].value_counts()

Role
Interaction Designer          20580
Network Administrator         17470
User Interface Designer       14036
Social Media Manager          13945
User Experience Designer      13935
                              ...  
Benefits Coordinator           6839
Research Analyst               6830
Administrative Coordinator     6803
IT Support Specialist          6799
UI/UX Designer                 6743
Name: count, Length: 61, dtype: int64

In [10]:
# Number of distinct roles (target classes) left after filtering.
filtered_df['Role'].nunique()

61

In [11]:
# Continue with a 10,000-row random subset of the filtered data (presumably to
# keep vectorizing and training fast). `random_state` pins the draw: without it
# every kernel restart samples different rows, so the displayed rows, the
# accuracy and the saved model could not be reproduced.
df = filtered_df.sample(n=10000, random_state=42)

In [12]:
# Preview the sampled subset; the index labels are row labels carried over
# from the frame the sample was drawn from.
df.head()

Unnamed: 0,Role,Features
449764,Network Administrator,5 to 15 Years Network Engineer MBA Network man...
479250,Research Analyst,2 to 14 Years Research Scientist PhD Data coll...
490913,Social Media Manager,3 to 13 Years Digital Marketing Specialist BCA...
13231,Retirement Planner,1 to 9 Years Financial Advisor M.Com Retiremen...
227316,Customer Support Specialist,5 to 8 Years Customer Success Manager BBA Cust...


In [16]:
# Model input is the free-text 'Features' column; the target is the 'Role' label.
x, y = df['Features'], df['Role']

In [17]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Learn the TF-IDF vocabulary and weights from the training text only, then
# encode the test text with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf, x_test_tfidf = (
    tfidf_vectorizer.fit_transform(x_train),  # evaluated first: fits, then encodes train
    tfidf_vectorizer.transform(x_test),       # evaluated second: reuses the fitted state
)

In [19]:
# Fit a random forest on the TF-IDF training features. `random_state` fixes the
# forest's internal randomness so that re-running the notebook yields the same
# model and the same score (the seed matches the one used for the split).
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(x_train_tfidf, y_train)

# Evaluate on the held-out test split.
# NOTE(review): a perfect score on 61 classes is unusual -- worth confirming
# that the 'Features' text does not effectively contain the label.
y_pred = rf_classifier.predict(x_test_tfidf)
print('accuracy : ',accuracy_score(y_test,y_pred))

accuracy :  1.0


In [20]:
import re
def cleanResume(txt):
    """Normalize raw resume text before it is vectorized.

    Steps, in order: drop URLs, drop the standalone tokens "RT" and "cc",
    drop hashtags and @-mentions, replace ASCII punctuation and non-ASCII
    characters with spaces, then collapse every whitespace run to one space.

    Args:
        txt: The raw resume text (str).

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.

    Raises:
        TypeError: If ``txt`` is not a string (raised by ``re.sub``).
    """
    # URLs: no trailing-whitespace requirement, so a link that ends the text
    # is removed as well.
    cleanText = re.sub(r'http\S+', ' ', txt)
    # Only whole-word "RT"/"cc" tokens; without the word boundaries ordinary
    # words are damaged ("accuracy" -> "a uracy", "SMART" -> "SMA ").
    cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
    # Hashtags, including one at the very end of the text.
    cleanText = re.sub(r'#\S+', ' ', cleanText)
    # Mentions / the domain part of e-mail addresses.
    cleanText = re.sub(r'@\S+', '  ', cleanText)
    # Every ASCII punctuation character becomes a space.
    cleanText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    # Anything outside 7-bit ASCII (emoji, typographic quotes, ...) becomes a space.
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
    # Collapse the runs of whitespace produced by the substitutions above.
    cleanText = re.sub(r'\s+', ' ', cleanText)
    return cleanText



def predict_category(resume_text):
    """Predict the job role for one resume.

    The text is cleaned with ``cleanResume``, encoded with the module-level
    ``tfidf_vectorizer`` and classified with the module-level ``rf_classifier``.

    Args:
        resume_text: Raw resume text.

    Returns:
        The first (and only) label returned by the classifier for this text.
    """
    cleaned = cleanResume(resume_text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned])
    predictions = rf_classifier.predict(features)
    return predictions[0]

In [21]:
# Sample resume text used to try out predict_category by hand.
# NOTE(review): the literal opens with four quote characters, so the text
# itself starts with a stray '"' -- looks like a typo; confirm before changing.
resume_file = """"📍 New York, USA | 📧 johndoe@example.com | 📞 +1 234-567-8901 | 🔗 LinkedIn Profile

Professional Summary
Results-driven Data Scientist with 4+ years of experience in predictive modeling, machine learning, and data analysis. Passionate about leveraging data-driven insights to solve complex business problems. Skilled in Python, SQL, and cloud-based analytics platforms.

Work Experience
Data Scientist | ABC Tech Solutions (Jan 2021 – Present)
📍 New York, USA

Developed a customer churn prediction model, reducing churn by 20%.
Built automated dashboards in Power BI to visualize sales trends.
Optimized ML models, improving prediction accuracy by 15%.
Data Analyst | XYZ Corporation (Jun 2018 – Dec 2020)
📍 San Francisco, USA

Conducted exploratory data analysis (EDA) to improve business strategies.
Improved data processing efficiency, reducing runtime by 30%.
Created SQL queries and reports to analyze customer behavior.
Education
🎓 Bachelor’s in Computer Science | University of California, Berkeley (2014 – 2018)

Skills
✔ Programming: Python, R, SQL
✔ Machine Learning: Scikit-learn, TensorFlow
✔ Data Visualization: Matplotlib, Seaborn, Power BI
✔ Cloud Platforms: AWS, Google Cloud
✔ Tools: Git, Jupyter Notebook

"""

# Classify the sample resume and print the predicted role.
predicted_category = predict_category(resume_file)
print("Predicted Category:", predicted_category)

Predicted Category: Data Analyst


In [22]:
import pickle
# Persist the fitted classifier and vectorizer. The `with` blocks make sure
# each file is flushed and closed as soon as its dump finishes (a bare
# open(...) handle is never closed explicitly). The paths are raw strings so
# the backslash in 'E:\R...' is taken literally; their values are unchanged.
with open(r'E:\ResumePrasingSavedModels/rf_classifier_jobRecommendation.pkl', 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)
with open(r'E:\ResumePrasingSavedModels/tfidf_vectorizer_jobRecommendation.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)