In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
from google.colab import files
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

# Function to clean resumes
def cleanResume(txt):
    """Strip web/social-media noise and punctuation from raw resume text.

    Steps, in order: remove URLs, the standalone tokens ``RT`` and ``cc``,
    hashtags and @-mentions; replace ASCII punctuation and non-ASCII
    characters with spaces; collapse every whitespace run to one space.

    Args:
        txt (str): Raw resume text.

    Returns:
        str: The cleaned text. A single leading and/or trailing space may
        remain; the result is not stripped.
    """
    # URLs: no trailing whitespace is required, so a link at the very end of
    # the text is removed as well.
    cleanText = re.sub(r'https?://\S+', ' ', txt)
    # Word boundaries keep the "cc" inside ordinary words such as "account",
    # "success" or "accuracy" from being cut out.
    cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
    # Hashtags, including one that ends the text.
    cleanText = re.sub(r'#\S+', ' ', cleanText)
    # @-mentions; this also drops the domain part of e-mail addresses.
    cleanText = re.sub(r'@\S+', '  ', cleanText)
    # All ASCII punctuation characters become spaces.
    cleanText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    # Anything outside the 7-bit ASCII range becomes a space.
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
    # Collapse runs of whitespace (including newlines) into a single space.
    cleanText = re.sub(r'\s+', ' ', cleanText)
    return cleanText

In [None]:
# Upload the dataset through the Colab file picker
uploaded = files.upload()

# A cancelled dialog leaves nothing to read; fail with a clear message instead
# of the bare StopIteration that next() would raise on an empty result.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the resume CSV.")

# Load the first uploaded file into a Pandas DataFrame (the first key of
# `uploaded` is used as the path to read).
df = pd.read_csv(next(iter(uploaded)))

# Preview the first rows; as the cell's last expression the frame is rendered
# with the notebook's rich table display rather than plain text.
df.head()


In [None]:
# Plot the category distribution
counts = df['Category'].value_counts()
# Take the labels from the counts themselves so every wedge is named after the
# category it measures: value_counts() is ordered by frequency, whereas
# unique() is ordered by first appearance, so pairing the two mislabels wedges.
labels = counts.index.to_numpy()
plt.figure(figsize=(15,10))
plt.pie(counts, labels=labels, autopct='%1.1f%%', shadow=True, colors=plt.cm.plasma(np.linspace(0,1,len(labels))))
plt.title('Resume category distribution')
plt.show()

In [None]:
# Run every raw resume through cleanResume and keep the result in a new
# column, leaving the original 'Resume' text untouched.
df['CleanedResume'] = df['Resume'].apply(func=cleanResume)

In [None]:
# One-hot encode the categories into a dense indicator matrix
# (one column per category, one row per resume).
try:
    # Current scikit-learn spelling of the dense-output switch.
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword.
    onehot_encoder = OneHotEncoder(sparse=False)
categories_encoded = onehot_encoder.fit_transform(df[['Category']])

In [None]:
# Build the TF-IDF representation of the cleaned resumes: fit the vectorizer
# (English stop words dropped) on the corpus, then encode that same corpus.
tfidf_vectorizer = TfidfVectorizer(stop_words='english').fit(df['CleanedResume'])
requiredText = tfidf_vectorizer.transform(df['CleanedResume'])

In [None]:
# Hold out 20% of the vectorized resumes for evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    requiredText,
    categories_encoded,
    test_size=0.2,
    random_state=42,
)

# Fit one k-nearest-neighbours classifier per category column
clf = OneVsRestClassifier(estimator=KNeighborsClassifier())
clf.fit(X_train, y_train)

In [None]:
# Score the held-out resumes and report how many rows were predicted exactly
ypred = clf.predict(X=X_test)
print(f"Accuracy: {accuracy_score(y_true=y_test, y_pred=ypred)}")

In [None]:
# Save the model, vectorizer and encoder.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes instead of being left open.
with open('tfidf_vectorizer.pkl', 'wb') as fh:
    pickle.dump(tfidf_vectorizer, fh)
with open('model.pkl', 'wb') as fh:
    pickle.dump(clf, fh)
with open('onehot_encoder.pkl', 'wb') as fh:
    pickle.dump(onehot_encoder, fh)

# Load the trained classifier, vectorizer, and encoder.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote
# yourself (as here), never ones from an untrusted source.
with open('model.pkl', 'rb') as fh:
    model = pickle.load(fh)
with open('tfidf_vectorizer.pkl', 'rb') as fh:
    tfidf_vectorizer = pickle.load(fh)
with open('onehot_encoder.pkl', 'rb') as fh:
    onehot_encoder = pickle.load(fh)

In [None]:

# Example resume to predict its category.
# This is raw, uncleaned text (it still contains URLs, an e-mail address,
# punctuation and newlines); it is passed through cleanResume before being
# vectorized for prediction.
myresume = """NOOR SAEED

ABOUT ME
I am a data scientist specializing in machine learning, deep learning, and computer vision. With a strong background in mathematics, statistics, and programming, I am passionate about uncovering hidden patterns and insights in data. I have extensive experience in developing predictive models, implementing deep learning algorithms, and designing computer vision systems. My technical skills include proficiency in Python, Sklearn, TensorFlow, and PyTorch. What sets me apart is my ability to effectively communicate complex concepts to diverse audiences. I excel in translating technical insights into actionable recommendations that drive informed decision-making. If you're looking for a dedicated and versatile data scientist to collaborate on impactful projects, I am eager to contribute my expertise. Let's harness the power of data together to unlock new possibilities and shape a better future.

Contact & Sources

Email: 611noorsaeed@gmail.com
Phone: 03442826192
Github: https://github.com/611noorsaeed
LinkedIn: https://www.linkedin.com/in/noor-saeed654a23263/
Blogs: https://medium.com/@611noorsaeed
YouTube: Artificial Intelligence
WORK EXPERIENCE

Data Scientist
XYZ Tech Solutions (Jan 2022 - Present)

Developed and deployed machine learning models for predictive analytics, improving accuracy by 15%.
Implemented deep learning algorithms for image recognition, achieving a 92% success rate in classification tasks.
Designed and optimized recommendation systems, enhancing user engagement by 20%.
Conducted statistical analysis and data visualization to support business strategy and decision-making.
Junior Data Scientist
ABC Analytics (Jun 2020 - Dec 2021)

Assisted in developing machine learning models for various business use cases.
Participated in data preprocessing and feature engineering for large datasets.
Evaluated model performance and provided insights for improvement.
Collaborated with cross-functional teams to translate technical findings into business solutions.
Intern Data Analyst
Data Insight Co. (Jan 2019 - May 2020)

Analyzed data trends and patterns to provide actionable insights.
Developed data dashboards using Python and SQL to visualize key metrics.
Assisted in creating reports and presentations for stakeholders.
SKILLS

Machine Learning
Deep Learning
Computer Vision
Recommendation Systems
Data Visualization
Programming Languages (Python, SQL)
Data Preprocessing and Feature Engineering
Model Evaluation and Deployment
Statistical Analysis
Communication and Collaboration
LANGUAGES

English
Urdu
Hindi
"""




In [None]:
# Clean the input resume with the same routine applied to the training text
cleaned_resume = cleanResume(myresume)

# Transform the cleaned resume using the trained TfidfVectorizer
input_features = tfidf_vectorizer.transform([cleaned_resume])

# Hard prediction from the loaded classifier: one 0/1 flag per category.
# Kept for inspection only, because the model was fitted on one-hot indicator
# targets and this row can therefore be all zeros or contain several ones.
prediction = model.predict(input_features)

# Decode the category from the per-class scores instead of the hard flags:
# the highest-scoring column always names exactly one category, whereas
# inverse_transform cannot decode an all-zero row and silently takes the
# first flagged column of a multi-hot row.
class_scores = model.predict_proba(input_features)
best_class = int(np.argmax(class_scores[0]))
# The classifier's columns follow the encoder's category order.
prediction_id = onehot_encoder.categories_[0][best_class]

# Print the predicted category
print("Predicted Category:", prediction_id)