## Import the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load the dataset

In [None]:
# Load the resume dataset from the CSV file into a DataFrame.
df = pd.read_csv('UpdatedResumeDataSet.csv')

In [None]:
# Print a concise summary of the DataFrame.
df.info()

## Separate the input and output data

In [None]:
# Features are the 'Resume' column; targets are the 'Category' column.
X, y = df['Resume'], df['Category']

## Exploratory Data Analysis

In [None]:
# Count the distinct category labels.
len(set(y))

In [None]:
# Bar chart of the number of resumes per category (categories on the y-axis).
plt.figure(figsize=(16, 9))
plt.grid()
sns.countplot(y=y);

In [None]:
# Tabulate how many resumes belong to each category.
y.value_counts()

In [None]:
# Pie chart of each category's share, labelled with its percentage.
plt.figure(figsize=(8, 8))
plt.pie(
    y.value_counts(),
    labels=y.value_counts().index,
    autopct='%2.2f%%',
);

In [None]:
# Keep only the resumes whose category is 'Java Developer'.
java = X[y == 'Java Developer']

In [None]:
# Display the selected 'Java Developer' resumes.
java

In [None]:
from nltk.tokenize import word_tokenize
# Concatenate the selected resumes into one space-separated string.
java = ' '.join(java)

In [None]:
# Display the concatenated resume text.
java

In [None]:
# Split the concatenated text into tokens with NLTK's word_tokenize.
java = word_tokenize(java)

In [None]:
# Display the resulting tokens.
java

In [None]:
from wordcloud import WordCloud

In [None]:
# Build a word cloud (capped at 100 words) from the tokens re-joined with spaces.
wc = WordCloud(max_words=100).generate(' '.join(java))

In [None]:
# Render the word cloud as an image.
plt.imshow(wc)

In [None]:
# Scratch cell: prints a string containing a carriage-return escape.
# NOTE(review): appears unrelated to the resume pipeline - confirm it is still needed.
print("hello\rwor")

In [None]:
# Scratch cell: prints the ASCII bell escape character.
# NOTE(review): appears unrelated to the resume pipeline - confirm it is still needed.
print('\a')

# Data Cleaning

In [None]:
import re
import string

In [None]:
def cleanResume(resumeText):
    """Normalize raw resume text before vectorization.

    The steps run in order: URLs, the standalone tokens ``RT`` / ``cc``,
    hashtags, @-mentions, ASCII punctuation and non-ASCII characters are each
    replaced by a space, then every run of whitespace is collapsed into a
    single space.

    Args:
        resumeText: Raw resume text as a ``str``.

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space rather than stripped.
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries ensure only the standalone tokens are dropped; without
    # them "account" became "a ount" and "success" became "su ess".
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', ' ', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(string.punctuation), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse extra whitespace

    return resumeText

In [None]:
# Inspect one raw resume (entry 123) before cleaning.
X[123]

In [None]:
# Show the same resume after cleaning, for comparison with the raw text.
print(cleanResume(X[123]))

In [None]:
# Apply cleanResume to every resume.
cleaned = X.apply(cleanResume)

In [None]:
# Check the shape of the cleaned Series.
cleaned.shape

In [None]:
# Display the cleaned resumes.
cleaned

## TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer that filters scikit-learn's built-in English stop words.
tfv = TfidfVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary/IDF weights from all cleaned resumes and vectorize them.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split performed later, so test documents influence the IDF
# weights - consider fitting on the training split only.
X_new = tfv.fit_transform(cleaned)

In [None]:
# Display the vectorized feature matrix.
X_new

In [None]:
# List the terms that make up the learned vocabulary.
tfv.get_feature_names_out()

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test split; random_state=0 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    random_state=0,
)

In [None]:
# Compare the sizes of the training and test matrices.
(X_train.shape, X_test.shape)

## Build the Model

In [None]:
from sklearn.neighbors import NearestCentroid

In [None]:
# Nearest-centroid classifier with default settings.
nn = NearestCentroid()

In [None]:
# Fit the classifier on the training split.
nn.fit(X_train,y_train)

## Performance Evaluation

In [None]:
# Predict a category for every test resume and display the predictions.
y_pred = nn.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
# Fraction of test resumes whose predicted category matches the true one.
accuracy_score(y_test, y_pred)

In [None]:
# Per-category precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Read the sample resume to classify. The encoding is explicit so decoding
# does not depend on the platform's default locale; undecodable bytes become
# U+FFFD instead of raising (cleanResume later strips non-ASCII characters).
with open('Sample_resume.txt', 'r', encoding='utf-8', errors='replace') as file:
    data = file.read()

In [None]:
# Clean the sample resume with the same function used on the training data.
data = cleanResume(data)

In [None]:
#data = pd.Series(data)

In [None]:
# Vectorize with the already-fitted vectorizer; transform takes an iterable
# of documents, so the single resume is wrapped in a list.
data_new = tfv.transform([data])

In [None]:
y=nn.predict(data_new)

In [None]:
y[0]