# Resume Classification using GPT-3
## Part 1: Data Preprocessing

### Team 12

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Name and full path of the raw resume dataset under /content/.
file_name = 'UpdatedResumeDataSet'
data_path = f'/content/{file_name}.csv'

In [None]:
# Read the raw CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(data_path)
df.head(n=5)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


## Removing Stopwords and Lemmatizing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK data packages needed by the cells below: the stop-word lists
# read via stopwords.words(), and the WordNet data ('wordnet' plus 'omw-1.4')
# that is presumably required by WordNetLemmatizer — confirm against the
# installed NLTK version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Stop words are kept in a set because clean() tests every token of every
# resume for membership: a set lookup is O(1), whereas the list returned by
# stopwords.words() would be scanned linearly on each test.
sw = set(stopwords.words('english'))
# Single lemmatizer instance reused by clean() for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def clean(text):
    """Normalize one resume string into space-separated, lemmatized tokens.

    Processing order:
      1. lowercase the text;
      2. delete URLs (any run of non-whitespace starting with ``http``);
      3. collapse every run of characters other than letters and ``?.!,¿``
         into a single space;
      4. delete the characters listed in ``punctuations`` (after step 3 only
         ``!``, ``?`` and ``.`` can still be present; ``,`` and ``¿`` are not
         in that list and are therefore kept);
      5. drop tokens found in the module-level stop-word collection ``sw``;
      6. lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: Raw resume text (``str``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    # URLs have to be removed first: the character filter below turns ':' and
    # '/' into spaces, after which "http\S+" can no longer match a whole URL
    # and its pieces would survive as ordinary tokens.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    # One pass over the string instead of one replace() call per character.
    text = text.translate(str.maketrans('', '', punctuations))
    # The text is already lowercase, so tokens can be compared to sw directly.
    tokens = [word for word in text.split() if word not in sw]
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

In [None]:
# Run every resume through clean(), overwrite the column in place, and
# preview the first five rows of the result.
df['Resume'] = df['Resume'].apply(clean)
df.head(n=5)

Unnamed: 0,Category,Resume
0,Data Science,"skill programming language python pandas, nump..."
1,Data Science,education detail may may uit rgpv data scienti...
2,Data Science,"area interest deep learning, control system de..."
3,Data Science,skill r python sap hana tableau sap hana sql s...
4,Data Science,"education detail mca ymcaust, faridabad, harya..."


## Find the most frequent words, hand-pick the uninformative ones, and remove them.

The purpose is to reduce the size of the prompts that will be fed into GPT-3, which saves both time and money with little to no sacrifice in classification accuracy (it might actually increase accuracy by raising the signal-to-noise ratio; more experiments are needed to reach a conclusion).

In [None]:
from collections import Counter

In [None]:
# Tally every whitespace-separated token across all cleaned resumes and show
# the 20 most frequent ones.
Counter(token for resume in df["Resume"] for token in resume.split()).most_common(20)

[('project', 3990),
 ('exprience', 3829),
 ('company', 3571),
 ('month', 3344),
 ('detail', 3132),
 ('description', 3122),
 ('data', 2086),
 ('team', 2074),
 ('skill', 2017),
 ('system', 1848),
 (',', 1845),
 ('management', 1811),
 ('year', 1505),
 ('database', 1469),
 ('client', 1460),
 ('maharashtra', 1376),
 ('application', 1325),
 ('service', 1295),
 ('technology', 1290),
 ('testing', 1266)]

In [None]:
# Frequent tokens judged too generic to help tell resume categories apart.
uninformative_words = ['project',
                       'experience',
                       # The corpus spells this word "exprience"; that is the
                       # form that appears among the most frequent tokens, so
                       # it must be listed for the removal to actually hit it.
                       'exprience',
                       'company',
                       'detail',
                       'description',
                       'skill',
                       ',',
                       'year',
                       ]
#I PERSONALLY think that these words are not strong predictors for resume classification

# Alphanumeric entries are matched as whole words only, so longer words that
# merely contain one of them (e.g. "skilled", "yearly") are left intact
# instead of being cut down to fragments. Other entries such as ',' are
# matched literally wherever they occur.
removal_pattern = re.compile('|'.join(
    rf'\b{re.escape(word)}\b' if word.isalnum() else re.escape(word)
    for word in uninformative_words
))
# Each match becomes a space so neighbouring tokens never fuse together; the
# split/join pair then collapses the leftover gaps to single spaces, which
# keeps the resulting prompts as short as possible.
df['Resume'] = (df['Resume']
                .str.replace(removal_pattern, ' ', regex=True)
                .str.split()
                .str.join(' '))

In [None]:
# Recount the tokens after the removal step and show the 10 most frequent.
Counter(token for resume in df["Resume"] for token in resume.split()).most_common(10)

[('exprience', 3829),
 ('month', 3344),
 ('data', 2156),
 ('team', 2137),
 ('management', 2024),
 ('system', 1922),
 ('database', 1526),
 ('client', 1463),
 ('maharashtra', 1449),
 ('testing', 1349)]

Now looks better.

## Saving to csv

In [None]:
# Name and full path of the preprocessed dataset written by the next cell.
file_name = 'UpdatedResumeDataset_preprocessed'
data_path = f'/content/{file_name}.csv'

In [None]:
# Persist the cleaned data. index=True is the pandas default, spelled out
# here: the row index is written as the first, unnamed CSV column.
df.to_csv(data_path, index=True)

## End of Data Preprocessing