# Data CLeaning

In [2]:
import pandas as pd

In [25]:
df = pd.read_csv("naukri_skills_jobs_saf.csv")  # or your resume dataset
df.shape
df.head()

Unnamed: 0,Skill,Title,Company,Experience,Salary,Location,JD URL,Full JD,Job ID,Required Skills,JD_Length
0,Python,Python Developer Data Scientist,Tiger Analytics,4-7 Yrs,Not disclosed,Hyderabad Chennai Bengaluru,/job-listings-python-developer-data-scientist-...,MLOps and CI CD Proficiency Experience with ML...,290525024892,"['data science', 'data structures', 'numpy', '...",53
1,Python,Python Software Developer,Iks Health,2-5 Yrs,Not disclosed,Remote,/job-listings-python-software-developer-iks-he...,"Deploy,monitor,and maintain services in cloud ...",310525014478,"['python', 'rest', 'django', 'microservices', ...",37
2,Python,Python Developer,Enterprise Minds,0-1 Yrs,Not disclosed,Bengaluru,/job-listings-python-developer-enterpriseminds...,"Bachelor s degree in Computer Science,Engineer...",300525931376,"['data manipulation', 'python development', 'n...",39
3,Python,Python Software Developer Data Engineer,Cgi,5-7 Yrs,Not disclosed,Hybrid Bengaluru,/job-listings-python-software-developer-data-e...,Preferred candidate profile .Overall 57 years ...,190525013270,"['rdbms', 'pandas', 'oracle', 'python', 'sql',...",50
4,Python,Software Developer C Python Linux,Ibm,3-6 Yrs,Not disclosed,Pune,/job-listings-software-developer-c-python-linu...,Bachelors Degree .Required educationPreferred ...,290525924852,"['gdb', 'database design', 'postgresql', 'debu...",35


In [26]:
df = df.drop(columns=["Unnamed: 0"], errors="ignore")  # Ignore if not present

In [27]:
df['Title'] = df['Title'].astype(str).str.strip().str.lower()
df['Company'] = df['Company'].astype(str).str.strip().str.lower()
df['Location'] = df['Location'].astype(str).str.strip().str.title()

In [28]:
df.head()

Unnamed: 0,Skill,Title,Company,Experience,Salary,Location,JD URL,Full JD,Job ID,Required Skills,JD_Length
0,Python,python developer data scientist,tiger analytics,4-7 Yrs,Not disclosed,Hyderabad Chennai Bengaluru,/job-listings-python-developer-data-scientist-...,MLOps and CI CD Proficiency Experience with ML...,290525024892,"['data science', 'data structures', 'numpy', '...",53
1,Python,python software developer,iks health,2-5 Yrs,Not disclosed,Remote,/job-listings-python-software-developer-iks-he...,"Deploy,monitor,and maintain services in cloud ...",310525014478,"['python', 'rest', 'django', 'microservices', ...",37
2,Python,python developer,enterprise minds,0-1 Yrs,Not disclosed,Bengaluru,/job-listings-python-developer-enterpriseminds...,"Bachelor s degree in Computer Science,Engineer...",300525931376,"['data manipulation', 'python development', 'n...",39
3,Python,python software developer data engineer,cgi,5-7 Yrs,Not disclosed,Hybrid Bengaluru,/job-listings-python-software-developer-data-e...,Preferred candidate profile .Overall 57 years ...,190525013270,"['rdbms', 'pandas', 'oracle', 'python', 'sql',...",50
4,Python,software developer c python linux,ibm,3-6 Yrs,Not disclosed,Pune,/job-listings-software-developer-c-python-linu...,Bachelors Degree .Required educationPreferred ...,290525924852,"['gdb', 'database design', 'postgresql', 'debu...",35


In [29]:
df.isnull().sum().sort_values(ascending=False)

JD URL             2
Title              0
Skill              0
Company            0
Experience         0
Salary             0
Location           0
Full JD            0
Job ID             0
Required Skills    0
JD_Length          0
dtype: int64

In [30]:
df['Experience'] = df['Experience'].fillna("Not specified")
df['Salary'] = df['Salary'].fillna("Not disclosed")
df['Full JD'] = df['Full JD'].fillna("")

In [31]:
df.isnull().sum().sort_values(ascending=False)

JD URL             2
Title              0
Skill              0
Company            0
Experience         0
Salary             0
Location           0
Full JD            0
Job ID             0
Required Skills    0
JD_Length          0
dtype: int64

In [32]:
def parse_skills(x):
    if pd.isna(x) or not isinstance(x, str):
        return []
    return [s.strip().lower() for s in x.split(",") if s.strip()]

df['Required Skills'] = df['Required Skills'].apply(parse_skills)

In [33]:
df.head()

Unnamed: 0,Skill,Title,Company,Experience,Salary,Location,JD URL,Full JD,Job ID,Required Skills,JD_Length
0,Python,python developer data scientist,tiger analytics,4-7 Yrs,Not disclosed,Hyderabad Chennai Bengaluru,/job-listings-python-developer-data-scientist-...,MLOps and CI CD Proficiency Experience with ML...,290525024892,"[['data science', 'data structures', 'numpy', ...",53
1,Python,python software developer,iks health,2-5 Yrs,Not disclosed,Remote,/job-listings-python-software-developer-iks-he...,"Deploy,monitor,and maintain services in cloud ...",310525014478,"[['python', 'rest', 'django', 'microservices',...",37
2,Python,python developer,enterprise minds,0-1 Yrs,Not disclosed,Bengaluru,/job-listings-python-developer-enterpriseminds...,"Bachelor s degree in Computer Science,Engineer...",300525931376,"[['data manipulation', 'python development', '...",39
3,Python,python software developer data engineer,cgi,5-7 Yrs,Not disclosed,Hybrid Bengaluru,/job-listings-python-software-developer-data-e...,Preferred candidate profile .Overall 57 years ...,190525013270,"[['rdbms', 'pandas', 'oracle', 'python', 'sql'...",50
4,Python,software developer c python linux,ibm,3-6 Yrs,Not disclosed,Pune,/job-listings-software-developer-c-python-linu...,Bachelors Degree .Required educationPreferred ...,290525924852,"[['gdb', 'database design', 'postgresql', 'deb...",35


In [34]:
df['JD_Length'] = df['Full JD'].astype(str).apply(lambda x: len(x.split()))
df = df[df['JD_Length'] > 30]  # Keep only meaningful JDs

In [35]:
print(df.shape)

(12476, 11)


In [36]:
import re

def clean_jd_text(text):
    if pd.isna(text):
        return ""
    text = str(text)

    # Remove asterisks, bullets, tabs
    text = text.replace('*', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('•', ' ')
    
    # Replace multiple newlines with one
    text = re.sub(r'\n+', '\n', text)

    # Replace multiple spaces with one
    text = re.sub(r' +', ' ', text)

    # Remove special characters except . , ! ? and newlines
    text = re.sub(r'[^\w\s.,!?]', '', text)

    # Strip leading/trailing whitespace
    return text.strip()


In [16]:
df['Full JD'] = df['Full JD'].apply(clean_jd_text)

In [22]:
print(df['Full JD'].iloc[1])

Deploy,monitor,and maintain services in cloud environments preferably AWS,Azure,or GCPPreferred candidate profile . 25 years of backend development experience using PythonProficient in working with SQL and NoSQL databases e.g.,PostgreSQL,MySQL,MongoDB,RedisGood working knowledge of React for UI collaboration or development


In [23]:
for col in ['Title', 'Company', 'Location']:
    df[col] = df[col].astype(str).str.strip().str.replace(r'[^\w\s]', '', regex=True).str.title()


In [24]:
df.to_csv("cleaned_job_data.csv", index=False)