# Importing the data

In [1]:
import pandas as pd

# Load the newly uploaded dataset to analyze its structure and identify key columns for stemming
file_path = 'Data.csv'
data = pd.read_csv(file_path)

# Display the first few rows and summary information to identify key columns for stemming
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Job Title                489 non-null    object 
 1   Applicant ID             500 non-null    int64  
 2   Job Experience Required  500 non-null    object 
 3   Key Skills               500 non-null    object 
 4   Role Category            463 non-null    object 
 5   Location                 489 non-null    object 
 6   Functional Area          489 non-null    object 
 7   Industry                 489 non-null    object 
 8   Role                     486 non-null    object 
 9   Longitude                473 non-null    float64
 10  Latitude                 473 non-null    float64
 11  sal                      500 non-null    int64  
dtypes: float64(2), int64(2), object(8)
memory usage: 47.0+ KB


(                                           Job Title  Applicant ID  \
 0                              Digital Media Planner             1   
 1                           Online Bidding Executive             2   
 2   Trainee Research/ Research Executive- Hi- Tec...             3   
 3                                  Technical Support             4   
 4                  Software Test Engineer -hyderabad             5   
 
   Job Experience Required                                         Key Skills  \
 0              5 - 10 yrs                      Media Planning| Digital Media   
 1               2 - 5 yrs   pre sales| closing| software knowledge| clien...   
 2               0 - 1 yrs   Computer science| Fabrication| Quality check|...   
 3               0 - 5 yrs                                  Technical Support   
 4               2 - 5 yrs   manual testing| test engineering| test cases|...   
 
                                 Role Category   Location  \
 0                     

# Stemming the Data

In [4]:
import nltk
nltk.download('punkt_tab')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

# Download required NLTK resources

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Function to apply stemming
def stem_text(text):
    if pd.isnull(text):  # Handle missing values
        return ""
    words = word_tokenize(text)  # Tokenize the text
    stemmed_words = [stemmer.stem(word) for word in words]  # Apply stemming
    return " ".join(stemmed_words)  # Rejoin words into a string

# Apply stemming to the relevant columns
for column in ['Job Title', 'Key Skills', 'Functional Area','Role Category' ]:
    data[f'{column} Stemmed'] = data[column].apply(stem_text)

# Display the updated DataFrame
data[['Job Title', 'Job Title Stemmed', 'Key Skills', 'Key Skills Stemmed', 'Functional Area', 'Functional Area Stemmed','Role Category','Role Category Stemmed']].head()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Job Title,Job Title Stemmed,Key Skills,Key Skills Stemmed,Functional Area,Functional Area Stemmed,Role Category,Role Category Stemmed
0,Digital Media Planner,digit media planner,Media Planning| Digital Media,media planning| digit media,"Marketing , Advertising , MR , PR , Media Plan...","market , advertis , mr , pr , media plan",Advertising,advertis
1,Online Bidding Executive,onlin bid execut,pre sales| closing| software knowledge| clien...,pre sales| closing| softwar knowledge| clients...,"Sales , Retail , Business Development","sale , retail , busi develop",Retail Sales,retail sale
2,Trainee Research/ Research Executive- Hi- Tec...,traine research/ research executive- hi- tech ...,Computer science| Fabrication| Quality check|...,comput science| fabrication| qualiti check| in...,"Engineering Design , R&D","engin design , r & d",R&D,r & d
3,Technical Support,technic support,Technical Support,technic support,"IT Software - Application Programming , Mainte...","it softwar - applic program , mainten",Admin/Maintenance/Security/Datawarehousing,admin/maintenance/security/datawareh
4,Software Test Engineer -hyderabad,softwar test engin -hyderabad,manual testing| test engineering| test cases|...,manual testing| test engineering| test cases| ...,IT Software - QA & Testing,it softwar - qa & test,Programming & Design,program & design


# Normalising the Salary and Adjusting maximum and minimum Experience


In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
data['sal_normalized'] = scaler.fit_transform(data[['sal']])

data[['min_experience', 'max_experience']] = data['Job Experience Required'].str.extract(r'(\d+)\s*-\s*(\d+)').astype(float)
