# Importing the data

In [34]:
import pandas as pd

# Load the newly uploaded dataset to analyze its structure and identify key columns for stemming
file_path = 'Data.csv'
data = pd.read_csv(file_path)

df = pd.DataFrame(data)

# Display the first few rows and summary information to identify key columns for stemming
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Job Title                489 non-null    object 
 1   Applicant ID             500 non-null    int64  
 2   Job Experience Required  500 non-null    object 
 3   Key Skills               500 non-null    object 
 4   Role Category            463 non-null    object 
 5   Location                 489 non-null    object 
 6   Functional Area          489 non-null    object 
 7   Industry                 489 non-null    object 
 8   Role                     486 non-null    object 
 9   Longitude                473 non-null    float64
 10  Latitude                 473 non-null    float64
 11  sal                      500 non-null    int64  
dtypes: float64(2), int64(2), object(8)
memory usage: 47.0+ KB


(                                           Job Title  Applicant ID  \
 0                              Digital Media Planner             1   
 1                           Online Bidding Executive             2   
 2   Trainee Research/ Research Executive- Hi- Tec...             3   
 3                                  Technical Support             4   
 4                  Software Test Engineer -hyderabad             5   
 
   Job Experience Required                                         Key Skills  \
 0              5 - 10 yrs                      Media Planning| Digital Media   
 1               2 - 5 yrs   pre sales| closing| software knowledge| clien...   
 2               0 - 1 yrs   Computer science| Fabrication| Quality check|...   
 3               0 - 5 yrs                                  Technical Support   
 4               2 - 5 yrs   manual testing| test engineering| test cases|...   
 
                                 Role Category   Location  \
 0                     

# Stemming the Data

In [35]:
import nltk
nltk.download('punkt_tab')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

# Download required NLTK resources

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Function to apply stemming
def stem_text(text):
    if pd.isnull(text):  # Handle missing values
        return ""
    words = word_tokenize(text)  # Tokenize the text
    stemmed_words = [stemmer.stem(word) for word in words]  # Apply stemming
    return " ".join(stemmed_words)  # Rejoin words into a string

# Apply stemming to the relevant columns
for column in ['Job Title', 'Key Skills', 'Functional Area','Role Category' ]:
    df[f'{column} Stemmed'] = df[column].apply(stem_text)

# df = pd.DataFrame(data)
# Display the updated DataFrame
df[['Job Title', 'Job Title Stemmed', 'Key Skills', 'Key Skills Stemmed', 'Functional Area', 'Functional Area Stemmed','Role Category','Role Category Stemmed']].head()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Job Title,Job Title Stemmed,Key Skills,Key Skills Stemmed,Functional Area,Functional Area Stemmed,Role Category,Role Category Stemmed
0,Digital Media Planner,digit media planner,Media Planning| Digital Media,media planning| digit media,"Marketing , Advertising , MR , PR , Media Plan...","market , advertis , mr , pr , media plan",Advertising,advertis
1,Online Bidding Executive,onlin bid execut,pre sales| closing| software knowledge| clien...,pre sales| closing| softwar knowledge| clients...,"Sales , Retail , Business Development","sale , retail , busi develop",Retail Sales,retail sale
2,Trainee Research/ Research Executive- Hi- Tec...,traine research/ research executive- hi- tech ...,Computer science| Fabrication| Quality check|...,comput science| fabrication| qualiti check| in...,"Engineering Design , R&D","engin design , r & d",R&D,r & d
3,Technical Support,technic support,Technical Support,technic support,"IT Software - Application Programming , Mainte...","it softwar - applic program , mainten",Admin/Maintenance/Security/Datawarehousing,admin/maintenance/security/datawareh
4,Software Test Engineer -hyderabad,softwar test engin -hyderabad,manual testing| test engineering| test cases|...,manual testing| test engineering| test cases| ...,IT Software - QA & Testing,it softwar - qa & test,Programming & Design,program & design


# Normalising the Salary and Adjusting maximum and minimum Experience


In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['sal_normalized'] = scaler.fit_transform(data[['sal']])

df[['min_experience', 'max_experience']] = data['Job Experience Required'].str.extract(r'(\d+)\s*-\s*(\d+)').astype(float)


# Taking the preprocessed Column and droping the orignal columns

In [37]:
# Replace the original column with the preprocessed one
df['Job Title'] = df['Job Title Stemmed']
df['Functional Area'] = df['Functional Area Stemmed']
df['Key Skills'] = df['Key Skills Stemmed']
df['Role Category'] = df['Role Category Stemmed']

# Drop temporary columns (if needed)
df.drop(['Job Title Stemmed', 'Functional Area Stemmed','Key Skills Stemmed','Role Category Stemmed'], axis=1, inplace=True)

# 'Key Skills', 'Key Skills Stemmed', 'Functional Area', 'Functional Area Stemmed','Role Category','Role Category Stemmed

# IDF Vectorization

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example dataframe (replace with your actual dataframe)
# Ensure the relevant columns are clean and contain textual data
columns_to_vectorize = ['Job Title', 'Functional Area', 'Key Skills', 'Role Category']

# Initialize TfidfVectorizer
vectorizers = {col: TfidfVectorizer() for col in columns_to_vectorize}

# Transform each column
idf_vectors = {}
for col in columns_to_vectorize:
    idf_vectors[col] = vectorizers[col].fit_transform(df[col].fillna(""))

# If you want to concatenate all the vectors into one matrix
from scipy.sparse import hstack
combined_idf_vector = hstack([idf_vectors[col] for col in columns_to_vectorize])

# Example: Access individual column IDF vectors
print(idf_vectors['Job Title'])  # Sparse matrix for 'Job Title'
print(combined_idf_vector)       # Combined sparse matrix


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2168 stored elements and shape (500, 773)>
  Coords	Values
  (0, 226)	0.48872255321502256
  (0, 454)	0.5686893551232056
  (0, 542)	0.6616212537007423
  (1, 507)	0.6637634561919319
  (1, 111)	0.6637634561919319
  (1, 268)	0.34472619344674477
  (2, 716)	0.35310423818569253
  (2, 601)	0.7062084763713851
  (2, 269)	0.304201856999693
  (2, 335)	0.37650593692864315
  (2, 688)	0.26021441753503677
  (2, 513)	0.27309330170993407
  (3, 690)	0.7028234276755185
  (3, 676)	0.7113643437159577
  (4, 653)	0.49270205923616195
  (4, 698)	0.6014452417039887
  (4, 260)	0.3599143632378462
  (4, 352)	0.5157227483747172
  (5, 511)	0.3492318165385127
  (5, 288)	0.24808539183493325
  (5, 29)	0.558230078723352
  (5, 44)	0.558230078723352
  (5, 657)	0.439601109066343
  (6, 288)	0.23401776095917104
  (6, 613)	0.4938464800994421
  :	:
  (495, 690)	0.26662627904350605
  (495, 188)	0.2938381907675843
  (495, 248)	0.3682920618832831
  (495, 94)	0.317066814