In [1]:
import pandas as pd
import re


# Raw LinkedIn listings; later statements in this cell rewrite its
# company_name and location columns.
df = pd.read_csv('linkedin_job_listings_enhanced.csv')


# Lookup table: cleaned company name -> headquarters city used as the
# standardized location. Written as (company, headquarters) pairs.
company_headquarters = dict([
    ('Chargebee', 'Chennai, India'),
    ('Plum Insurance', 'Bangalore, India'),
    ('Yellow.ai', 'San Mateo, USA'),
    ('Graphy', 'Bangalore, India'),
    ('Kala.ai', 'Nicosia, Cyprus'),
    ('Razorpay', 'Bangalore, India'),
    ('BrowserStack', 'Mumbai, India'),
    ('Meesho', 'Bangalore, India'),
])


def clean_company_name(name):
    """Strip a "hiring ..." tail and surrounding whitespace from a company name.

    Everything from the first occurrence of ``hiring`` (plus any whitespace
    directly before it) to the end of the string is removed, then the result
    is stripped.

    Parameters
    ----------
    name : str or missing value
        Raw company name cell.

    Returns
    -------
    The cleaned name for string input. Non-string input (e.g. the NaN that
    pandas uses for an empty CSV cell, or None) is returned unchanged instead
    of raising ``TypeError`` inside ``re.sub``.
    """
    # Missing cells arrive as float NaN / None; re.sub only accepts strings.
    if not isinstance(name, str):
        return name
    return re.sub(r'\s*hiring.*', '', name).strip()

def get_headquarters(company_name):
    """Return the headquarters recorded for a company, or 'Unknown'.

    The name is passed through ``clean_company_name`` first, then looked up
    in the ``company_headquarters`` table.
    """
    lookup_key = clean_company_name(company_name)
    if lookup_key in company_headquarters:
        return company_headquarters[lookup_key]
    return 'Unknown'


# Clean the names once, store them, then derive the location column from
# the cleaned values.
cleaned_names = df['company_name'].apply(clean_company_name)
df['company_name'] = cleaned_names
df['location'] = cleaned_names.apply(get_headquarters)


# Persist the frame with the rewritten company_name / location columns.
df.to_csv('location changed datas.csv', index=False)

print(f"Saved data with standardized company locations. {len(company_headquarters)} companies mapped.")

Saved data with standardized company locations. 8 companies mapped.


In [2]:
import pandas as pd


df1 = pd.read_csv('linkedin_job_listings_cleaned.csv')
df2 = pd.read_csv('location changed datas.csv')

# Column layout of the merged output; 'location' (last) comes from df2,
# everything before it comes from df1.
final_columns = [
    'company_name', 'department', 'seniority', 'job_url',
    'growth_signal_tag', 'processed_timestamp',
    'growth_tag_remote_hiring_focus', 'growth_tag_product_team_buildout',
    'job_title_clean', 'location'
]
listing_columns = final_columns[:-1]

# Inner join: keep only listings whose job_url appears in both files.
merged_df = df1[listing_columns].merge(
    df2[['job_url', 'location']],
    on='job_url',
    how='inner',
)

print(f"Duplicate job URLs in merged data: {merged_df['job_url'].duplicated().sum()}")

merged_df = merged_df.loc[:, final_columns]

merged_df.to_csv('final_merged_dataset.csv', index=False)

print("Merging completed successfully!")
print(f"Final dataset shape: {merged_df.shape}")

Duplicate job URLs in merged data: 0
Merging completed successfully!
Final dataset shape: (320, 10)


In [3]:
import pandas as pd
import re


# No column re-selection here: `merged_df` is already restricted to
# `final_columns` where it is built, and this cell works only from
# 'final_merged_dataset.csv' (read below), so it no longer depends on
# variables left over from an earlier cell.


def clean_job_title(title):
    """Normalize a job title string.

    Steps, in order:
    1. Drop a trailing location/company phrase introduced by a standalone
       preposition (in, at, for, on, from, by, near), including a trailing
       "..." truncation marker.
    2. Remove parenthetical text, then replace every character that is not a
       word character, whitespace or hyphen with a space.
    3. Remove a leading "hiring" / "looking for" / "seeking" / "wanted" and
       a trailing "job" / "position" / "role" / "opening".
    4. Drop a trailing "at|from|for <words>" phrase that only became visible
       after punctuation was replaced.
    5. Collapse whitespace, strip, and title-case the result.

    Prepositions and suffix words are matched only as whole words. Without
    that, letters inside a word were treated as prepositions and the title
    was truncated (e.g. "Solution Architect" -> "Soluti",
    "Admin Assistant" -> "Adm", "Value Proposition" -> "Value Pro").

    Parameters
    ----------
    title : str or missing value
        Raw job title.

    Returns
    -------
    The cleaned, title-cased string; missing values (NaN/None) are returned
    unchanged.
    """
    if pd.isna(title):
        return title

    # The preposition must be preceded AND followed by whitespace, so it is a
    # separate word and never the first word of the title.
    title = re.sub(r'\s+(?:in|at|for|on|from|by|near)\s+.*$', '', title, flags=re.IGNORECASE)

    title = re.sub(r'\(.*?\)', '', title)
    title = re.sub(r'[^\w\s-]', ' ', title)

    title = re.sub(r'^\s*(?:hiring|looking for|seeking|wanted)\s+', '', title, flags=re.IGNORECASE)
    # \b keeps words that merely end in one of these suffixes intact.
    title = re.sub(r'\s*\b(?:job|position|role|opening)\s*$', '', title, flags=re.IGNORECASE)

    # Second pass: punctuation replaced above may have exposed a new
    # "<preposition> <words>" tail.
    title = re.sub(r'\s+(?:at|from|for)\s+[\w\s]+$', '', title, flags=re.IGNORECASE)

    title = re.sub(r'\s+', ' ', title).strip()
    return title.title()


df = pd.read_csv('final_merged_dataset.csv')

# Keep the original title column and add the cleaned one next to it.
df['clean_title'] = df['job_title_clean'].apply(clean_job_title)

# Print the first ten titles before and after cleaning for a quick check.
print("Before cleaning vs After cleaning:")
preview = df[['job_title_clean', 'clean_title']].head(10)
for orig, clean in preview.itertuples(index=False, name=None):
    print(f"Original: {orig}\nCleaned: {clean}\n")

df.to_csv('final merged and clean.csv', index=False)

Before cleaning vs After cleaning:
Original: Product Marketing Manager In Salt Lake ...
Cleaned: Product Marketing Manager

Original: Staff Product Manager In Chennai, Tamil ...
Cleaned: Staff Product Manager

Original: Large Enterprise Program Manager
Cleaned: Large Enterprise Program Manager

Original: Product Marketing Manager -Saas
Cleaned: Product Marketing Manager -Saas

Original: Product Marketing Manager
Cleaned: Product Marketing Manager

Original: Business Development Representative
Cleaned: Business Development Representative

Original: Business Development Representative
Cleaned: Business Development Representative

Original: Business Development Representative Emea
Cleaned: Business Development Representative Emea

Original: Enterprise Business Development Representative
Cleaned: Enterprise Business Development Representative

Original: Enterprise Account Executive In San Francisco ...
Cleaned: Enterprise Account Executive



In [4]:
# Output column layout: the cleaned title, renamed to Job_Title, sits
# right after the company name.
desired_order = [
    'company_name',
    'Job_Title',
    'department',
    'location',
    'seniority',
    'job_url',
    'growth_signal_tag',
    'processed_timestamp',
    'growth_tag_remote_hiring_focus',
    'growth_tag_product_team_buildout'
]

# Drop the uncleaned title, rename the cleaned one, then reorder the columns.
df = (
    pd.read_csv('final merged and clean.csv')
    .drop(columns='job_title_clean')
    .rename(columns={'clean_title': 'Job_Title'})
    .loc[:, desired_order]
)

df.to_csv('cleaned datas.csv', index=False)

In [6]:
import pandas as pd


df1 = pd.read_csv('cleaned datas.csv')
df2 = pd.read_csv('extra cleaned columns.csv')

# Columns to copy over from the second file, matched on job_url.
growth_tags = [
    'New Product Initiative',
    'Engineering Buildout',
    'Design Overhaul',
    'Go-To-Market Expansion',
    'General Hiring'
]

# Merge only the tags that exist in df2; warn about the rest.
missing_tags = [t for t in growth_tags if t not in df2.columns]
if missing_tags:
    print(f"Warning: These growth tags not found in second dataset: {missing_tags}")
    growth_tags = [t for t in growth_tags if t not in missing_tags]

# Left join keeps every row of df1 even when df2 has no match.
merged_df = df1.merge(
    df2[['job_url', *growth_tags]],
    on='job_url',
    how='left',
)

# Keep only the first row for any job_url that appears more than once.
has_duplicate_urls = merged_df['job_url'].duplicated().any()
if has_duplicate_urls:
    print("Warning: Duplicate job_url values detected after merge")
    merged_df = merged_df.drop_duplicates(subset=['job_url'])


merged_df.to_csv('Final_Cleaned_Datasets.csv', index=False)

print("Merge completed successfully!")
print(f"Added columns: {growth_tags}")
print(f"New dataset shape: {merged_df.shape}")

Merge completed successfully!
Added columns: ['New Product Initiative', 'Engineering Buildout', 'Design Overhaul', 'Go-To-Market Expansion', 'General Hiring']
New dataset shape: (320, 15)
