In [None]:
from google.colab import drive
import pandas as pd

# Step 1: Attach Google Drive at /content/drive
drive.mount('/content/drive')

# Step 2: Location of the dataset (root of My Drive)
file_path = '/content/drive/MyDrive/job_descriptions.csv'

# Step 3: Load the entire CSV into `df` (not just a sample of rows),
# then list its column names
df = pd.read_csv(file_path)
print("Column names:", df.columns.tolist(), sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Column names:
['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location', 'Country', 'latitude', 'longitude', 'Work Type', 'Company Size', 'Job Posting Date', 'Preference', 'Contact Person', 'Contact', 'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company', 'Company Profile']


In [None]:
# Show the first rows as plain text, preceded by a blank line and a heading.
print("\nPreview of the data:", df.head(), sep="\n")



Preview of the data:
             Job Id     Experience Qualifications Salary Range    location  \
0  1089843540111562  5 to 15 Years         M.Tech    $59K-$99K     Douglas   
1   398454096642776  2 to 12 Years            BCA   $56K-$116K    Ashgabat   
2   481640072963533  0 to 12 Years            PhD   $61K-$104K       Macao   
3   688192671473044  4 to 11 Years            PhD    $65K-$91K  Porto-Novo   
4   117057806156508  1 to 12 Years            MBA    $64K-$87K    Santiago   

            Country  latitude  longitude  Work Type  Company Size  ...  \
0       Isle of Man   54.2361    -4.5481     Intern         26801  ...   
1      Turkmenistan   38.9697    59.5563     Intern        100340  ...   
2  Macao SAR, China   22.1987   113.5439  Temporary         84525  ...   
3             Benin    9.3077     2.3158  Full-Time        129896  ...   
4             Chile  -35.6751   -71.5429     Intern         53944  ...   

                 Contact                     Job Title  \
0   00

In [None]:
# Normalise the headers one step at a time:
# 1) trim surrounding whitespace and lower-case
df.columns = df.columns.str.strip().str.lower()
# 2) turn each space into an underscore
df.columns = df.columns.str.replace(' ', '_')
# 3) drop every character that is neither a word character nor an underscore
df.columns = df.columns.str.replace(r'[^\w_]', '', regex=True)

print("Fully cleaned column names:", df.columns.tolist(), sep="\n")


Fully cleaned column names:
['job_id', 'experience', 'qualifications', 'salary_range', 'location', 'country', 'latitude', 'longitude', 'work_type', 'company_size', 'job_posting_date', 'preference', 'contact_person', 'contact', 'job_title', 'role', 'job_portal', 'job_description', 'benefits', 'skills', 'responsibilities', 'company', 'company_profile']


In [None]:
# Per-column count of missing values (isna is the same check as isnull).
print(df.isna().sum())


job_id                 0
experience             0
qualifications         0
salary_range           0
location               0
country                0
latitude               0
longitude              0
work_type              0
company_size           0
job_posting_date       0
preference             0
contact_person         0
contact                0
job_title              0
role                   0
job_portal             0
job_description        0
benefits               0
skills                 0
responsibilities       0
company                0
company_profile     5478
dtype: int64


In [None]:
# Remove every row that has a missing value in any column, modifying df itself.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-run the per-column missing-value count on the current state of df.
print(df.isna().sum())

job_id              0
experience          0
qualifications      0
salary_range        0
location            0
country             0
latitude            0
longitude           0
work_type           0
company_size        0
job_posting_date    0
preference          0
contact_person      0
contact             0
job_title           0
role                0
job_portal          0
job_description     0
benefits            0
skills              0
responsibilities    0
company             0
company_profile     0
dtype: int64


In [None]:
# Drop fully identical rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Write the cleaned frame over the source CSV on Drive (no index column).
# NOTE(review): this replaces the raw input, so a later re-run of the notebook
# will load the already-cleaned file — consider keeping a copy of the original.
df.to_csv(path_or_buf='/content/drive/MyDrive/job_descriptions.csv', index=False)

print("Original file in Google Drive has been overwritten with the cleaned version.")


Original file in Google Drive has been overwritten with the cleaned version.


In [None]:
# Reload the CSV from Drive and show its first five rows as a sanity check.
df_check = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/job_descriptions.csv')
print(df_check.head(n=5))


             job_id     experience qualifications salary_range    location  \
0  1089843540111562  5 to 15 Years         M.Tech    $59K-$99K     Douglas   
1   398454096642776  2 to 12 Years            BCA   $56K-$116K    Ashgabat   
2   481640072963533  0 to 12 Years            PhD   $61K-$104K       Macao   
3   688192671473044  4 to 11 Years            PhD    $65K-$91K  Porto-Novo   
4   117057806156508  1 to 12 Years            MBA    $64K-$87K    Santiago   

            country  latitude  longitude  work_type  company_size  ...  \
0       Isle of Man   54.2361    -4.5481     Intern         26801  ...   
1      Turkmenistan   38.9697    59.5563     Intern        100340  ...   
2  Macao SAR, China   22.1987   113.5439  Temporary         84525  ...   
3             Benin    9.3077     2.3158  Full-Time        129896  ...   
4             Chile  -35.6751   -71.5429     Intern         53944  ...   

                 contact                     job_title  \
0   001-381-930-7517x737  Di

In [None]:
# Save the cleaned frame as a CSV in the current working directory (no index
# column). The zip cell below writes this same file again before compressing.
df.to_csv(path_or_buf='job_descriptions_cleaned.csv', index=False)


In [None]:
import zipfile

# Names of the CSV export and of the archive that will contain it
csv_filename = 'job_descriptions_cleaned.csv'
zip_filename = 'job_descriptions_cleaned.zip'

# Write the CSV to disk first so there is a file to add to the archive
df.to_csv(path_or_buf=csv_filename, index=False)

# Create a new deflate-compressed ZIP holding just that CSV
with zipfile.ZipFile(file=zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(filename=csv_filename)

print("✅ File compressed.")


✅ File compressed.


In [None]:
from google.colab import files
# Pass the ZIP archive to Colab's files.download helper.
# NOTE(review): presumably this starts a browser download of the file — confirm.
files.download('job_descriptions_cleaned.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>