In [2]:
# Install imbalanced-learn if not already installed
!pip install imbalanced-learn --quiet

# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# Import imblearn for handling class imbalance
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# For saving pipeline objects
import joblib

print("All libraries imported successfully!")



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


All libraries imported successfully!


In [5]:
# Import necessary libraries
import pandas as pd
import zipfile
import os

# Define paths.
# The defaults are the original machine-specific locations; set the RESUME_ZIP_PATH /
# RESUME_EXTRACT_PATH environment variables to run the notebook on another machine
# without editing this cell.
zip_path = os.environ.get(
    "RESUME_ZIP_PATH",
    r"C:\Users\deepu\OneDrive\Desktop\Infosys Internship\archive (1).zip",
)
extract_path = os.environ.get(
    "RESUME_EXTRACT_PATH",
    r"C:\Users\deepu\OneDrive\Desktop\Infosys Internship\extracted",
)

# Create extraction folder if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Extract all files from the ZIP and remember exactly which members it contained.
# Using the archive's own member list (instead of listing the folder) means files left
# over from an earlier run are never mistaken for the dataset, and CSVs stored inside
# sub-folders of the archive are still found.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
    extracted_files = zip_ref.namelist()
print("Extracted files:", extracted_files)

# Pick the CSV to load. Sorting makes the choice deterministic if the archive ever
# holds more than one CSV; an empty result gets an explicit error instead of IndexError.
csv_candidates = sorted(f for f in extracted_files if f.lower().endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError(f"No CSV file found in archive: {zip_path}")
csv_file = csv_candidates[0]
csv_path = os.path.join(extract_path, csv_file)

df = pd.read_csv(csv_path)

# Display first 5 rows
df.head()


Extracted files: ['AI_Resume_Screening.csv']


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [7]:
# Choose the model inputs and the label column
features = ['Skills', 'Education', 'Certifications', 'Experience (Years)']
target = 'Job Role'

# Build X in a single expression: `assign` returns a new frame, so `df` is left untouched.
# Missing certifications are replaced by the literal string 'None'.
X = df[features].assign(
    Certifications=lambda frame: frame['Certifications'].fillna('None')
)
y = df[target].copy()

# Sanity check: report how many missing values remain in each feature column
print("Missing values per column:")
print(X.isna().sum())

# Preview the first 5 rows of the feature frame
X.head()


Missing values per column:
Skills                0
Education             0
Certifications        0
Experience (Years)    0
dtype: int64


Unnamed: 0,Skills,Education,Certifications,Experience (Years)
0,"TensorFlow, NLP, Pytorch",B.Sc,,10
1,"Deep Learning, Machine Learning, Python, SQL",MBA,Google ML,10
2,"Ethical Hacking, Cybersecurity, Linux",MBA,Deep Learning Specialization,1
3,"Python, Pytorch, TensorFlow",B.Tech,AWS Certified,7
4,"SQL, React, Java",PhD,,4


In [9]:
# Column groups, by how each one is encoded
categorical_cols = ['Education', 'Certifications']
text_cols = ['Skills']
numeric_cols = ['Experience (Years)']  # not listed in a transformer; kept via remainder='passthrough'

# Individual encoders
one_hot = OneHotEncoder(handle_unknown='ignore')  # unseen categories encode as all zeros
skills_tfidf = TfidfVectorizer(max_features=100)
# NOTE(review): with default tokenisation, multi-word skills such as "Deep Learning"
# are presumably split into separate word tokens — confirm this is intended.

# The TF-IDF vectoriser needs a 1-D column, so its selector is a single column name
# (a string) rather than a list.
encoding_steps = [
    ('cat', one_hot, categorical_cols),
    ('tfidf', skills_tfidf, text_cols[0]),
]
preprocessor = ColumnTransformer(
    transformers=encoding_steps,
    remainder='passthrough',  # columns not named above are appended unchanged
)

# Oversampler used to handle class imbalance in the training data
smote = SMOTE(random_state=42)

# Full pipeline: encode first, then oversample
pipeline = ImbPipeline(steps=[('preprocessor', preprocessor), ('smote', smote)])

print("Preprocessing pipeline built successfully!")


Preprocessing pipeline built successfully!


In [10]:
# Split dataset into training and testing sets (80% / 20%).
# `stratify=y` keeps the class proportions of the target the same in both splits, so the
# test set stays representative and every class is present in the training data that
# SMOTE later resamples. `random_state` fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)


Training shape: (800, 4)
Testing shape: (200, 4)


In [11]:
# Fit the pipeline on the training split and oversample it with SMOTE in one call
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

# The test split is only encoded by the already-fitted preprocessor step;
# SMOTE is deliberately NOT applied to test data.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_test_transformed = fitted_preprocessor.transform(X_test)

# Report the resulting matrix shapes
for label, matrix in (
    ("Transformed training shape:", X_train_resampled),
    ("Transformed testing shape:", X_test_transformed),
):
    print(label, matrix.shape)


Transformed training shape: (840, 25)
Transformed testing shape: (200, 25)


In [12]:
# Persist the pipeline object for future use.
# The path is relative, so the file is written to the current working directory.
pipeline_file = 'milestone2_pipeline.pkl'
joblib.dump(pipeline, pipeline_file)
print("Milestone 2 pipeline saved successfully!")


Milestone 2 pipeline saved successfully!
