In [1]:
import pandas as pd
df = pd.read_csv('resume_dataset_large.csv')
# Shape: (5000, 5) - Company, Designation, Skills, Achievements, Status

In [2]:
# Remove whitespace and quotes
df.columns = df.columns.str.strip()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.strip().str.replace('"', '')

# Filter only accepted candidates
df = df[df['Status'].str.lower() == 'accepted']  # 5000 ‚Üí 3010 records

# Remove missing values
df = df.dropna()

In [3]:
X = df[['Company', 'Designation']].copy()
# Shape: (3010, 2)

In [4]:
# See shapes, types, values at every step
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")
df.info()

Shape: (3010, 5)
Columns: Index(['Company', 'Designation', 'Skills', 'Achievements', 'Status'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 3010 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company       3010 non-null   object
 1   Designation   3010 non-null   object
 2   Skills        3010 non-null   object
 3   Achievements  3010 non-null   object
 4   Status        3010 non-null   object
dtypes: object(5)
memory usage: 141.1+ KB


In [5]:
# Try different approaches quickly
# Approach 1
X = df[['Company', 'Designation']]

# Approach 2
X = df[['Company', 'Designation', 'Achievements']]

# Compare results without rerunning entire script

In [6]:
print(df.columns)


Index(['Company', 'Designation', 'Skills', 'Achievements', 'Status'], dtype='object')


In [7]:
import matplotlib.pyplot as plt

# See skill distribution
skill_counts.plot(kind='bar')
plt.show()  # Appears inline!

NameError: name 'skill_counts' is not defined

In [None]:
# ============================================================
# CELL 1: IMPORTS & SETUP
# ============================================================
"""
What am I doing: Importing libraries
Why: These are tools I need for data processing and ML
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import hamming_loss, accuracy_score
import pickle
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")

# ============================================================
# CELL 2: LOAD DATA
# ============================================================
"""
What: Load CSV file into pandas DataFrame
Why: Need data to train the model
"""

df = pd.read_csv('resume_dataset_large.csv')

# LEARNING TASK: Answer these questions by running code
print(f"How many rows? {len(df)}")
print(f"How many columns? {len(df.columns)}")
print(f"What are the columns? {list(df.columns)}")

# Look at first 5 rows
df.head()

# ============================================================
# CELL 3: EXPLORE THE DATA
# ============================================================
"""
LEARNING EXERCISE: Explore before processing
"""

# Question 1: How many unique companies?
print(f"Unique companies: {df['Company'].nunique()}")
print(df['Company'].value_counts())

# Question 2: How many unique designations?
print(f"Unique designations: {df['Designation'].nunique()}")
print(df['Designation'].value_counts())

# Question 3: What does the Status column look like?
print(df['Status'].value_counts())

# Question 4: Look at one row in detail
print("\nüìã Example Resume:")
print(f"Company: {df.loc[0, 'Company']}")
print(f"Designation: {df.loc[0, 'Designation']}")
print(f"Skills: {df.loc[0, 'Skills']}")
print(f"Status: {df.loc[0, 'Status']}")

# ============================================================
# CELL 4: DATA CLEANING
# ============================================================
"""
What: Clean the data
Why: Remove extra spaces, quotes, missing values
"""

# TASK 1: Clean column names
print("Before:", df.columns.tolist())
df.columns = df.columns.str.strip()
print("After:", df.columns.tolist())

# TASK 2: Clean text data
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.strip().str.replace('"', '')

# Check the result
print("\n‚úÖ Data cleaned!")
df.head()

# ============================================================
# CELL 5: FILTER DATA
# ============================================================
"""
LEARNING QUESTION: Why filter for 'Accepted' only?
ANSWER: [Write your understanding here]
"""

print(f"Before filtering: {len(df)} records")

# Filter for accepted candidates
df = df[df['Status'].str.lower() == 'accepted']

print(f"After filtering: {len(df)} records")
print(f"Removed: {5000 - len(df)} rejected candidates")

# ============================================================
# CELL 6: PREPARE INPUT FEATURES (X)
# ============================================================
"""
CONCEPT: Features = INPUT to the model
Features = What we KNOW (Company, Designation)
"""

X = df[['Company', 'Designation']].copy()

print(f"Input shape: {X.shape}")
print(f"This means: {X.shape[0]} rows (people), {X.shape[1]} columns (features)")

# LEARNING TASK: Look at first 10 inputs
X.head(10)

# ============================================================
# CELL 7: PREPARE OUTPUT LABELS (y)
# ============================================================
"""
CONCEPT: Labels = OUTPUT we want to predict
Labels = What we want to LEARN (Skills)
"""

# TASK: Split comma-separated skills into lists
df['Skills_List'] = df['Skills'].apply(
    lambda x: [skill.strip() for skill in str(x).split(',')]
)

# LEARNING: See the transformation
print("Original Skills (text):")
print(df.loc[0, 'Skills'])

print("\nTransformed Skills (list):")
print(df.loc[0, 'Skills_List'])

y = df['Skills_List']

# ============================================================
# CELL 8: ENCODING - COMPANY
# ============================================================
"""
CONCEPT: Computers understand numbers, not text
'Google' ‚Üí 1, 'Amazon' ‚Üí 0, etc.

LEARNING TASK: Understand LabelEncoder
"""

company_encoder = LabelEncoder()

# Fit: Learn all unique companies
company_encoder.fit(X['Company'])

# What did it learn?
print("Companies learned:")
for i, company in enumerate(company_encoder.classes_):
    print(f"  {company} ‚Üí {i}")

# Transform: Convert text to numbers
X['Company_Encoded'] = company_encoder.transform(X['Company'])

# VERIFY: Check the transformation
comparison = pd.DataFrame({
    'Original': X['Company'].head(10),
    'Encoded': X['Company_Encoded'].head(10)
})
print("\nüìä Encoding Verification:")
print(comparison)

# ============================================================
# CELL 9: ENCODING - DESIGNATION
# ============================================================
"""
TASK: Do the same for Designation
TRY YOURSELF: Complete this cell
"""

designation_encoder = LabelEncoder()

# YOUR CODE HERE:
# 1. Fit the encoder
# 2. Print the classes
# 3. Transform and add to X
# 4. Verify the transformation

# ============================================================
# CELL 10: ENCODING - SKILLS (Multi-Label)
# ============================================================
"""
CONCEPT: MultiLabelBinarizer
Input: ["Python", "SQL", "AWS"]
Output: [0, 0, 1, 0, 1, 0, 1, 0, ...]
         ^     ^        ^     ^
         |     |        |     |
      Not    Not     Has    Has
      skill1 skill2  Python  SQL
"""

mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y)

# LEARNING: Understand the output
print(f"Output shape: {y_encoded.shape}")
print(f"Interpretation: {y_encoded.shape[0]} people, {y_encoded.shape[1]} possible skills")

print(f"\nAll possible skills ({len(mlb.classes_)}):")
print(mlb.classes_)

# VISUAL: See one person's encoding
person_idx = 0
print(f"\nüë§ Person 0's skills:")
print(f"Original: {y.iloc[person_idx]}")
print(f"Encoded: {y_encoded[person_idx]}")
print(f"Interpretation:")
for i, skill in enumerate(mlb.classes_):
    if y_encoded[person_idx][i] == 1:
        print(f"  ‚úì Has {skill}")

# ============================================================
# CELL 11: PREPARE FINAL INPUTS
# ============================================================
"""
Combine encoded features
"""

X_encoded = X[['Company_Encoded', 'Designation_Encoded']]

print(f"Final input shape: {X_encoded.shape}")
print(f"Final output shape: {y_encoded.shape}")

# SANITY CHECK
assert X_encoded.shape[0] == y_encoded.shape[0], "Rows must match!"
print("‚úÖ Input and output sizes match!")

# ============================================================
# CELL 12: TRAIN-TEST SPLIT
# ============================================================
"""
CONCEPT: Split data into training (80%) and testing (20%)
Why? To test if model learned or just memorized
"""

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, 
    y_encoded,
    test_size=0.2,
    random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# LEARNING: What percentage is testing?
test_percentage = (len(X_test) / len(X_encoded)) * 100
print(f"Test percentage: {test_percentage:.1f}%")

# ============================================================
# CELL 13: CREATE MODEL
# ============================================================
"""
CONCEPT: Random Forest = 100 decision trees voting
"""

model = RandomForestClassifier(
    n_estimators=100,      # 100 trees
    random_state=42,       # For reproducibility
    n_jobs=-1,             # Use all CPU cores
    max_depth=20,          # How deep each tree
    min_samples_split=5    # Minimum samples to split
)

print("‚úÖ Model created (not trained yet)")
print(f"Model has {model.n_estimators} trees")

# ============================================================
# CELL 14: TRAIN MODEL
# ============================================================
"""
This is where LEARNING happens!
Model looks at X_train and y_train
Finds patterns: "When Company=1 and Designation=2, usually skills=[1,0,1,...]"
"""

import time

print("üéì Training started...")
start_time = time.time()

model.fit(X_train, y_train)

elapsed = time.time() - start_time
print(f"‚úÖ Training completed in {elapsed:.2f} seconds")

# ============================================================
# CELL 15: MAKE PREDICTIONS
# ============================================================
"""
Test the model on data it has NEVER seen
"""

y_pred = model.predict(X_test)

print(f"Predictions shape: {y_pred.shape}")
print(f"Made predictions for {len(y_pred)} people")

# LEARNING: Look at one prediction
person_idx = 0
print(f"\nüìä Example Prediction:")
print(f"Actual skills: {y_test[person_idx]}")
print(f"Predicted skills: {y_pred[person_idx]}")

# Count how many matched
matches = (y_test[person_idx] == y_pred[person_idx]).sum()
total = len(y_test[person_idx])
print(f"Matched: {matches}/{total} skills")

# ============================================================
# CELL 16: EVALUATE MODEL
# ============================================================
"""
CONCEPT: How good is the model?
"""

hamming = hamming_loss(y_test, y_pred)
individual_accuracy = 1 - hamming

print(f"Hamming Loss: {hamming:.4f}")
print(f"Individual Accuracy: {individual_accuracy:.4f} = {individual_accuracy*100:.2f}%")

print(f"\nInterpretation:")
print(f"  Out of 100 skill predictions, {int(individual_accuracy*100)} are correct")
print(f"  Out of 100 skill predictions, {int(hamming*100)} are wrong")

# ============================================================
# CELL 17: TEST WITH REAL EXAMPLE
# ============================================================
"""
Let's test: What skills for Google Data Scientist?
"""

# Input
company = "Google"
designation = "Data Scientist"

# Encode
company_encoded = company_encoder.transform([company])[0]
designation_encoded = designation_encoder.transform([designation])[0]

print(f"{company} ‚Üí {company_encoded}")
print(f"{designation} ‚Üí {designation_encoded}")

# Create input array
input_data = np.array([[company_encoded, designation_encoded]])

# Predict
prediction = model.predict(input_data)[0]

# Decode back to skill names
predicted_skills = []
for i, has_skill in enumerate(prediction):
    if has_skill == 1:
        predicted_skills.append(mlb.classes_[i])

print(f"\nüéØ Predicted skills for {designation} at {company}:")
for skill in predicted_skills:
    print(f"  ‚úì {skill}")

# ============================================================
# CELL 18: CALCULATE CONFIDENCE
# ============================================================
"""
ADVANCED: Get confidence scores from all trees
"""

# Get predictions from all 100 trees
all_tree_predictions = np.array([
    tree.predict(input_data)[0] 
    for tree in model.estimators_
])

# Calculate confidence (% of trees that voted yes)
confidence = all_tree_predictions.mean(axis=0)

# Get top 10 skills by confidence
top_indices = np.argsort(confidence)[::-1][:10]

print(f"üèÜ Top 10 Skills with Confidence:")
for i, idx in enumerate(top_indices, 1):
    skill = mlb.classes_[idx]
    conf = confidence[idx]
    print(f"  {i}. {skill}: {conf*100:.1f}%")

‚úÖ Libraries imported successfully!
How many rows? 5000
How many columns? 5
What are the columns? ['Company', 'Designation', 'Skills', 'Achievements', 'Status']
Unique companies: 5
Company
Microsoft     1019
Amazon        1000
Salesforce    1000
Infosys        997
Google         984
Name: count, dtype: int64
Unique designations: 5
Designation
DevOps Engineer      1035
Data Scientist       1010
Software Engineer    1005
Product Manager       992
Cloud Architect       958
Name: count, dtype: int64
Status
Accepted    3010
Rejected    1990
Name: count, dtype: int64

üìã Example Resume:
Company: Microsoft
Designation: Cloud Architect
Skills: Machine Learning, C#, AWS, Scalability, Azure, Terraform, Network Security, Python
Status: Accepted
Before: ['Company', 'Designation', 'Skills', 'Achievements', 'Status']
After: ['Company', 'Designation', 'Skills', 'Achievements', 'Status']

‚úÖ Data cleaned!
Before filtering: 5000 records
After filtering: 3010 records
Removed: 1990 rejected candidate

KeyError: "['Designation_Encoded'] not in index"

In [None]:
# EXERCISE 1: After Cell 3
"""
TASK: Answer these questions by writing code:
1. Which company appears most in the dataset?
2. What percentage of total are "Accepted"?
3. How many skills does the average person have?

WRITE YOUR CODE BELOW:
"""

# Your code here


# EXERCISE 2: After Cell 8
"""
TASK: 
1. What number is "Microsoft" encoded as?
2. Create a function that converts company names to numbers
3. What happens if you try to encode a company not in training data?

WRITE YOUR CODE BELOW:
"""

# Your code here
```

---

## ‚úÖ **My Recommended Workflow**
```
Week 1: Jupyter Learning
‚îú‚îÄ‚îÄ Day 1: Cells 1-7 (Data loading, cleaning, preparation)
‚îú‚îÄ‚îÄ Day 2: Cells 8-11 (Encoding)
‚îú‚îÄ‚îÄ Day 3: Cells 12-16 (Training, evaluation)
‚îî‚îÄ‚îÄ Day 4: Cells 17-18 (Prediction, confidence)

Week 2: Deeper Understanding
‚îú‚îÄ‚îÄ Day 5: Experiments (change parameters, compare results)
‚îú‚îÄ‚îÄ Day 6: Add visualizations (plot skill distributions, confusion matrix)
‚îî‚îÄ‚îÄ Day 7: Document your learnings

Week 3: Production
‚îú‚îÄ‚îÄ Day 8-9: Convert to VS Code .py script
‚îî‚îÄ‚îÄ Day 10: Integrate with Flask backend
`````````

SyntaxError: invalid character '‚îú' (U+251C) (2665236516.py, line 32)