In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

def create_sample_data(sample_size=1000, seed=42):
    """Create synthetic training data - in practice, you'd use real resume data.

    Each row simulates one candidate: three features in [0, 1], a noisy
    compatibility score in [0, 100] derived from them, and a binary label.

    Args:
        sample_size: Number of candidate rows to generate. Defaults to 1000.
        seed: Seed for the private random generator. Defaults to 42, which
            reproduces the values the original global-seeded version produced.

    Returns:
        A pandas DataFrame with the columns 'skill_match',
        'experience_level', 'education_level', 'compatibility_score' and
        'suitable' (1 when the score is >= 70, else 0). The columns are
        present even when ``sample_size`` is 0.
    """
    columns = [
        'skill_match',
        'experience_level',
        'education_level',
        'compatibility_score',
        'suitable',
    ]

    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the process-wide generator
    # untouched so callers' own random state is not silently reset.
    rng = np.random.RandomState(seed)

    education_values = [0.2, 0.4, 0.6, 0.8, 1.0]
    education_probs = [0.1, 0.2, 0.4, 0.2, 0.1]

    data = []
    # The draw order (beta, exponential, choice, normal) is kept per row so the
    # generated values stay identical to the original implementation.
    for _ in range(sample_size):
        # Simulate features
        skill_match = rng.beta(2, 2)  # Most candidates have moderate skill match
        # Few candidates have high experience; cap at 1 once so the score and
        # the stored feature always agree.
        experience_level = min(rng.exponential(0.5), 1)
        education_level = rng.choice(education_values, p=education_probs)

        # Simulate target variable (compatibility): weighted sum scaled to 0-100
        base_score = (skill_match * 0.5 + experience_level * 0.3 + education_level * 0.2) * 100
        noise = rng.normal(0, 10)
        final_score = max(0, min(100, base_score + noise))

        # Binary classification: suitable (1) or not (0)
        suitable = 1 if final_score >= 70 else 0

        data.append({
            'skill_match': skill_match,
            'experience_level': experience_level,
            'education_level': education_level,
            'compatibility_score': final_score,
            'suitable': suitable,
        })

    return pd.DataFrame(data, columns=columns)

def train_and_save_model(model_path='model/trained_model.pkl'):
    """Train the ML model, print its evaluation, and save it to disk.

    Builds the synthetic dataset with ``create_sample_data()``, fits a
    Random Forest classifier on an 80/20 train/test split, prints accuracy
    and a classification report for the held-out split, and serializes the
    fitted model with joblib.

    Args:
        model_path: File the fitted model is written to. Defaults to
            'model/trained_model.pkl'. Its parent directory is created when
            it does not exist.

    Returns:
        The fitted RandomForestClassifier.
    """
    print("Creating sample training data...")
    df = create_sample_data()

    # Prepare features and target
    feature_columns = ['skill_match', 'experience_level', 'education_level']
    X = df[feature_columns]
    y = df['suitable']

    # Split data; the fixed random_state keeps the split reproducible.
    # NOTE(review): the 'suitable' class is the minority under the >= 70
    # cutoff; passing stratify=y may give a more representative test split -
    # confirm before changing, since it alters the reported metrics.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    print("Training Random Forest model...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate model on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Create the destination directory if it doesn't exist. A bare filename
    # has no directory part, and os.makedirs('') would raise, so skip it then.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Save model
    joblib.dump(model, model_path)
    print(f"Model saved to '{model_path}'")

    return model

# Entry point: train and save the model only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    train_and_save_model()

Creating sample training data...
Training Random Forest model...
Model accuracy: 0.895

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       173
           1       0.67      0.44      0.53        27

    accuracy                           0.90       200
   macro avg       0.79      0.70      0.74       200
weighted avg       0.88      0.90      0.89       200

Model saved to 'model/trained_model.pkl'
