In [None]:
!pip install lightgbm pandas scikit-learn joblib
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
import joblib


### Feature Engineering

In [None]:
class FeatureEngineer:
    """Turn raw rows into model features, remembering every fitted transformer.

    The first call to :meth:`transform` fits one ``LabelEncoder`` per
    categorical column and one ``TfidfVectorizer`` per text column. Later
    calls (for example at inference time) reuse those fitted objects, so the
    produced feature columns line up with the ones seen during training.
    """

    # Code assigned to categorical values that were not present when the
    # encoder for that column was fitted.
    UNSEEN_CODE = -1

    def __init__(self):
        # Fitted transformers, keyed by the name of the source column.
        self.label_encoders = {}
        self.tfidf_vectorizers = {}

    def extract_year(self, date_str):
        """Return the first run of four digits in ``date_str`` as an int.

        Returns ``None`` when the value is missing or contains no such run
        (e.g. free text without a year).
        """
        if pd.isna(date_str):
            return None
        match = re.search(r'\d{4}', str(date_str))
        return int(match.group()) if match else None

    def process_dates(self, df):
        """Add ``experience_years`` = year(end_dates) - year(start_dates).

        Rows where either year cannot be extracted get 0. The column is added
        to ``df`` in place and ``df`` is returned.
        """
        spans = []
        for start_raw, end_raw in zip(df['start_dates'], df['end_dates']):
            # Parse each side once instead of re-running the regex per use.
            start_year = self.extract_year(start_raw)
            end_year = self.extract_year(end_raw)
            spans.append(end_year - start_year if start_year and end_year else 0)
        df['experience_years'] = spans
        return df

    def process_categorical(self, df, col):
        """Add ``<col>_encoded`` holding an integer code for each value of ``col``.

        Missing values are encoded as the literal label ``'MISSING'``. On the
        first call for ``col`` the encoder is fitted on ``df``; afterwards the
        stored encoder is reused. Values that the stored encoder never saw are
        mapped to ``UNSEEN_CODE`` instead of raising, so new data containing
        new categories can still be transformed.
        """
        values = df[col].fillna('MISSING')
        if col not in self.label_encoders:
            # Register the encoder only after a successful fit so a failure
            # cannot leave an unfitted encoder behind.
            self.label_encoders[col] = LabelEncoder().fit(values)
        # classes_ is the sorted label array; a label's position is its code,
        # which is exactly what LabelEncoder.transform returns for known labels.
        code_of = {label: code
                   for code, label in enumerate(self.label_encoders[col].classes_)}
        df[f'{col}_encoded'] = (
            values.map(code_of).fillna(self.UNSEEN_CODE).astype('int64')
        )
        return df

    def process_text(self, df, col):
        """Return a DataFrame of TF-IDF features for text column ``col``.

        The vectorizer (at most 100 terms) is fitted on the first call for
        ``col`` and reused afterwards. The result shares ``df``'s index and
        its columns are named ``<col>_tfidf_<i>``; ``df`` itself is not modified.
        """
        documents = df[col].fillna('')
        if col not in self.tfidf_vectorizers:
            vectorizer = TfidfVectorizer(max_features=100)
            matrix = vectorizer.fit_transform(documents)
            self.tfidf_vectorizers[col] = vectorizer
        else:
            matrix = self.tfidf_vectorizers[col].transform(documents)
        names = [f'{col}_tfidf_{i}' for i in range(matrix.shape[1])]
        return pd.DataFrame(matrix.toarray(), index=df.index, columns=names)

    @staticmethod
    def _skill_set(raw):
        """Split a comma-separated string into a set of non-empty, stripped tokens."""
        return {token.strip() for token in raw.split(',') if token.strip()}

    def process_skills_match(self, df):
        """Add ``skills_match_ratio``: share of required skills the candidate lists.

        Both ``skills_required`` and ``skills`` are treated as comma-separated
        lists; missing values are replaced by ``''`` in ``df``. Tokens are
        whitespace-stripped and empty tokens are dropped, so a row with no
        required skills scores 0 rather than matching on the empty string.
        Comparison is case-sensitive.
        """
        df['skills_required'] = df['skills_required'].fillna('')
        df['skills'] = df['skills'].fillna('')
        ratios = []
        for required_raw, candidate_raw in zip(df['skills_required'], df['skills']):
            required = self._skill_set(required_raw)
            candidate = self._skill_set(candidate_raw)
            ratios.append(len(required & candidate) / len(required) if required else 0)
        df['skills_match_ratio'] = ratios
        return df

    def transform(self, df):
        """Return a new frame: the input columns plus all engineered features.

        Works on a copy so the caller's frame is left untouched (the helper
        methods add columns in place).
        """
        df = df.copy()
        df = self.process_dates(df)
        for col in ['degree_names', 'result_types', 'major_field_of_studies']:
            df = self.process_categorical(df, col)
        text_features = ['skills', 'career_objective', 'responsibilities']
        tfidf_combined = pd.concat(
            [self.process_text(df, col) for col in text_features], axis=1
        )
        df = self.process_skills_match(df)
        return pd.concat([df, tfidf_combined], axis=1)


### Training and Cross-Validation

In [None]:
# Load the training data and build features. `fe` keeps the fitted encoders and
# TF-IDF vectorizers so the same transformation can be replayed at inference.
train_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')
fe = FeatureEngineer()
train_df = fe.transform(train_df)

# Feature columns are selected by the naming conventions FeatureEngineer uses.
tfidf_cols = [col for col in train_df.columns if '_tfidf_' in col]
cat_cols = [col for col in train_df.columns if 'encoded' in col]
num_cols = ['experience_years', 'skills_match_ratio']
feature_cols = tfidf_cols + cat_cols + num_cols

X = train_df[feature_cols]
y = train_df['matched_score']

# Hyper-parameters are identical for every fold, so define them once.
params = {
    'objective': 'regression_l2',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'max_depth': 8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
best_iterations = []  # early-stopped round count of each fold

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    train_data = lgbm.Dataset(X_train, label=y_train)
    val_data = lgbm.Dataset(X_val, label=y_val)

    fold_model = lgbm.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, val_data],
        callbacks=[lgbm.early_stopping(stopping_rounds=50), lgbm.log_evaluation(100)]
    )

    val_preds = fold_model.predict(X_val)
    cv_scores.append(mean_squared_error(y_val, val_preds))
    # Fall back to the trained round count if no best iteration was recorded.
    best_iterations.append(fold_model.best_iteration or fold_model.current_iteration())

print(f"CV MSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")

# The fold models only exist to estimate generalisation error and a sensible
# number of boosting rounds. Persisting whichever fold ran last would ship a
# model fitted on 80% of the data, so refit on all rows with the average
# early-stopped round count and save that instead.
final_rounds = max(1, int(round(np.mean(best_iterations))))
print(f"Training final model on all data for {final_rounds} rounds")
model = lgbm.train(params, lgbm.Dataset(X, label=y), num_boost_round=final_rounds)

model.save_model('trained_lgbm_model.txt')
joblib.dump(fe, 'trained_feature_engineer.pkl')


### Inference

In [None]:
# Restore the fitted feature pipeline and the trained booster from disk.
# NOTE: unpickling `fe` needs the FeatureEngineer class to be defined in this
# kernel, and joblib/pickle files should only be loaded from trusted sources.
fe = joblib.load('trained_feature_engineer.pkl')
model = lgbm.Booster(model_file='trained_lgbm_model.txt')

# Take the feature list (and its order) from the saved model rather than from a
# variable left behind by the training cell, so this cell also works on a fresh
# kernel where only the saved artefacts are available.
feature_cols = model.feature_name()

test_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')
test_df = fe.transform(test_df)

predictions = model.predict(test_df[feature_cols])
submission = pd.DataFrame({'ID': test_df['ID'], 'matched_score': predictions})
submission.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")
