In [None]:
!pip install sentence-transformers

In [None]:
import re
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
import lightgbm as lgbm
import joblib

# -----------------------------
#  TextProcessor class
# -----------------------------
class TextProcessor:
    """Text helpers: sentence embeddings and parsing of list-like strings.

    NOTE: a later notebook cell defines a class with this same name; when the
    cells run top to bottom, that later definition is the one in effect.
    """

    def __init__(self):
        # Load the pre-trained SentenceTransformer model once
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _is_missing(value):
        """Return True for missing scalars (None, NaN, pd.NA, NaT)."""
        # pd.isna() on a container returns an array whose truth value is
        # ambiguous, so only scalars are ever treated as missing.
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    def process_list(self, text):
        """Parse a cell value into a list of items.

        Accepted inputs:
          * an existing list/tuple/set          -> returned as a list
          * None / NaN / empty or blank string  -> []
          * a Python-literal collection string, e.g. "['a', 'b']"
          * a comma-separated string, e.g. "a, b" -> ['a', 'b']

        Comma-separated tokens are stripped of surrounding whitespace and
        empty tokens are dropped, so "a, b" and "a,b" parse identically.

        Returns:
            list: the parsed items (never a scalar or a bare string).
        """
        if isinstance(text, (list, tuple, set, frozenset)):
            return list(text)
        if self._is_missing(text):
            return []

        text = str(text).strip()
        if not text:
            return []

        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat it as comma-separated text below.
            parsed = None

        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        if isinstance(parsed, str):
            # The cell held a quoted string; split its unquoted content.
            text = parsed

        # Scalars (e.g. "2020") and plain text both fall through to here.
        return [token.strip() for token in text.split(',') if token.strip()]

    def get_embeddings(self, texts):
        """Encode an iterable of texts with the sentence-transformer model.

        Missing entries (None/NaN) are encoded as the empty string; every
        other entry is converted with ``str()`` first.

        Returns:
            Whatever ``self.model.encode`` returns for the cleaned list.
        """
        cleaned = ['' if self._is_missing(t) else str(t) for t in texts]
        return self.model.encode(cleaned)

# -----------------------------
#  FeatureEngineer class
# -----------------------------
class FeatureEngineer:
    """Build model features from a raw resume/job DataFrame.

    ``transform`` reads the columns ``start_dates``, ``end_dates``,
    ``degree_names``, ``result_types``, ``major_field_of_studies``,
    ``skills``, ``career_objective``, ``responsibilities`` and
    ``skills_required`` and adds ``experience_years``, ``<col>_encoded``,
    ``<feature>_emb_<i>`` and ``skills_match_ratio``.

    Label encoders are fitted the first time a column is seen and reused on
    later calls, so the same instance must be used for training and inference.

    NOTE: a later notebook cell defines a class with this same name; when the
    cells run top to bottom, that later definition is the one in effect.
    """

    # Placeholder category for missing values; also the fallback category for
    # labels that were not present when the encoder was fitted.
    _MISSING_LABEL = 'MISSING'
    # Code used for unseen labels when the fitted encoder has no placeholder.
    _UNKNOWN_CODE = -1

    def __init__(self):
        self.text_processor = TextProcessor()
        # column name -> fitted LabelEncoder
        self.label_encoders = {}

    def extract_year(self, date_str):
        """Return the first 4-digit run in ``date_str`` as an int, else None."""
        if pd.api.types.is_scalar(date_str) and pd.isna(date_str):
            return None
        match = re.search(r'\d{4}', str(date_str))
        return int(match.group()) if match else None

    def process_dates(self, df):
        """Add ``experience_years`` = end year - start year (0 if unknown).

        The column is written onto ``df`` itself, which is also returned.
        """
        durations = []
        for start_raw, end_raw in zip(df['start_dates'], df['end_dates']):
            # Parse each date once per row.
            start_year = self.extract_year(start_raw)
            end_year = self.extract_year(end_raw)
            if start_year is None or end_year is None:
                durations.append(0)
            else:
                durations.append(end_year - start_year)
        df['experience_years'] = durations
        return df

    def process_categorical(self, df, col):
        """Add ``<col>_encoded`` with integer codes for the categories in ``col``.

        First call for a column: fit a LabelEncoder on the values (NaN is
        replaced by 'MISSING') and store it. Later calls reuse the stored
        encoder; labels it has never seen are mapped to the 'MISSING' code
        when that class exists, otherwise to -1, instead of raising.
        """
        values = df[col].fillna(self._MISSING_LABEL)
        encoder = self.label_encoders.get(col)

        if encoder is None:
            encoder = LabelEncoder()
            df[f'{col}_encoded'] = encoder.fit_transform(values)
            # Register only after a successful fit so a failed fit cannot
            # leave an unfitted encoder behind.
            self.label_encoders[col] = encoder
        else:
            # Same codes LabelEncoder.transform assigns (index in classes_),
            # but tolerant of labels missing from classes_.
            code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
            fallback = code_by_label.get(self._MISSING_LABEL, self._UNKNOWN_CODE)
            df[f'{col}_encoded'] = values.map(code_by_label).fillna(fallback).astype(int)
        return df

    @staticmethod
    def _normalize_skills(items):
        """Return the set of stripped, lower-cased, non-empty skill strings."""
        if isinstance(items, str) or not hasattr(items, '__iter__'):
            items = [items]
        normalized = {str(item).strip().lower() for item in items}
        normalized.discard('')
        return normalized

    @classmethod
    def _skills_match_ratio(cls, required, candidate):
        """Fraction of required skills the candidate has (0.0 if none required)."""
        required_set = cls._normalize_skills(required)
        if not required_set:
            return 0.0
        matched = required_set & cls._normalize_skills(candidate)
        return len(matched) / len(required_set)

    def transform(self, df):
        """Run the full feature pipeline and return the augmented DataFrame."""
        # Process dates
        df = self.process_dates(df)

        # Process categorical
        for col in ['degree_names', 'result_types', 'major_field_of_studies']:
            df = self.process_categorical(df, col)

        # Get embeddings for text features: one frame per feature, with one
        # column per embedding dimension.
        text_features = ['skills', 'career_objective', 'responsibilities']
        embedding_frames = []
        for feature in text_features:
            embeddings = self.text_processor.get_embeddings(df[feature])
            columns = [f'{feature}_emb_{i}' for i in range(embeddings.shape[1])]
            embedding_frames.append(
                pd.DataFrame(embeddings, index=df.index, columns=columns)
            )

        # Concatenate all embeddings at once
        df = pd.concat([df, *embedding_frames], axis=1)

        # Skills matching score (case- and whitespace-insensitive)
        df['skills_required'] = df['skills_required'].fillna('')
        df['skills'] = df['skills'].fillna('')
        parse = self.text_processor.process_list
        df['skills_match_ratio'] = [
            self._skills_match_ratio(parse(required), parse(candidate))
            for required, candidate in zip(df['skills_required'], df['skills'])
        ]

        return df


In [None]:
# -----------------------------
#  TextProcessor class
# -----------------------------
class TextProcessor:
    """Text helpers: sentence embeddings and parsing of list-like strings.

    NOTE: an earlier notebook cell also defines a class with this name; when
    the cells run top to bottom, this definition replaces the earlier one.
    """

    def __init__(self):
        # Load the pre-trained SentenceTransformer model once
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _is_missing(value):
        """Return True for missing scalars (None, NaN, pd.NA, NaT)."""
        # pd.isna() on a container returns an array whose truth value is
        # ambiguous, so only scalars are ever treated as missing.
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    def process_list(self, text):
        """Parse a cell value into a list of items.

        Accepted inputs:
          * an existing list/tuple/set          -> returned as a list
          * None / NaN / empty or blank string  -> []
          * a Python-literal collection string, e.g. "['a', 'b']"
          * a comma-separated string, e.g. "a, b" -> ['a', 'b']

        Comma-separated tokens are stripped of surrounding whitespace and
        empty tokens are dropped, so "a, b" and "a,b" parse identically.

        Returns:
            list: the parsed items (never a scalar or a bare string).
        """
        if isinstance(text, (list, tuple, set, frozenset)):
            return list(text)
        if self._is_missing(text):
            return []

        text = str(text).strip()
        if not text:
            return []

        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat it as comma-separated text below.
            parsed = None

        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        if isinstance(parsed, str):
            # The cell held a quoted string; split its unquoted content.
            text = parsed

        # Scalars (e.g. "2020") and plain text both fall through to here.
        return [token.strip() for token in text.split(',') if token.strip()]

    def get_embeddings(self, texts):
        """Encode an iterable of texts with the sentence-transformer model.

        Missing entries (None/NaN) are encoded as the empty string; every
        other entry is converted with ``str()`` first.

        Returns:
            Whatever ``self.model.encode`` returns for the cleaned list.
        """
        cleaned = ['' if self._is_missing(t) else str(t) for t in texts]
        return self.model.encode(cleaned)

# -----------------------------
#  FeatureEngineer class
# -----------------------------
class FeatureEngineer:
    """Build model features from a raw resume/job DataFrame.

    ``transform`` reads the columns ``start_dates``, ``end_dates``,
    ``degree_names``, ``result_types``, ``major_field_of_studies``,
    ``skills``, ``career_objective``, ``responsibilities`` and
    ``skills_required`` and adds ``experience_years``, ``<col>_encoded``,
    ``<feature>_emb_<i>`` and ``skills_match_ratio``.

    Label encoders are fitted the first time a column is seen and reused on
    later calls, so the same instance must be used for training and inference.

    NOTE: an earlier notebook cell also defines a class with this name; when
    the cells run top to bottom, this definition replaces the earlier one.
    """

    # Placeholder category for missing values; also the fallback category for
    # labels that were not present when the encoder was fitted.
    _MISSING_LABEL = 'MISSING'
    # Code used for unseen labels when the fitted encoder has no placeholder.
    _UNKNOWN_CODE = -1

    def __init__(self):
        self.text_processor = TextProcessor()
        # column name -> fitted LabelEncoder
        self.label_encoders = {}

    def extract_year(self, date_str):
        """Return the first 4-digit run in ``date_str`` as an int, else None."""
        if pd.api.types.is_scalar(date_str) and pd.isna(date_str):
            return None
        match = re.search(r'\d{4}', str(date_str))
        return int(match.group()) if match else None

    def process_dates(self, df):
        """Add ``experience_years`` = end year - start year (0 if unknown).

        The column is written onto ``df`` itself, which is also returned.
        """
        durations = []
        for start_raw, end_raw in zip(df['start_dates'], df['end_dates']):
            # Parse each date once per row.
            start_year = self.extract_year(start_raw)
            end_year = self.extract_year(end_raw)
            if start_year is None or end_year is None:
                durations.append(0)
            else:
                durations.append(end_year - start_year)
        df['experience_years'] = durations
        return df

    def process_categorical(self, df, col):
        """Add ``<col>_encoded`` with integer codes for the categories in ``col``.

        First call for a column: fit a LabelEncoder on the values (NaN is
        replaced by 'MISSING') and store it. Later calls reuse the stored
        encoder; labels it has never seen are mapped to the 'MISSING' code
        when that class exists, otherwise to -1, instead of raising.
        """
        values = df[col].fillna(self._MISSING_LABEL)
        encoder = self.label_encoders.get(col)

        if encoder is None:
            encoder = LabelEncoder()
            df[f'{col}_encoded'] = encoder.fit_transform(values)
            # Register only after a successful fit so a failed fit cannot
            # leave an unfitted encoder behind.
            self.label_encoders[col] = encoder
        else:
            # Same codes LabelEncoder.transform assigns (index in classes_),
            # but tolerant of labels missing from classes_.
            code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
            fallback = code_by_label.get(self._MISSING_LABEL, self._UNKNOWN_CODE)
            df[f'{col}_encoded'] = values.map(code_by_label).fillna(fallback).astype(int)
        return df

    @staticmethod
    def _normalize_skills(items):
        """Return the set of stripped, lower-cased, non-empty skill strings."""
        if isinstance(items, str) or not hasattr(items, '__iter__'):
            items = [items]
        normalized = {str(item).strip().lower() for item in items}
        normalized.discard('')
        return normalized

    @classmethod
    def _skills_match_ratio(cls, required, candidate):
        """Fraction of required skills the candidate has (0.0 if none required)."""
        required_set = cls._normalize_skills(required)
        if not required_set:
            return 0.0
        matched = required_set & cls._normalize_skills(candidate)
        return len(matched) / len(required_set)

    def transform(self, df):
        """Run the full feature pipeline and return the augmented DataFrame."""
        # Process dates
        df = self.process_dates(df)

        # Process categorical
        for col in ['degree_names', 'result_types', 'major_field_of_studies']:
            df = self.process_categorical(df, col)

        # Get embeddings for text features: one frame per feature, with one
        # column per embedding dimension.
        text_features = ['skills', 'career_objective', 'responsibilities']
        embedding_frames = []
        for feature in text_features:
            embeddings = self.text_processor.get_embeddings(df[feature])
            columns = [f'{feature}_emb_{i}' for i in range(embeddings.shape[1])]
            embedding_frames.append(
                pd.DataFrame(embeddings, index=df.index, columns=columns)
            )

        # Concatenate all embeddings at once
        df = pd.concat([df, *embedding_frames], axis=1)

        # Skills matching score (case- and whitespace-insensitive)
        df['skills_required'] = df['skills_required'].fillna('')
        df['skills'] = df['skills'].fillna('')
        parse = self.text_processor.process_list
        df['skills_match_ratio'] = [
            self._skills_match_ratio(parse(required), parse(candidate))
            for required, candidate in zip(df['skills_required'], df['skills'])
        ]

        return df


In [None]:
# Read the raw training table from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')

# One FeatureEngineer instance carries the fitted encoders for later reuse.
fe = FeatureEngineer()

# Add engineered columns (dates, encoded categoricals, embeddings, skills ratio).
print("Transforming train data...")
train_df = fe.transform(train_df)

# Pick the model inputs by column-name pattern, keeping frame order:
# embedding columns (e.g. 'skills_emb_0'), then encoded categoricals,
# then the two numeric features.
all_columns = train_df.columns
emb_cols = all_columns[all_columns.str.contains('emb_', regex=False)].tolist()
cat_cols = all_columns[all_columns.str.contains('encoded', regex=False)].tolist()
num_cols = ['experience_years', 'skills_match_ratio']
feature_cols = [*emb_cols, *cat_cols, *num_cols]

# Feature matrix and regression target.
X = train_df.loc[:, feature_cols]
y = train_df.loc[:, 'matched_score']

In [None]:
# from sklearn.model_selection import train_test_split

# # Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create LightGBM datasets
# train_data = lgbm.Dataset(X_train, label=y_train)
# val_data = lgbm.Dataset(X_val, label=y_val, reference=train_data)
# # LightGBM Parameters
# params = {
#     'objective': 'regression_l2',
#     'metric': 'l2',
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'max_depth': 8,
#     'reg_alpha': 0.1,
#     'reg_lambda': 0.1,
#     'device': 'gpu',  # Specify GPU usage
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
#     'gpu_use_dp': True
# }
# # Initialize KFold
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores = []
# test_preds = np.zeros(len(test_df))

# # Cross-Validation Loop
# for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
#     print(f"Training fold {fold + 1}")
    
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
#     train_data = lgbm.Dataset(X_train, label=y_train)
#     val_data = lgbm.Dataset(X_val, label=y_val)
    
#     model = lgbm.train(
#         params,
#         train_data,
#         num_boost_round=1000,
#         valid_sets=[train_data, val_data],
#         valid_names=['train', 'valid'],
#         callbacks=[
#             lgbm.early_stopping(stopping_rounds=50),
#             lgbm.log_evaluation(100)
#         ]
#     )
    
#     # Evaluate the fold
#     val_preds = model.predict(X_val)
#     fold_mse = mean_squared_error(y_val, val_preds)
#     cv_scores.append(fold_mse)
#     print(f"Fold {fold + 1} MSE: {fold_mse:.6f}")
    
#     # Predict on test data
#     test_preds += model.predict(test_df[feature_cols]) / kf.n_splits

# # Print overall CV results
# print(f"\nAverage CV MSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")

# # Save the final model from the last fold
# model.save_model('lgbm_model.txt')
# print("LightGBM model saved to lgbm_model.txt")

# # Save the FeatureEngineer
# joblib.dump(fe, 'feature_engineer.pkl')
# print("FeatureEngineer saved to feature_engineer.pkl")


In [None]:
from sklearn.metrics import mean_squared_error
# train_test_split is called below; its only other import in this notebook is
# commented out, so it has to be imported here for a fresh-kernel run.
from sklearn.model_selection import train_test_split
import joblib
import lightgbm as lgbm

# Split the data into training and validation sets
# (X and y come from the feature-selection cell)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM datasets
train_data = lgbm.Dataset(X_train, label=y_train)
val_data = lgbm.Dataset(X_val, label=y_val, reference=train_data)

# LightGBM Parameters
params = {
    'objective': 'regression_l2',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'max_depth': 8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'device': 'gpu',  # Specify GPU usage (needs a GPU-enabled LightGBM runtime)
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'gpu_use_dp': True
}

# Train the model with validation
model = lgbm.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],  # Add validation dataset here
    valid_names=['train', 'valid'],    # Name the datasets
    callbacks=[
        lgbm.early_stopping(stopping_rounds=50),  # Stop if no improvement in 50 rounds
        lgbm.log_evaluation(100)                 # Log every 100 iterations
    ]
)

# Predict on validation set to compute validation MSE
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.6f}")

# Save the model
model.save_model('lgbm_model.txt')
print("LightGBM model saved to lgbm_model.txt")

# Save the FeatureEngineer (fe) pipeline if needed for later preprocessing
joblib.dump(fe, 'feature_engineer.pkl')
print("FeatureEngineer saved to feature_engineer.pkl")


In [None]:
# Recompute the validation MSE. This cell defines none of `model`, `X_val`
# or `y_val`; they must already exist in the kernel (the training cell
# creates them).
from sklearn.metrics import mean_squared_error

# Predict on the validation set
val_preds = model.predict(X_val)

# Calculate MSE
val_mse = mean_squared_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.6f}")


In [None]:
# Load Feature Engineer and Model
# (joblib.load unpickles the file; only load artifacts written by this notebook)
fe = joblib.load('feature_engineer.pkl')
print("Loaded FeatureEngineer from feature_engineer.pkl")
model = lgbm.Booster(model_file='lgbm_model.txt')
print("Loaded LightGBM model from lgbm_model.txt")

# Load new test data
test_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')

# Transform the new data
# (reuses the encoders stored inside the loaded FeatureEngineer)
print("Transforming test data...")
test_df = fe.transform(test_df)

# Select features for inference
# (same name patterns and order as the training feature selection)
emb_cols = [col for col in test_df.columns if 'emb_' in col]
cat_cols = [col for col in test_df.columns if 'encoded' in col]
num_cols = ['experience_years', 'skills_match_ratio']
feature_cols = emb_cols + cat_cols + num_cols

# Predict
predictions = model.predict(test_df[feature_cols])

# Prepare the submission
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'matched_score': predictions
})

# Use one variable for the output path so the log message always names the
# file that was actually written.
submission_path = 'inference_predictions_2.csv'
submission.to_csv(submission_path, index=False)
print(f"Predictions saved to {submission_path}")


In [None]:
    # submission = pd.DataFrame({
    #     'ID': test_df['ID'],
    #     'matched_score': test_preds
    # })
    # submission.to_csv('submission.csv', index=False)
    # print("Submission saved to submission.csv")

In [None]:
# NOTE(review): no visible cell defines `train_model`. If this cell runs with
# `__name__ == "__main__"` (the usual case in a notebook kernel) the call below
# raises NameError unless the function is defined elsewhere — TODO confirm the
# intended entry point or remove this cell.
if __name__ == "__main__":
    train_model()