In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bitfest-datathon-2025/sample_submission.csv
/kaggle/input/bitfest-datathon-2025/train.csv
/kaggle/input/bitfest-datathon-2025/test.csv


In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Load the train and test datasets
new_train_data = pd.read_csv("/kaggle/input/bitfest-datathon-2025/train.csv")
new_test_data = pd.read_csv("/kaggle/input/bitfest-datathon-2025/test.csv")

# Check basic structure of the datasets
print("Train Dataset Overview:\n")
# print(new_train_data.info())
print("\nTest Dataset Overview:\n")
# print(new_test_data.info())

# new_train_data['responsibilities']

Train Dataset Overview:


Test Dataset Overview:



In [13]:
# Step 1: Handle missing values
new_train_data.fillna("Unknown", inplace=True)  # Replace null/None/N/A with "Unknown"
# Alternatively, drop rows/columns with too many missing values
new_train_data.dropna(thresh=int(new_train_data.shape[1] * 0.8), axis=0, inplace=True)  # Drop rows with >80% missing

In [14]:
new_train_data.drop_duplicates(inplace=True)

In [15]:
from sklearn.impute import KNNImputer
import re

# Step 1: Define column groups based on type and relevance
text_columns = ['address','career_objective','locations', 'extra_curricular_activity_types','extra_curricular_organization_links','online_links']
numerical_columns = ['matched_score']
date_columns = ['start_dates', 'end_dates', 'issue_dates', 'expiry_dates']
# Define columns for KNN Imputation
knn_columns = ['age_requirement','experiencere_requirement']  # Add other columns if needed
# Get all column names in the DataFrame
all_columns = set(new_train_data.columns)

# Combine all defined column groups
defined_columns = set(text_columns + numerical_columns + date_columns + knn_columns)

# Identify columns that are not in the defined groups
categorical_columns = list(all_columns - defined_columns)

# Step 2: Preprocessing Function for Age Column
def preprocess_age_requirement(column):
    # Extract numeric ranges and replace non-numeric with NaN
    def extract_mean_age(val):
        if isinstance(val, str):
            # Find ranges like "Age 25 to 35 years" and compute the mean
            match = re.search(r'(\d+)\s*to\s*(\d+)', val)
            if match:
                return (int(match.group(1)) + int(match.group(2))) / 2
            # Find single ages like "Age 25 years"
            match = re.search(r'(\d+)', val)
            if match:
                return int(match.group(1))
        return None  # Return None for non-numeric values

    return column.apply(extract_mean_age)

def preprocess_experience_requirement(column):
    """
    Preprocess the experience_requirement column by extracting numeric ranges or single values
    and replacing non-numeric entries with NaN.
    """
    def extract_mean_experience(val):
        if isinstance(val, str):
            # Find ranges like "3 to 5 years" and compute the mean
            range_match = re.search(r'(\d+)\s*to\s*(\d+)', val)
            if range_match:
                return (int(range_match.group(1)) + int(range_match.group(2))) / 2
            
            # Find "At least X year(s)" or similar patterns
            at_least_match = re.search(r'At least (\d+)', val)
            if at_least_match:
                return int(at_least_match.group(1))
            
            # Find single experience values like "1 year" or "2 year(s)"
            single_match = re.search(r'(\d+)', val)
            if single_match:
                return int(single_match.group(1))
        
        # Return None for non-numeric or unprocessable entries
        return None

    # Apply the extraction logic to the entire column
    return column.apply(extract_mean_experience)

# Step 3: Preprocess Age Requirement
new_train_data['age_requirement'] = preprocess_age_requirement(new_train_data['age_requirement'])
new_test_data['age_requirement'] = preprocess_age_requirement(new_test_data['age_requirement'])
new_train_data['experiencere_requirement'] = preprocess_experience_requirement(new_train_data['experiencere_requirement'])
new_test_data['experiencere_requirement'] = preprocess_experience_requirement(new_test_data['experiencere_requirement'])

# Step 4: Impute Categorical Columns
for col in categorical_columns:
    mode_value = new_train_data[col].mode()[0] if not new_train_data[col].mode().empty else "Not Specified"
    new_train_data[col].fillna(mode_value, inplace=True)
    new_test_data[col].fillna(mode_value, inplace=True)

# Step 5: Impute Text Columns with Placeholder
for col in text_columns:
    new_train_data[col].fillna("No Information", inplace=True)
    new_test_data[col].fillna("No Information", inplace=True)



for col in numerical_columns:
    if col in new_train_data.columns:  # Check if column exists in train data
        median_value = new_train_data[col].median()
        new_train_data[col].fillna(median_value, inplace=True)
    if col in new_test_data.columns:  # Check if column exists in test data
        median_value = new_train_data[col].median()  # Use train data's median for consistency
        new_test_data[col].fillna(median_value, inplace=True)
    else:
        print(f"'{col}' not found in test data.")

# Step 7: Handle Date Columns (Fill with placeholder or special handling)
for col in date_columns:
    new_train_data[col].fillna("Unknown Date", inplace=True)
    new_test_data[col].fillna("Unknown Date", inplace=True)

# Ensure the selected columns are numeric
for col in knn_columns:
    new_train_data[col] = pd.to_numeric(new_train_data[col], errors='coerce')
    new_test_data[col] = pd.to_numeric(new_test_data[col], errors='coerce')

# Check for all-NaN columns and fill temporarily
for col in knn_columns:
    if new_train_data[col].isnull().all():
        new_train_data[col].fillna(0, inplace=True)  # Replace with a temporary value
    if new_test_data[col].isnull().all():
        new_test_data[col].fillna(0, inplace=True)  # Replace with a temporary value

# Initialize the KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Apply KNN Imputation on Train Data
knn_train_data = pd.DataFrame(
    imputer.fit_transform(new_train_data[knn_columns]),
    columns=knn_columns,
    index=new_train_data.index
)
new_train_data[knn_columns] = knn_train_data

# Apply KNN Imputation on Test Data
knn_test_data = pd.DataFrame(
    imputer.transform(new_test_data[knn_columns]),
    columns=knn_columns,
    index=new_test_data.index
)
new_test_data[knn_columns] = knn_test_data

print("KNN Imputation applied successfully!")


# Step 9: Create Indicator Columns for Missing Data
for col in new_train_data.columns:
    if new_train_data[col].isnull().any():
        new_train_data[f'{col}_missing'] = new_train_data[col].isnull().astype(int)
        new_test_data[f'{col}_missing'] = new_test_data[col].isnull().astype(int)

# Print completion message
print("Missing values handled successfully with a refined strategy.")

'matched_score' not found in test data.
KNN Imputation applied successfully!
Missing values handled successfully with a refined strategy.


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
from sklearn.svm import SVR
import re
from ast import literal_eval

In [17]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [58]:
# class TextProcessor:
#     def __init__(self, max_features=200):
#         self.tfidf_models = {}
#         self.count_models = {}
#         self.svd_models = {}
#         self.max_features = max_features

#         # Initialize lemmatizer and stopwords
#         self.stop_words = set(stopwords.words('english'))
        
#         # Example for domain-specific abbreviations
#         self.abbreviation_dict = {
#             "etc": "et cetera",  # expanding abbreviations as an example
#             "info": "information"
#             # You can add more abbreviations or domain-specific terms here
#         }

#     def expand_abbreviations(self, text):
#         # Replace abbreviations with full form
#         for abbr, full_form in self.abbreviation_dict.items():
#             text = text.replace(abbr, full_form)
#         return text
    
#     def clean_text(self, text):
#         if pd.isna(text):
#             return ''
#         text = str(text).lower()
#         # Expand abbreviations
#         text = self.expand_abbreviations(text)
#         text = re.sub(r'[^\w\s]', ' ', text)
#         text = ' '.join([word for word in text.split() if word not in self.stop_words])
#         text = re.sub(r'\s+', ' ', text).strip()
#         return text
    
#     def process_list(self, text):
#         if pd.isna(text) or text == '':
#             return []
#         try:
#             items = literal_eval(text)
#             return [self.clean_text(item) for item in items]
#         except:
#             return [self.clean_text(item) for item in str(text).split(',')]

#     def fit_transform_text(self, texts, feature_name):
#         processed_texts = [' '.join(self.process_list(text)) if isinstance(text, str) else '' for text in texts]
        
#         # TF-IDF features
#         self.tfidf_models[feature_name] = TfidfVectorizer(
#             max_features=self.max_features,
#             ngram_range=(1, 2),
#             stop_words='english'
#         )
#         tfidf_matrix = self.tfidf_models[feature_name].fit_transform(processed_texts)
        
#         # Count features
#         self.count_models[feature_name] = CountVectorizer(
#             max_features=self.max_features//2,
#             ngram_range=(1, 2),
#             stop_words='english'
#         )
#         count_matrix = self.count_models[feature_name].fit_transform(processed_texts)
        
#         # Reduce dimensionality
#         self.svd_models[feature_name] = TruncatedSVD(n_components=50)
#         svd_matrix = self.svd_models[feature_name].fit_transform(tfidf_matrix)
        
#         return np.hstack([
#             tfidf_matrix.toarray(),
#             count_matrix.toarray(),
#             svd_matrix
#         ])

#     def transform_text(self, texts, feature_name):
#         processed_texts = [' '.join(self.process_list(text)) if isinstance(text, str) else '' for text in texts]
        
#         tfidf_matrix = self.tfidf_models[feature_name].transform(processed_texts)
#         count_matrix = self.count_models[feature_name].transform(processed_texts)
#         svd_matrix = self.svd_models[feature_name].transform(tfidf_matrix)
        
#         return np.hstack([
#             tfidf_matrix.toarray(),
#             count_matrix.toarray(),
#             svd_matrix
#         ])

# class FeatureEngineer:
#     def __init__(self):
#         self.text_processor = TextProcessor()
#         self.label_encoders = {}
#         self.scaler = StandardScaler()

#     def extract_required_experience(self, experience_required):
#         try:
#             # Match phrases like 'At least 1 year', '1 to 3 years', etc.
#             if "to" in experience_required:  # e.g., '1 to 3 years'
#                 match = re.search(r'(\d+)\s*to\s*(\d+)', experience_required)
#                 if match:
#                     min_years = int(match.group(1))
#                     # max_years = int(match.group(2))
#                     return min_years  # You can return min_years or max_years instead if needed
#             else:
#                 match = re.search(r'(\d+)\s*year', experience_required)  # e.g., 'At least 5 year(s)'
#                 if match:
#                     return int(match.group(1))
#         except Exception as e:
#             # print(f"Error extracting experience from '{experience_required}': {e}")
#             return 0
        
#     def extract_years_experience(self, row):
#         try:
#             start_years = [int(y) for y in re.findall(r'\d{4}', str(row['start_dates']))]
#             end_years = [int(y) for y in re.findall(r'\d{4}', str(row['end_dates']))]
#             if not end_years:
#                 end_years = [2024]  # Current year for ongoing positions
#             return sum(e - s for s, e in zip(start_years, end_years))
#         except:
#             return 0
    
#     def extract_education_level(self, degree):
#         if pd.isna(degree):
#             return 0
#         degree = str(degree).lower()
#         if 'phd' in degree or 'doctorate' in degree:
#             return 4
#         elif 'master' in degree:
#             return 3
#         elif 'bachelor' in degree or 'bsc' in degree or 'ba' in degree:
#             return 2
#         elif 'diploma' in degree or 'certificate' in degree:
#             return 1
#         return 0

#     def match_experience(self, df):
#         # Loop over each row in the DataFrame
#         for index, row in df.iterrows():
#             # Extract required and actual experience
#             required_experience = self.extract_required_experience(row['experiencere_requirement'])
#             actual_experience = self.extract_years_experience(row)  # Assuming this is a method in your class

#             # Update the 'experience_match' column based on comparison
#             if actual_experience >= required_experience:
#                 df.at[index, 'experience_match'] = 1  # Match
#             else:
#                 df.at[index, 'experience_match'] = 0  # Does not match

#     def transform(self, df, is_train=True):
#         feature_dict = {}
        
#         # Experience features
#         feature_dict['total_experience'] = df.apply(self.extract_years_experience, axis=1)
#         feature_dict['education_level'] = df['degree_names'].apply(self.extract_education_level)
#         feature_dict['num_companies'] = df['professional_company_names'].str.count(',').fillna(0) + 1
#         feature_dict['num_skills'] = df['skills'].str.count(',').fillna(0) + 1
#         feature_dict['has_certification'] = (~df['certification_skills'].isna()).astype(int)
        
#         # Process text features
#         text_features = ['skills', 'career_objective', 'responsibilities', 'educational_institution_name']
#         all_text_features = {}
        
#         for feature in text_features:
#             if is_train:
#                 text_matrix = self.text_processor.fit_transform_text(df[feature], feature)
#             else:
#                 text_matrix = self.text_processor.transform_text(df[feature], feature)
            
#             for i in range(text_matrix.shape[1]):
#                 all_text_features[f'{feature}_text_{i}'] = text_matrix[:, i]
        
#         # Skills matching scores
#         df['skills_required'] = df['skills_required'].fillna('')
#         df['skills'] = df['skills'].fillna('')
#         required_skills = df['skills_required'].apply(self.text_processor.process_list)
#         candidate_skills = df['skills'].apply(self.text_processor.process_list)
        
#         feature_dict['skills_match_ratio'] = [
#             len(set(req).intersection(set(cand))) / len(set(req)) if len(set(req)) > 0 else 0
#             for req, cand in zip(required_skills, candidate_skills)
#         ]

#         # required_experience = df['experience_requirement'].apply(self.extract_required_experience, axis=1)
#         # # candidate_experience = feature_dict['total_experience'].apply(self.text_processor.process_list)
#         # candidate_experience = feature_dict['total_experience'].apply(self.text_processor.process_list)
        
#         # feature_dict['experience_match_ratio'] = [
#         #     len(set(req).intersection(set(cand))) / len(set(req)) if len(set(req)) > 0 else 0
#         #     for req, cand in zip(required_experience, candidate_experience)
#         # ]

#         df['experience_match'] = 0
#         self.match_experience(df)
#         feature_dict['experience_match_ratio'] = df['experience_match']
        
#         # Convert features to DataFrame
#         feature_df = pd.DataFrame(feature_dict, index=df.index)
#         text_feature_df = pd.DataFrame(all_text_features, index=df.index)
        
#         # Scale numerical features
#         if is_train:
#             feature_df = pd.DataFrame(
#                 self.scaler.fit_transform(feature_df),
#                 columns=feature_df.columns,
#                 index=feature_df.index
#             )
#         else:
#             feature_df = pd.DataFrame(
#                 self.scaler.transform(feature_df),
#                 columns=feature_df.columns,
#                 index=feature_df.index
#             )
        
#         return pd.concat([feature_df, text_feature_df], axis=1)

# def train_model():
#     train_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')
#     test_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')

#     fe = FeatureEngineer()

#     print("Transforming train data...")
#     train_features = fe.transform(train_df, is_train=True)
#     print("Transforming test data...")
#     test_features = fe.transform(test_df, is_train=False)

#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     cv_scores = []
#     test_preds = np.zeros(len(test_df))

#     params = {
#         'objective': 'regression_l2',
#         'metric': 'l2',
#         'num_leaves': 50, #31
#         'learning_rate': 0.01,
#         'feature_fraction': 0.8, #0.8
#         'bagging_fraction': 0.8, #0.8
#         'bagging_freq': 8, #5
#         'reg_alpha': 0.1,
#         'reg_lambda': 0.1,
#         'min_child_samples': 30, #20,
#         'max_bin': 255,
#     }

#     for fold, (train_idx, val_idx) in enumerate(kf.split(train_features)):
#         print(f"Training fold {fold + 1}")
#         X_train = train_features.iloc[train_idx]
#         y_train = train_df.iloc[train_idx]['matched_score']
#         X_val = train_features.iloc[val_idx]
#         y_val = train_df.iloc[val_idx]['matched_score']

#         train_data = lgbm.Dataset(X_train, label=y_train)
#         val_data = lgbm.Dataset(X_val, label=y_val)
        
#         model = lgbm.train(
#             params,
#             train_data,
#             num_boost_round=2000,
#             valid_sets=[train_data, val_data],
#             callbacks=[
#                 lgbm.early_stopping(stopping_rounds=100),
#                 lgbm.log_evaluation(100)
#             ]
#         )

#         val_preds = model.predict(X_val)
#         fold_score = mean_squared_error(y_val, val_preds)
#         cv_scores.append(fold_score)
        
#         test_preds += model.predict(test_features) / kf.n_splits

#     print(f"CV MSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")

#     submission = pd.DataFrame({
#         'ID': test_df['ID'],
#         'matched_score': test_preds
#     })
#     submission.to_csv('submission.csv', index=False)
#     print("Submission saved to submission.csv")

In [18]:
class TextProcessor:
    def __init__(self, max_features=200):
        self.tfidf_models = {}
        self.count_models = {}
        self.svd_models = {}
        self.max_features = max_features

        # Initialize lemmatizer and stopwords
        self.stop_words = set(stopwords.words('english'))
        
        # Example for domain-specific abbreviations
        self.abbreviation_dict = {
            "etc": "et cetera",  # expanding abbreviations as an example
            "info": "information"
            # You can add more abbreviations or domain-specific terms here
        }

    def expand_abbreviations(self, text):
        # Replace abbreviations with full form
        for abbr, full_form in self.abbreviation_dict.items():
            text = text.replace(abbr, full_form)
        return text
    
    def clean_text(self, text):
        if pd.isna(text):
            return ''
        text = str(text).lower()
        # Expand abbreviations
        text = self.expand_abbreviations(text)
        text = re.sub(r'[^\w\s]', ' ', text)
        text = ' '.join([word for word in text.split() if word not in self.stop_words])
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    def process_list(self, text):
        if pd.isna(text) or text == '':
            return []
        try:
            items = literal_eval(text)
            return [self.clean_text(item) for item in items]
        except:
            return [self.clean_text(item) for item in str(text).split(',')]

    def fit_transform_text(self, texts, feature_name):
        processed_texts = [' '.join(self.process_list(text)) if isinstance(text, str) else '' for text in texts]
        
        # TF-IDF features
        self.tfidf_models[feature_name] = TfidfVectorizer(
            max_features=self.max_features,
            ngram_range=(1, 2),
            stop_words='english'
        )
        tfidf_matrix = self.tfidf_models[feature_name].fit_transform(processed_texts)
        
        # Count features
        self.count_models[feature_name] = CountVectorizer(
            max_features=self.max_features//2,
            ngram_range=(1, 2),
            stop_words='english'
        )
        count_matrix = self.count_models[feature_name].fit_transform(processed_texts)
        
        # Reduce dimensionality
        self.svd_models[feature_name] = TruncatedSVD(n_components=50)
        svd_matrix = self.svd_models[feature_name].fit_transform(tfidf_matrix)
        
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix
        ])

    def transform_text(self, texts, feature_name):
        processed_texts = [' '.join(self.process_list(text)) if isinstance(text, str) else '' for text in texts]
        
        tfidf_matrix = self.tfidf_models[feature_name].transform(processed_texts)
        count_matrix = self.count_models[feature_name].transform(processed_texts)
        svd_matrix = self.svd_models[feature_name].transform(tfidf_matrix)
        
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix
        ])

class FeatureEngineer:
    def __init__(self):
        self.text_processor = TextProcessor()
        self.label_encoders = {}
        self.scaler = StandardScaler()

    def extract_required_experience(self, experience_required):
        try:
            # Match phrases like 'At least 1 year', '1 to 3 years', etc.
            if "to" in experience_required:  # e.g., '1 to 3 years'
                match = re.search(r'(\d+)\s*to\s*(\d+)', experience_required)
                if match:
                    min_years = int(match.group(1))
                    # max_years = int(match.group(2))
                    return min_years  # You can return min_years or max_years instead if needed
            else:
                match = re.search(r'(\d+)\s*year', experience_required)  # e.g., 'At least 5 year(s)'
                if match:
                    return int(match.group(1))
        except Exception as e:
            # print(f"Error extracting experience from '{experience_required}': {e}")
            return 0
        
    def extract_years_experience(self, row):
        try:
            start_years = [int(y) for y in re.findall(r'\d{4}', str(row['start_dates']))]
            end_years = [int(y) for y in re.findall(r'\d{4}', str(row['end_dates']))]
            if not end_years:
                end_years = [2024]  # Current year for ongoing positions
            return sum(e - s for s, e in zip(start_years, end_years))
        except:
            return 0
    
    def extract_education_level(self, degree):
        if pd.isna(degree):
            return 0
        degree = str(degree).lower()
        if 'phd' in degree or 'doctorate' in degree:
            return 4
        elif 'master' in degree:
            return 3
        elif 'bachelor' in degree or 'bsc' in degree or 'ba' in degree:
            return 2
        elif 'diploma' in degree or 'certificate' in degree:
            return 1
        return 0

    def match_experience(self, df):
        # Loop over each row in the DataFrame
        for index, row in df.iterrows():
            # Extract required and actual experience
            required_experience = self.extract_required_experience(row['experiencere_requirement'])
            actual_experience = self.extract_years_experience(row)  # Assuming this is a method in your class

            # Update the 'experience_match' column based on comparison
            if actual_experience >= required_experience:
                df.at[index, 'experience_match'] = 1  # Match
            else:
                df.at[index, 'experience_match'] = 0  # Does not match

    def transform(self, df, is_train=True):
        feature_dict = {}
        
        # Experience features
        feature_dict['total_experience'] = df.apply(self.extract_years_experience, axis=1)
        feature_dict['education_level'] = df['degree_names'].apply(self.extract_education_level)
        feature_dict['num_companies'] = df['professional_company_names'].str.count(',').fillna(0) + 1
        feature_dict['num_skills'] = df['skills'].str.count(',').fillna(0) + 1
        feature_dict['has_certification'] = (~df['certification_skills'].isna()).astype(int)
        
        # Process text features
        text_features = ['skills', 'career_objective', 'responsibilities', 'educational_institution_name']
        all_text_features = {}
        
        for feature in text_features:
            if is_train:
                text_matrix = self.text_processor.fit_transform_text(df[feature], feature)
            else:
                text_matrix = self.text_processor.transform_text(df[feature], feature)
            
            for i in range(text_matrix.shape[1]):
                all_text_features[f'{feature}_text_{i}'] = text_matrix[:, i]
        
        # Skills matching scores
        df['skills_required'] = df['skills_required'].fillna('')
        df['skills'] = df['skills'].fillna('')
        required_skills = df['skills_required'].apply(self.text_processor.process_list)
        candidate_skills = df['skills'].apply(self.text_processor.process_list)
        
        feature_dict['skills_match_ratio'] = [
            len(set(req).intersection(set(cand))) / len(set(req)) if len(set(req)) > 0 else 0
            for req, cand in zip(required_skills, candidate_skills)
        ]

        # required_experience = df['experience_requirement'].apply(self.extract_required_experience, axis=1)
        # # candidate_experience = feature_dict['total_experience'].apply(self.text_processor.process_list)
        # candidate_experience = feature_dict['total_experience'].apply(self.text_processor.process_list)
        
        # feature_dict['experience_match_ratio'] = [
        #     len(set(req).intersection(set(cand))) / len(set(req)) if len(set(req)) > 0 else 0
        #     for req, cand in zip(required_experience, candidate_experience)
        # ]

        df['experience_match'] = 0
        self.match_experience(df)
        feature_dict['experience_match_ratio'] = df['experience_match']
        
        # Convert features to DataFrame
        feature_df = pd.DataFrame(feature_dict, index=df.index)
        text_feature_df = pd.DataFrame(all_text_features, index=df.index)
        
        # Scale numerical features
        if is_train:
            feature_df = pd.DataFrame(
                self.scaler.fit_transform(feature_df),
                columns=feature_df.columns,
                index=feature_df.index
            )
        else:
            feature_df = pd.DataFrame(
                self.scaler.transform(feature_df),
                columns=feature_df.columns,
                index=feature_df.index
            )
        
        return pd.concat([feature_df, text_feature_df], axis=1)
        
def train_model():
    train_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')
    test_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')

    fe = FeatureEngineer()

    print("Transforming train data...")
    train_features = fe.transform(train_df, is_train=True)
    print("Transforming test data...")
    test_features = fe.transform(test_df, is_train=False)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    test_preds = np.zeros(len(test_features))

    # SVR parameters
    svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_features)):
        print(f"Training fold {fold + 1}")

        # Split training and validation data
        X_train = train_features.iloc[train_idx]
        y_train = train_df.iloc[train_idx]['matched_score']
        X_val = train_features.iloc[val_idx]
        y_val = train_df.iloc[val_idx]['matched_score']

        # Train SVR
        svr_model.fit(X_train, y_train)

        # Predict on validation data
        val_preds = svr_model.predict(X_val)
        fold_score = mean_squared_error(y_val, val_preds)
        print(f"Fold {fold + 1} MSE: {fold_score}")
        cv_scores.append(fold_score)

        # Predict on test data and aggregate predictions
        test_preds += svr_model.predict(test_features) / kf.n_splits


    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'matched_score': test_preds
    })
    submission.to_csv('submission.csv', index=False)
    print("Submission saved to submission.csv")

In [19]:
if __name__ == "__main__":
    train_model()

Transforming train data...
Transforming test data...
Training fold 1
Fold 1 MSE: 0.011111402408248493
Training fold 2
Fold 2 MSE: 0.010748645408684539
Training fold 3
Fold 3 MSE: 0.010921822976660183
Training fold 4
Fold 4 MSE: 0.011138967756782381
Training fold 5
Fold 5 MSE: 0.010601109120486816
Submission saved to submission.csv
