In [25]:

import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

# Step 1: Load the dataset
file_path = '/kaggle/input/bitfest-datathon-2025/train.csv'
test_path = '/kaggle/input/bitfest-datathon-2025/test.csv'
data = pd.read_csv(file_path)
test = pd.read_csv(test_path)
# Step 2: Data Preprocessing
id_a = test.ID
# Drop unnecessary columns
data = data.drop(columns=["address", "online_links", "issue_dates", "expiry_dates"])
test = test.drop(columns=["address", "online_links", "issue_dates", "expiry_dates"])
# Drop rows where critical columns are missing
data = data.dropna(subset=["career_objective", "skills", "skills_required", "responsibilities.1", "matched_score"])
# test = data.dropna(subset=["career_objective", "skills", "skills_required", "responsibilities.1"])
# Convert matched_score to numeric
data["matched_score"] = pd.to_numeric(data["matched_score"])

# Initialize NLTK tools
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization and stopword removal
    return ' '.join(words)

# Apply text cleaning to the combined text
data["combined_text"] = (
    data["career_objective"].fillna("") + " " +
    data["skills"].fillna("") + " " +
    data["responsibilities.1"].fillna("") + " " +
    data["skills_required"].fillna("")
)
data["combined_text"] = data["combined_text"].apply(clean_text)

test["combined_text"] = (
    test["career_objective"].fillna("") + " " +
    test["skills"].fillna("") + " " +
    test["responsibilities.1"].fillna("") + " " +
    test["skills_required"].fillna("")
)
test["combined_text"] = test["combined_text"].apply(clean_text)
# Step 3: Extract Named Entities (Without Pre-trained NER)
# A simple keyword-based entity extraction (e.g., extracting skills or job titles)
# Here, we will extract specific keywords such as skills or job titles using keyword lists

# Define a list of common job titles and skills
job_titles = ['developer', 'manager', 'analyst', 'engineer', 'executive', 'designer', 'specialist']
skills = ['python', 'java', 'sql', 'machine learning', 'data analysis', 'deep learning', 'cloud', 'tensorflow', 'scikit-learn']

def extract_manual_entities(text):
    entities = []
    for word in text.split():
        if word in job_titles or word in skills:
            entities.append(word)
    return " ".join(entities)

data["manual_entities"] = data["combined_text"].apply(extract_manual_entities)
test["manual_entities"] = test["combined_text"].apply(extract_manual_entities)
# Step 4: Train-Test Split
X = data[["combined_text", "manual_entities"]]  # Combining TF-IDF features and manual entity features
y = data["matched_score"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)

# Step 5: Feature Extraction

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train["combined_text"])
X_test_tfidf = tfidf.transform(X_test["combined_text"])
test_tfidf = tfidf.transform(test["combined_text"])
# Manual entity-based features
X_train_manual = X_train["manual_entities"].apply(lambda x: len(x.split())).values.reshape(-1, 1)
X_test_manual = X_test["manual_entities"].apply(lambda x: len(x.split())).values.reshape(-1, 1)
test_manual = test["manual_entities"].apply(lambda x: len(x.split())).values.reshape(-1, 1)
# Combine TF-IDF and manual entity features
X_train_final = np.hstack([X_train_tfidf.toarray(), X_train_manual])
X_test_final = np.hstack([X_test_tfidf.toarray(), X_test_manual])
test_final = np.hstack([test_tfidf.toarray(), test_manual])
# Step 6: Word2Vec Embedding
# Word2Vec Model (Train on combined text)
sentences = [text.split() for text in X_train["combined_text"]]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


# Get average word2vec features for each text
def get_avg_word2vec(text, model, vector_size):
    words = text.split()
    feature_vec = np.zeros((vector_size,), dtype="float32")
    count = 0
    for word in words:
        if word in model.wv:
            feature_vec = np.add(feature_vec, model.wv[word])
            count += 1
    if count > 0:
        feature_vec = np.divide(feature_vec, count)
    return feature_vec

# Apply Word2Vec to each text
X_train_w2v = np.array([get_avg_word2vec(text, w2v_model, 100) for text in X_train["combined_text"]])
X_test_w2v = np.array([get_avg_word2vec(text, w2v_model, 100) for text in X_test["combined_text"]])
test_w2v = np.array([get_avg_word2vec(text, w2v_model, 100) for text in test["combined_text"]])
# Step 7: Combining Word2Vec with TF-IDF + Manual Entity Features
X_train_combined = np.hstack([X_train_final, X_train_w2v])
X_test_combined = np.hstack([X_test_final, X_test_w2v])
test_combined = np.hstack([test_final, test_w2v])
# Step 8: Model Training (without GridSearchCV)

# XGBoost Model (Using default hyperparameters)
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=500,          # Increase the number of trees
    learning_rate=0.05,        # Try a smaller learning rate
    max_depth=7,               # Limit the tree depth
    min_child_weight=10,       # Minimum sum of instance weight (hessian) in a child
    subsample=0.9,             # Subsample ratio
    colsample_bytree=0.8       # Subsample ratio for features
)

# LightGBM Model (Using default hyperparameters)
lgb_model = lgb.LGBMRegressor(
    objective='regression', 
    n_estimators=500,        # Increase the number of trees
    learning_rate=0.05,      # Try a smaller learning rate
    max_depth=7,             # Limit the tree depth
    min_child_samples=10,    # Minimum samples required to split
    num_leaves=31,           # Number of leaves in a tree
    subsample=0.9,           # Subsample ratio
    colsample_bytree=0.8     # Subsample ratio for features
)


# Fit models
xgb_model.fit(X_train_combined, y_train)
lgb_model.fit(X_train_combined, y_train)

# Step 9: Predictions and Evaluation

# Predict on the test set using XGBoost
y_pred_xgb = xgb_model.predict(X_test_combined)
y_pred_lgb = lgb_model.predict(X_test_combined)

# Evaluate performance
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

mse_lgb = mean_squared_error(y_test, y_pred_lgb)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

# Output results
print(f"XGBoost - Mean Squared Error: {mse_xgb}, Mean Absolute Error: {mae_xgb}, R2 Score: {r2_xgb}")
print(f"LightGBM - Mean Squared Error: {mse_lgb}, Mean Absolute Error: {mae_lgb}, R2 Score: {r2_lgb}")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150996
[LightGBM] [Info] Number of data points in the train set: 3087, number of used features: 5101
[LightGBM] [Info] Start training from score 0.639594
XGBoost - Mean Squared Error: 0.011916688303100107, Mean Absolute Error: 0.08758866213980102, R2 Score: -0.2268557317966664
LightGBM - Mean Squared Error: 0.011380647060314542, Mean Absolute Error: 0.09304126868199913, R2 Score: -0.17166881623223107


In [32]:
y_pred = lgb_model.predict(test_combined)


In [33]:
test.shape

(1909, 33)

In [34]:
y_pred

array([0.63973712, 0.6885425 , 0.58892333, ..., 0.59304603, 0.75073766,
       0.7241554 ])

In [40]:
y_pred

array([0.63973712, 0.6885425 , 0.58892333, ..., 0.59304603, 0.75073766,
       0.7241554 ])

In [36]:
id_a.shape

(1909,)

In [45]:
y_pred_flat = y_pred.flatten()

In [41]:
type(id_a)

pandas.core.series.Series

In [49]:
submission_labels = ["matched_score"]

sub = pd.DataFrame({
    "ID": id_a,
    "matched_score": y_pred_flat
})

In [47]:
sub.head()

Unnamed: 0,id,matched_score
0,1,0.639737
1,2,0.688542
2,3,0.588923
3,4,0.484315
4,5,0.618462
