In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the data
file_path = 'job_market.csv'
df = pd.read_csv(file_path)

# Preprocess the data
def clean_salary(salary):
    salary = re.sub(r'₹|,|a year|per month|per annum|lakh|crore', '', salary)
    salary = salary.strip()
    salary_range = re.findall(r'\d+', salary)
    if len(salary_range) == 2:
        return (float(salary_range[0]) + float(salary_range[1])) / 2
    elif len(salary_range) == 1:
        return float(salary_range[0])
    else:
        return np.nan

df['Salary'] = df['Salary'].apply(lambda x: clean_salary(str(x)))
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

# Encode categorical variables
df_encoded = pd.get_dummies(df, drop_first=True)

X = df_encoded.drop('Salary', axis=1)
y = df_encoded['Salary']

# Impute missing values
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []

# Perform K-Fold Cross-Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

print(f'Mean Squared Error for each fold: {mse_scores}')
print(f'Average Mean Squared Error: {np.mean(mse_scores)}')

Mean Squared Error for each fold: [545507072.7630258, 862933885.2569441, 2132933531.0026405, 7228505847.717662, 24940473921.085182]
Average Mean Squared Error: 7142070851.565091
