### **PHASE 3: DATA PREPROCESSING**

In [2]:
# importing the necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder,OneHotEncoder, OrdinalEncoder, TargetEncoder
from scipy.stats import skew


In [3]:
df = pd.read_csv("cleaned_easy_visa_data.csv")

In [4]:
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified


In [5]:
df.set_index("case_id",inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified


#### **Checking for missing values**

In [7]:
# Check for missing values and duplicates
def check_for_missing_value_and_duplictes(df):
    print("\n1. Missing Values:")
    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print(missing_values[missing_values > 0])
    else:
        print("No missing values found (as expected from EDA)")

    # 2. Check for duplicates
    print("\n2. Duplicate Rows:")
    duplicates = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")
    if duplicates > 0:
        print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")

#### **Feature Engineering**

In [8]:
# 1. Getting the  current age of the company. This will be done by minusing the year of establishment from the currrent year.

Present_year = 2025

df['company_age'] = Present_year - df['yr_of_estab']

df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0


In [9]:
#2. Determining the employee's wage per year. We have to first calculate the amount the earn on a daily basis by using a standardize working hours.

hours_per_year = 2080
weeks_per_year = 52
months_per_year = 12

def standardize_wage(row):
    """Converts prevailing_wage to an annual standard based on the unit_of_wage."""
    unit = row['unit_of_wage']
    wage = row['prevailing_wage']
    if unit == 'Hour':
        return wage * hours_per_year
    elif unit == 'Week':
        return wage * weeks_per_year
    elif unit == 'Month':
        return wage * months_per_year
    elif unit == 'Year':
        return wage
    else:
        return np.nan
    
# Apply the function using the pandas 'apply' method
df['wage_per_year'] = df.apply(standardize_wage, axis=1)
df.head()



Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39


**Getting the ratios**

In [10]:
#Getting the ratio of wage per employee
df['wage_per_employee_ratio'] = (df['wage_per_year'] / df['no_of_employees']).round(2)

df.head()


Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.44
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.59
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.02
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.37
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.55


In [11]:
# Getting the employee growth rate ratio which is dividing the number of employees by the age of the company.

df['employees_growth_rate_ratio'] = (df['no_of_employees'] / df['company_age']).round(2)

df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.44,401.5
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.59,104.87
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.02,425.12
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.37,1.06
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.55,54.1


In [12]:
# Determining the wage per company age ratio.
df['wage_per_age_ratio'] = (df['wage_per_year'] / df['company_age']).round(2)

df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.44,401.5,68432.34
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.59,104.87,3627.2
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.02,425.12,7235.11
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.37,1.06,901.99
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.55,54.1,7495.37


In [13]:
df['unit_of_wage'].unique()

array(['Hour', 'Year', 'Week', 'Month'], dtype=object)

#### **Encoding**

In [14]:
df.columns

Index(['continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status', 'company_age', 'wage_per_year',
       'wage_per_employee_ratio', 'employees_growth_rate_ratio',
       'wage_per_age_ratio'],
      dtype='object')

In [15]:
df['yr_of_estab'].unique()

array([2007. , 2002. , 2008. , 1932.5, 2005. , 2012. , 1994. , 1995. ,
       2004. , 1963. , 2006. , 1987. , 1991. , 2001. , 1972. , 2013. ,
       1968. , 1981. , 1997. , 2009. , 1998. , 2000. , 2010. , 1965. ,
       2011. , 1989. , 1933. , 1960. , 2003. , 1976. , 1996. , 1935. ,
       1999. , 1947. , 1939. , 1970. , 1977. , 1982. , 1943. , 1956. ,
       1974. , 1985. , 1984. , 1971. , 1969. , 1988. , 1944. , 1975. ,
       1966. , 1993. , 1992. , 1979. , 1986. , 1962. , 1954. , 1946. ,
       1950. , 2014. , 1980. , 1949. , 1961. , 1951. , 1958. , 1983. ,
       1948. , 1945. , 1978. , 1967. , 2015. , 1938. , 1973. , 1959. ,
       1990. , 1940. , 1934. , 1952. , 1953. , 2016. , 1937. , 1942. ,
       1964. , 1936. , 1941. , 1957. , 1955. ])

In [16]:
# Create year of establishment bins for proper encoding
bins = [df['yr_of_estab'].min(), 1950, 1981, 2001, 2011, df['yr_of_estab'].max()]
labels = ['Pre-1950', '1950-1980', '1981-2000', '2001-2010', 'Post-2010']
df['establishment_period'] = pd.cut(
    df['yr_of_estab'],
    bins=bins,
    labels=labels,
    include_lowest=True
)
df = df.drop('yr_of_estab', axis=1)

In [17]:
df['establishment_period'].value_counts()

establishment_period
1981-2000    9221
2001-2010    7359
Pre-1950     4100
1950-1980    3513
Post-2010    1287
Name: count, dtype: int64

In [18]:
df['education_of_employee'].value_counts()

education_of_employee
Bachelor's     10234
Master's        9634
High School     3420
Doctorate       2192
Name: count, dtype: int64

In [19]:
# Encode the target for calculation: 1 for 'Certified', 0 for 'Denied'
df['case_status_encoded'] = df['case_status'].apply(lambda x: 1 if x == 'Certified' else 0)
df = df.drop('case_status', axis=1)

In [20]:
# Ordinal Encoding
# ======================== EDUCATIONAL MAPPING USING ORDINAL ENCODER=================
# Education of Employees
educational_mapping = {'High School': 1, 'Bachelor\'s': 2, 'Master\'s': 3, 'Doctorate': 4}

df['education_level_ordinal'] = df['education_of_employee'].map(educational_mapping)
df = df.drop('education_of_employee', axis=1)

# ===================== ESTABLISHMENT PERIOD MAPPING USING ORDINAL ENCODER =======================
# Establishment Year
establishment_period_mapping = {
    'Pre-1950': 5,
    '1950-1980': 4,
    '1981-2000': 3,
    '2001-2010': 2,
    'Post-2010': 1
}
df['establishment_period_ordinal'] = df['establishment_period'].map(establishment_period_mapping)
df = df.drop('establishment_period', axis=1)

#======================== CONTINENT USING ONE HOT ENCODER ====================

continent_dummies = pd.get_dummies(df['continent'], prefix='continent', drop_first=False).astype('int')
df = pd.concat([df, continent_dummies], axis=1)
df = df.drop('continent', axis=1)



# Calculate the mean certification rate per region
region_cert_rate_map = df.groupby('region_of_employment')['case_status_encoded'].mean().round(2)
# Map the rate back to the new feature column
df['region_target_encoded'] = df['region_of_employment'].map(region_cert_rate_map)
df = df.drop('region_of_employment', axis=1)

# --- 6. Binary Encoding (Simple Mapping) ---
binary_map = {'Y': 1, 'N': 0}
df['has_job_experience_encoded'] = df['has_job_experience'].map(binary_map)
df['requires_job_training_encoded'] = df['requires_job_training'].map(binary_map)
df['full_time_position_encoded'] = df['full_time_position'].map(binary_map)
df = df.drop(['has_job_experience', 'requires_job_training', 'full_time_position'], axis=1)

In [21]:
df.head()

Unnamed: 0_level_0,no_of_employees,prevailing_wage,unit_of_wage,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio,case_status_encoded,education_level_ordinal,...,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,region_target_encoded,has_job_experience_encoded,requires_job_training_encoded,full_time_position_encoded
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EZYV01,7227.0,592.2029,Hour,18.0,1231782.032,170.44,401.5,68432.34,0,1,...,0,1,0,0,0,0,0.62,0,0,1
EZYV02,2412.0,83425.65,Year,23.0,83425.65,34.59,104.87,3627.2,1,3,...,0,1,0,0,0,0,0.63,1,0,1
EZYV03,7227.0,122996.86,Year,17.0,122996.86,17.02,425.12,7235.11,0,2,...,0,1,0,0,0,0,0.62,0,1,1
EZYV04,98.0,83434.03,Year,92.5,83434.03,851.37,1.06,901.99,0,2,...,0,1,0,0,0,0,0.62,0,0,1
EZYV05,1082.0,149907.39,Year,20.0,149907.39,138.55,54.1,7495.37,1,3,...,1,0,0,0,0,0,0.7,1,0,1


In [22]:
df.shape

(25480, 21)

In [23]:
check_for_missing_value_and_duplictes(df)


1. Missing Values:
No missing values found (as expected from EDA)

2. Duplicate Rows:
Number of duplicate rows: 0


### **Normalization using log transform**

In [24]:

def check_for_skewed_variables(df):
    print("\n3. Skewness Analysis (EDA identified right-skewed variables):")
    skewed_vars = ['prevailing_wage', 'no_of_employees']
    for var in skewed_vars:
        if var in df.columns:
            skewness = skew(df[var])
            print(f"{var}: skewness = {skewness:.3f} ({'right-skewed' if skewness > 0.5 else 'approximately normal'})")
check_for_skewed_variables(df)








3. Skewness Analysis (EDA identified right-skewed variables):
prevailing_wage: skewness = 0.547 (right-skewed)
no_of_employees: skewness = 0.959 (right-skewed)


In [25]:
# Log-transform skewed variables as recommended by EDA
print("=== LOG-TRANSFORMING SKEWED VARIABLES ===")
print("EDA identified these variables as right-skewed and recommended log transformation:")
# Variables to log-transform based on EDA findings
skewed_vars = ['prevailing_wage', 'no_of_employees']
for var in skewed_vars:
    if var in df.columns:
        # Check if variable has zero or negative values
        min_val = df[var].min()
        if min_val <= 0:
            # Use log1p for variables with zeros
            df[f'{var}_log'] = np.log1p(df[var])
            print(f"✓ {var}: Applied log1p transformation (had {min_val:.3f} minimum value)")
        else:
            # Use log for positive values only
            df[f'{var}_log'] = np.log(df[var])
            print(f"✓ {var}: Applied log transformation")
        # Check skewness before and after
        original_skew = skew(df[var])
        transformed_skew = skew(df[f'{var}_log'])
        print(f"  Original skewness: {original_skew:.3f} → Transformed skewness: {transformed_skew:.3f}")
print(f"\nDataset shape after log transformation: {df.shape}")
print("New log-transformed columns:", [col for col in df.columns if '_log' in col])


=== LOG-TRANSFORMING SKEWED VARIABLES ===
EDA identified these variables as right-skewed and recommended log transformation:
✓ prevailing_wage: Applied log transformation
  Original skewness: 0.547 → Transformed skewness: -2.142
✓ no_of_employees: Applied log1p transformation (had -26.000 minimum value)
  Original skewness: 0.959 → Transformed skewness: nan

Dataset shape after log transformation: (25480, 23)
New log-transformed columns: ['prevailing_wage_log', 'no_of_employees_log']


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [26]:
df = df[df['no_of_employees'] >= 0]


In [27]:
check_for_missing_value_and_duplictes(df)


1. Missing Values:
No missing values found (as expected from EDA)

2. Duplicate Rows:
Number of duplicate rows: 0


After log transformation, we realized we have negative log transformed rows, so in this case, we are dropping it

In [28]:
df.head()

Unnamed: 0_level_0,no_of_employees,prevailing_wage,unit_of_wage,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio,case_status_encoded,education_level_ordinal,...,continent_Europe,continent_North America,continent_Oceania,continent_South America,region_target_encoded,has_job_experience_encoded,requires_job_training_encoded,full_time_position_encoded,prevailing_wage_log,no_of_employees_log
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EZYV01,7227.0,592.2029,Hour,18.0,1231782.032,170.44,401.5,68432.34,0,1,...,0,0,0,0,0.62,0,0,1,6.383849,8.885718
EZYV02,2412.0,83425.65,Year,23.0,83425.65,34.59,104.87,3627.2,1,3,...,0,0,0,0,0.63,1,0,1,11.331711,7.788626
EZYV03,7227.0,122996.86,Year,17.0,122996.86,17.02,425.12,7235.11,0,2,...,0,0,0,0,0.62,0,1,1,11.719914,8.885718
EZYV04,98.0,83434.03,Year,92.5,83434.03,851.37,1.06,901.99,0,2,...,0,0,0,0,0.62,0,0,1,11.331812,4.59512
EZYV05,1082.0,149907.39,Year,20.0,149907.39,138.55,54.1,7495.37,1,3,...,0,0,0,0,0.7,1,0,1,11.917773,6.98749


#### **Feature Selection**

In [30]:
# Feature selection based on EDA correlation evidence
print("=== FEATURE SELECTION BASED ON EDA CORRELATION EVIDENCE ===")
# Separate features and target
X = df.drop('case_status_encoded', axis=1)
y = df['case_status_encoded']
print(f"Total features available: {X.shape[1]}")
# EDA-identified high-signal features (|correlation| > 0.2)
# There are no high-sgnal features with correlation > 0.2
# Engineered features
engineered_features = ['company_age', 'wage_per_year']
# EDA-identified low-signal features (|correlation| < 0.1)
low_signal_features = ['no_of_employees', 'prevailing_wage']
print(f"EDA-identified low-signal features: {low_signal_features}")
# Check which features are actually available
available_low_signal = [f for f in low_signal_features if f in X.columns]
print(f"Available low-signal features: {available_low_signal}")
# Create feature sets for evaluation
print("\n=== FEATURE SETS FOR EVALUATION ===")
# Set 1: All original features (excluding low-signal)
features_exclude_low_signal = [col for col in X.columns if col not in low_signal_features]
print(f"Set 1 - Exclude low-signal features: {len(features_exclude_low_signal)} features")
# Set 2: Engineered features
engineered = engineered_features + [col for col in X.columns if any(keyword in col.lower() for keyword in ['ratio', 'ordinal', 'encoded', 'continent', 'log'])]
print(f"Set 2 - Engineered: {len(engineered)} features")
# Set 3: All features (for comparison)
all_features = list(X.columns)
print(f"Set 3 - All features: {len(all_features)} features")
# Let's use Set 2 (high-signal + engineered) as recommended by EDA
selected_features = engineered
print(f"\n✓ Selected feature set: {len(selected_features)} features")
print("Selected features:", selected_features)


=== FEATURE SELECTION BASED ON EDA CORRELATION EVIDENCE ===
Total features available: 22
EDA-identified low-signal features: ['no_of_employees', 'prevailing_wage']
Available low-signal features: ['no_of_employees', 'prevailing_wage']

=== FEATURE SETS FOR EVALUATION ===
Set 1 - Exclude low-signal features: 20 features
Set 2 - Engineered: 19 features
Set 3 - All features: 22 features

✓ Selected feature set: 19 features
Selected features: ['company_age', 'wage_per_year', 'wage_per_employee_ratio', 'employees_growth_rate_ratio', 'wage_per_age_ratio', 'education_level_ordinal', 'establishment_period_ordinal', 'continent_Africa', 'continent_Asia', 'continent_Europe', 'continent_North America', 'continent_Oceania', 'continent_South America', 'region_target_encoded', 'has_job_experience_encoded', 'requires_job_training_encoded', 'full_time_position_encoded', 'prevailing_wage_log', 'no_of_employees_log']


In [31]:
df.to_csv("preprocessed_easy_visa.csv",index=False)