### **PHASE 3: DATA PREPROCESSING**

In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder,OneHotEncoder, OrdinalEncoder, TargetEncoder



In [2]:
df = pd.read_csv("cleaned_easy_visa_data.csv")

In [3]:
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified


In [4]:
df.set_index("case_id",inplace=True)

In [5]:
df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified


#### **Checking for missing values**

In [6]:
# Check for missing values and duplicates
def check_for_missing_values_and_duplicates(df):
    print("\n1. Missing Values:")
    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print(missing_values[missing_values > 0])
    else:
        print("No missing values found (as expected from EDA)")

    # 2. Check for duplicates
    print("\n2. Duplicate Rows:")
    duplicates = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")
    if duplicates > 0:
        print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")

#### **Feature Engineering**

In [7]:
# 1. Getting the  current age of the company. This will be done by minusing the year of establishment from the currrent year.

Present_year = 2025

df['company_age'] = Present_year - df['yr_of_estab']

df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0


In [8]:
#2. Determining the employee's wage per year. We have to first calculate the amount the earn on a daily basis by using a standardize working hours.

hours_per_year = 2080
weeks_per_year = 52
months_per_year = 12

def standardize_wage(row):
    """Converts prevailing_wage to an annual standard based on the unit_of_wage."""
    unit = row['unit_of_wage']
    wage = row['prevailing_wage']
    if unit == 'Hour':
        return wage * hours_per_year
    elif unit == 'Week':
        return wage * weeks_per_year
    elif unit == 'Month':
        return wage * months_per_year
    elif unit == 'Year':
        return wage
    else:
        return np.nan
    
# Apply the function using the pandas 'apply' method
df['wage_per_year'] = df.apply(standardize_wage, axis=1)
df.head()



Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39


**Getting the ratios**

In [9]:
# Getting the ration of wage per employee
df['wage_per_employee_ratio'] = df['wage_per_year'] / df['no_of_employees']

df.head()


Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.441681
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.587749
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.019076
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.367653
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.546571


In [10]:
# Getting the employee growth rate ratio which is dividing the number of employees by the age of the company.

df['employees_growth_rate_ratio'] = df['no_of_employees'] / df['company_age']

df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.441681,401.5
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.587749,104.869565
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.019076,425.117647
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.367653,1.059459
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.546571,54.1


In [11]:
# Determining the wage per company age ratio.
df['wage_per_age_ratio'] = df['wage_per_year'] / df['company_age']

df.head()

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.441681,401.5,68432.335111
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.587749,104.869565,3627.202174
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.019076,425.117647,7235.109412
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.367653,1.059459,901.989514
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.546571,54.1,7495.3695


In [12]:
df['unit_of_wage'].unique()

array(['Hour', 'Year', 'Week', 'Month'], dtype=object)

#### **Encoding**

In [13]:
df.columns

Index(['continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status', 'company_age', 'wage_per_year',
       'wage_per_employee_ratio', 'employees_growth_rate_ratio',
       'wage_per_age_ratio'],
      dtype='object')

In [14]:
df['yr_of_estab'].unique()

array([2007. , 2002. , 2008. , 1932.5, 2005. , 2012. , 1994. , 1995. ,
       2004. , 1963. , 2006. , 1987. , 1991. , 2001. , 1972. , 2013. ,
       1968. , 1981. , 1997. , 2009. , 1998. , 2000. , 2010. , 1965. ,
       2011. , 1989. , 1933. , 1960. , 2003. , 1976. , 1996. , 1935. ,
       1999. , 1947. , 1939. , 1970. , 1977. , 1982. , 1943. , 1956. ,
       1974. , 1985. , 1984. , 1971. , 1969. , 1988. , 1944. , 1975. ,
       1966. , 1993. , 1992. , 1979. , 1986. , 1962. , 1954. , 1946. ,
       1950. , 2014. , 1980. , 1949. , 1961. , 1951. , 1958. , 1983. ,
       1948. , 1945. , 1978. , 1967. , 2015. , 1938. , 1973. , 1959. ,
       1990. , 1940. , 1934. , 1952. , 1953. , 2016. , 1937. , 1942. ,
       1964. , 1936. , 1941. , 1957. , 1955. ])

In [15]:
# Create year of establishment bins for proper encoding
bins = [df['yr_of_estab'].min(), 1950, 1981, 2001, 2011, df['yr_of_estab'].max()]
labels = ['Pre-1950', '1950-1980', '1981-2000', '2001-2010', 'Post-2010']
df['establishment_period'] = pd.cut(
    df['yr_of_estab'],
    bins=bins,
    labels=labels,
    include_lowest=True
)
df = df.drop('yr_of_estab', axis=1)

In [16]:
df['establishment_period'].value_counts()

establishment_period
1981-2000    9221
2001-2010    7359
Pre-1950     4100
1950-1980    3513
Post-2010    1287
Name: count, dtype: int64

In [17]:
df.shape

(25480, 16)

In [18]:
df['education_of_employee'].value_counts()

education_of_employee
Bachelor's     10234
Master's        9634
High School     3420
Doctorate       2192
Name: count, dtype: int64

In [19]:
# Ordinal Encoding

# Education of Employees
educational_mapping = {'High School': 1, 'Bachelor\'s': 2, 'Master\'s': 3, 'Doctorate': 4}

df['education_level_ordinal'] = df['education_of_employee'].map(educational_mapping)
df = df.drop('education_of_employee', axis=1)

# Establishment Year
establishment_period_mapping = {
    'Pre-1950': 5,
    '1950-1980': 4,
    '1981-2000': 3,
    '2001-2010': 2,
    'Post-2010': 1
}
df['establishment_period_ordinal'] = df['establishment_period'].map(establishment_period_mapping)
df = df.drop('establishment_period', axis=1)

In [20]:
df.head()

Unnamed: 0_level_0,continent,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio,education_level_ordinal,establishment_period_ordinal
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
EZYV01,Asia,N,N,7227.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.441681,401.5,68432.335111,1,2
EZYV02,Asia,Y,N,2412.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.587749,104.869565,3627.202174,3,2
EZYV03,Asia,N,Y,7227.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.019076,425.117647,7235.109412,2,2
EZYV04,Asia,N,N,98.0,West,83434.03,Year,Y,Denied,92.5,83434.03,851.367653,1.059459,901.989514,2,5
EZYV05,Africa,Y,N,1082.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.546571,54.1,7495.3695,3,2
