# Data Pre-Processing

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

### Importing Data

In [2]:
df=pd.read_csv("EasyVisa.csv")

In [3]:
df.shape

(25480, 12)

In [4]:
df.columns

Index(['case_id', 'continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


## Data Cleaning
* Missing or Null values
* Duplicate Values
* Check Data types
* Understand Dataset

### Missing or Null values

In [8]:
features_with_na=[feature for feature in df.columns if df[feature].isnull().sum()>0]
features_with_na

[]

In [13]:
# Displaying the percentage of missing values for each feature
for feature in features_with_na:
    missing_percentage = np.round(df[feature].isnull().mean() * 100, 5)
    print(f"{feature}: {missing_percentage}% missing values")

**Observation:** No null values in the dataset

### Duplicate Values

In [9]:
df.duplicated().sum()

0

**Observation:** No duplicate values in the dataset

### Data Types

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


## Feature Engineering

In [12]:
df.columns

Index(['case_id', 'continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status'],
      dtype='object')

We are dropping the `case_id` column from the dataset because it is a unique identifier that does not provide any meaningful information for our analysis or model training. Unique identifiers like this don't contribute to predicting the target variable and could introduce noise into the model.

In [16]:
# Dropping 'case_id' as it does not provide any meaningful information for the analysis
df=df.drop('case_id',axis=1)
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [18]:
df.columns

Index(['continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status'],
      dtype='object')

The `yr_of_estab` (year of establishment) column represents the year a company was established, which is not directly useful for analysis. To make it more meaningful, we will convert it into `company_age` by subtracting the year of establishment from the current year. This transformation will give us the age of the company, which is more relevant for analysis and modeling.

In [24]:
from datetime import date

date_now=date.today()
current_year=date_now.year
current_year

2024

calculating the company age

In [25]:
df['company_age']=current_year-df['yr_of_estab']

In [27]:
df=df.drop('yr_of_estab',axis=1)
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
0,Asia,High School,N,N,14513,West,592.2029,Hour,Y,Denied,17
1,Asia,Master's,Y,N,2412,Northeast,83425.65,Year,Y,Certified,22
2,Asia,Bachelor's,N,Y,44444,West,122996.86,Year,Y,Denied,16
3,Asia,Bachelor's,N,N,98,West,83434.03,Year,Y,Denied,127
4,Africa,Master's,Y,N,1082,South,149907.39,Year,Y,Certified,19


## Analysing features

### Numerical Features
**Note:** `O`stands for `object` data type, which is typically used for strings or categorical data in pandas. It essentially means `non-numerical` data.



In [28]:
# Identifying numerical features (i.e., columns that are not of object type)
num_features = [col for col in df.columns if df[col].dtype != 'O']

# Displaying the numerical feature columns and their count
print(f"Numerical Feature Columns: {num_features}")
print(f"Number of Numerical Features: {len(num_features)}")


Numerical Feature Columns: ['no_of_employees', 'prevailing_wage', 'company_age']
Number of Numerical Features: 3


### Categorical Features

In [29]:
# Identifying categorical features (i.e., columns with object data type)
cat_features = [col for col in df.columns if df[col].dtype == 'O']

# Displaying the categorical feature columns and their count
print(f"Categorical Feature Columns: {cat_features}")
print(f"Number of Categorical Features: {len(cat_features)}")


Categorical Feature Columns: ['continent', 'education_of_employee', 'has_job_experience', 'requires_job_training', 'region_of_employment', 'unit_of_wage', 'full_time_position', 'case_status']
Number of Categorical Features: 8


### Discrete Features
Discrete features can only take specific, distinct values. These are typically counted and cannot take fractional values.

**Examples:**

* Number of children (e.g., 0, 1, 2, 3)
* Number of cars (e.g., 1, 2, 3)
* Shoe size (e.g., 7, 8, 9)
  
**Characteristics:**

* Values are typically represented with integers (whole numbers).
* Discrete features have gaps between values (e.g., you can’t have 1.5 children).
* The range of values can be finite or infinite, but the values are distinct.

In [30]:
# Identifying discrete numerical features (i.e., numerical features with 25 or fewer unique values)
disc_features = [col for col in num_features if df[col].nunique() <= 25]

# Displaying the discrete numerical feature columns
disc_features


[]

### Continious features

Continuous features can take any value within a given range, including decimal (fractional) values. These are typically measured and can take an infinite number of values.

**Examples:**

* Height (e.g., 5.72 feet)
* Weight (e.g., 72.5 kg)
* Temperature (e.g., 98.6°F)

**Characteristics:**

* Values are often represented with floating-point numbers.
* Continuous features have a range (min to max) but can theoretically take any value within that range.
* In theory, there are no gaps between values.

In [31]:
# Identifying continuous numerical features (i.e., numerical features that are not discrete)
continuous_features = [col for col in num_features if col not in disc_features]

# Displaying the number of continuous numerical features
print('Number of Continuous Features:', len(continuous_features))


Number of Continuous Features: 3


## Split DataFrame into Inputs and Outputs
In this step, we split the DataFrame into two variables:

* **X:** This variable contains the independent columns (features) that will be used to make predictions.
* **y:** This variable contains the dependent column, specifically the case_status column, which we aim to predict.

Before splitting, we encode the case_status values to numerical format, where:

* `denied` is encoded as 0
* `certified` is encoded as 1

This encoding is necessary because machine learning models cannot interpret string labels directly; they require numerical representations to perform calculations effectively.

In [32]:
df.columns

Index(['continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'region_of_employment',
       'prevailing_wage', 'unit_of_wage', 'full_time_position', 'case_status',
       'company_age'],
      dtype='object')

In [33]:
X=df.drop('case_status',axis=1)
Y=df['case_status']

In [34]:
X.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,company_age
0,Asia,High School,N,N,14513,West,592.2029,Hour,Y,17
1,Asia,Master's,Y,N,2412,Northeast,83425.65,Year,Y,22
2,Asia,Bachelor's,N,Y,44444,West,122996.86,Year,Y,16
3,Asia,Bachelor's,N,N,98,West,83434.03,Year,Y,127
4,Africa,Master's,Y,N,1082,South,149907.39,Year,Y,19


In [35]:
Y.head()

Unnamed: 0,case_status
0,Denied
1,Certified
2,Denied
3,Denied
4,Certified


In [39]:
# Encoding target column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Fit and transform the 'case_status' column and create a new column in the DataFrame
Y = label_encoder.fit_transform(df['case_status'])

# Verifying the encoding
Y[:5]


array([1, 0, 1, 1, 0])