In [41]:
import pandas as pd
pd.__version__

'2.2.2'

In [42]:
# Load the EasyVisa dataset from a CSV file located relative to the working
# directory, then preview the first rows to check that it parsed as expected.
df=pd.read_csv("EasyVisa.csv")
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [43]:
# Dimensions of the loaded frame as (number of rows, number of columns)
df.shape

(25480, 12)

In [44]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the saved output reports min no_of_employees = -26, which cannot be a
# valid head count; no cell visible in this notebook corrects it -- confirm how
# negative values should be handled before modelling.
df.describe()

Unnamed: 0,no_of_employees,yr_of_estab,prevailing_wage
count,25480.0,25480.0,25480.0
mean,5667.04321,1979.409929,74455.814592
std,22877.928848,42.366929,52815.942327
min,-26.0,1800.0,2.1367
25%,1022.0,1976.0,34015.48
50%,2109.0,1997.0,70308.21
75%,3504.0,2005.0,107735.5125
max,602069.0,2016.0,319210.27


## Data Cleaning
* Null values
* Duplicates
* Check data types
* Understand data

### Handling Null (`NaN`) values

In [45]:
# Collect the names of all columns that contain at least one missing value
features_with_NAN = df.columns[df.isnull().any().to_numpy()].tolist()
features_with_NAN

[]

No null values found.

In [46]:
# Print the percentage of missing values for each feature that has any.
# The built-in round() is used instead of np.round(): NumPy is not imported in
# any cell visible in this notebook, so the original line would raise NameError
# as soon as features_with_NAN is non-empty.
for feature in features_with_NAN:
    # isnull().mean() is the fraction of missing rows; scale it to a percentage
    missing_percentage = round(float(df[feature].isnull().mean() * 100), 5)
    print(f"{feature} : {missing_percentage} % missing")

### Duplicate values

In [47]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): case_id is still part of the frame at this point; if it is unique
# per row this check can never report a duplicate -- consider repeating it after
# case_id has been dropped.
df.duplicated().sum()

0

No duplicate values found

### Removing Unrelated columns

In [48]:
# List the current column names before removing unrelated columns
df.columns

Index(['case_id', 'continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status'],
      dtype='object')

Removing `case_id`, as it doesn't contain any meaningful information and it is also not in our *database*.

In [49]:
# Remove the case_id column.
# drop() is used without inplace=True and with errors='ignore' so that re-running
# this cell after the column is already gone is a no-op instead of a KeyError.
df = df.drop(columns='case_id', errors='ignore')
df.columns

Index(['continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'yr_of_estab',
       'region_of_employment', 'prevailing_wage', 'unit_of_wage',
       'full_time_position', 'case_status'],
      dtype='object')

## Feature Engineering
### Feature Extraction

In [50]:
# Preview the frame again before deriving new features from it
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


`yr_of_estab` does not hold much meaningful information on its own, but it can be converted to `company_age`, which is a meaningful feature.

In [51]:
from datetime import date
# Current calendar year, read from the system clock at execution time.
# NOTE(review): this value changes with the run date (2024 in the saved output),
# so anything derived from it will differ between runs in different years --
# consider pinning a reference year if reproducibility matters.
present_year=date.today().year
present_year

2024

In [52]:
# Derive company_age (present_year minus year of establishment) and remove the
# raw yr_of_estab column it was computed from.
# The guard keeps the cell re-runnable: once yr_of_estab has been dropped, a second
# execution leaves df unchanged instead of raising KeyError. assign()/drop() return
# a new frame, so no inplace mutation is needed; company_age is still appended as
# the last column.
if 'yr_of_estab' in df.columns:
    df = (
        df.assign(company_age=present_year - df['yr_of_estab'])
          .drop(columns='yr_of_estab')
    )
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
0,Asia,High School,N,N,14513,West,592.2029,Hour,Y,Denied,17
1,Asia,Master's,Y,N,2412,Northeast,83425.65,Year,Y,Certified,22
2,Asia,Bachelor's,N,Y,44444,West,122996.86,Year,Y,Denied,16
3,Asia,Bachelor's,N,N,98,West,83434.03,Year,Y,Denied,127
4,Africa,Master's,Y,N,1082,South,149907.39,Year,Y,Certified,19


## Analysing features

### Numerical Features

**Note:** `O` stands for the `object` data type, which pandas typically uses for strings or categorical data. It essentially means `non-numerical` data.

In [53]:
# Every column whose dtype is not `object` is treated as a numerical feature
num_features = df.select_dtypes(exclude="object").columns.tolist()
print(f"Numerical feture Columns : {num_features}")
print(f"Number of Numnerical features:  {len(num_features)}")

Numerical feture Columns : ['no_of_employees', 'prevailing_wage', 'company_age']
Number of Numnerical features:  3


### Categorical Features

In [54]:
# Every column stored with the `object` dtype is treated as a categorical feature
cat_features = df.select_dtypes(include="object").columns.tolist()
print(f"Categorical Feature Columns: {cat_features}")
print(f"Number of Categorical Feature Columns: {len(cat_features)}")

Categorical Feature Columns: ['continent', 'education_of_employee', 'has_job_experience', 'requires_job_training', 'region_of_employment', 'unit_of_wage', 'full_time_position', 'case_status']
Number of Categorical Feature Columns: 8


### Discrete Features
Discrete features can only take specific, distinct values. These are typically counted and cannot take fractional values.

**Examples:**

* Number of children (e.g., 0, 1, 2, 3)
* Number of cars (e.g., 1, 2, 3)
* Shoe size (e.g., 7, 8, 9)
  
**Characteristics:**

* Values are typically represented with integers (whole numbers).
* Discrete features have gaps between values (e.g., you can’t have 1.5 children).
* The range of values can be finite or infinite, but the values are distinct.


In [58]:
# A numerical column counts as discrete when it has at most 25 distinct values
# (a missing value, if present, is counted as one distinct value).
disc_features = [
    col for col in num_features
    if df[col].nunique(dropna=False) <= 25
]
disc_features

[]

### Continuous Features

Continuous features can take any value within a given range, including decimal (fractional) values. These are typically measured and can take an infinite number of values.

**Examples:**

* Height (e.g., 5.72 feet)
* Weight (e.g., 72.5 kg)
* Temperature (e.g., 98.6°F)

**Characteristics:**

* Values are often represented with floating-point numbers.
* Continuous features have a range (min to max) but can theoretically take any value within that range.
* In theory, there are no gaps between values.

In [60]:
# Continuous features are the numerical features not already classified as discrete
contin_features = list(
    filter(lambda name: name not in disc_features, num_features)
)
print(f"Continious Feature Columns: {contin_features}")
print(f"Number of Continious Feature Columns: {len(contin_features)}")

Continious Feature Columns: ['no_of_employees', 'prevailing_wage', 'company_age']
Number of Continious Feature Columns: 3


**NOTE:**
  
* **Continuous features** are usually treated with *scaling* techniques (e.g., Min-Max scaling, Standardization).
* **Discrete features** might be treated as either *categorical* or *ordinal* depending on their context or binned into categories.

## Split DataFrame into Inputs and Outputs
* X = Inputs
* Y = Outputs
  

In [61]:
# List the columns available before separating inputs from the output
df.columns

Index(['continent', 'education_of_employee', 'has_job_experience',
       'requires_job_training', 'no_of_employees', 'region_of_employment',
       'prevailing_wage', 'unit_of_wage', 'full_time_position', 'case_status',
       'company_age'],
      dtype='object')

In [62]:
# Inputs: every column except the target; output: the case_status column alone
X = df.drop(columns=["case_status"])
Y = df["case_status"]

In [63]:
# Preview the input features (all columns of df except case_status)
X.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,company_age
0,Asia,High School,N,N,14513,West,592.2029,Hour,Y,17
1,Asia,Master's,Y,N,2412,Northeast,83425.65,Year,Y,22
2,Asia,Bachelor's,N,Y,44444,West,122996.86,Year,Y,16
3,Asia,Bachelor's,N,N,98,West,83434.03,Year,Y,127
4,Africa,Master's,Y,N,1082,South,149907.39,Year,Y,19


In [64]:
# Preview the output labels (the case_status column)
Y.head()

0       Denied
1    Certified
2       Denied
3       Denied
4    Certified
Name: case_status, dtype: object