In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import json
import joblib
# Preprocessing libraries
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
# Statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
url =("https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv")
Easy_Visa = pd.read_csv(url)
df_processed = Easy_Visa.copy()

In [5]:


def preprocessing(df_processe, target_col='case_status'):
  
    
    print("DATA QUALITY ANALYSIS REPORT ")
    print(f"Dataset shape: {df_processe.shape}")
    
    #  Missing Values
    print("\n. Missing Values:")
    missing_values = df_processe.isnull().sum()
    if missing_values.sum() > 0:
        print(missing_values[missing_values > 0])
    else:
        print(" No missing values found.")
    
    #  Duplicate Rows
    print("\n. Duplicate Rows:")
    duplicates = df_processe.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")
    if duplicates > 0:
        print(f"Percentage of duplicates: {(duplicates/len(df_processe))*100:.2f}%")
    else:
        print(" No duplicate rows found.")
    
    #  Skewness for Numerical Variables
    print("\n. Skewness Analysis (for numerical features):")
    num_cols = df_processe.select_dtypes(include=['int64', 'float64']).columns
    if len(num_cols) > 0:
        for col in num_cols:
            sk = skew(df_processe[col].dropna())
            label = "right-skewed" if sk > 0.5 else "approximately normal"
            print(f"{col}: skewness = {sk:.3f} ({label})")
    else:
        print("No numerical columns found.")
    
    #  Correlation Among Numerical Features
    print("\n. Correlation Analysis:")
    if len(num_cols) > 1:
        corr_matrix = df_processe[num_cols].corr(numeric_only=True)
        print(corr_matrix)
        
        # Optional: Correlation with target column if numeric
        if target_col in corr_matrix.columns:
            correlations = corr_matrix[target_col].sort_values(key=abs, ascending=False)
            print(f"\nTop correlated features with '{target_col}':")
            print(correlations[1:6])  # exclude target itself
    else:
        print("Not enough numerical columns for correlation analysis.")
    
    print("\n Data quality analysis completed successfully.")
    return None


In [6]:
df_processed = preprocessing(Easy_Visa)

DATA QUALITY ANALYSIS REPORT 
Dataset shape: (25480, 12)

. Missing Values:
 No missing values found.

. Duplicate Rows:
Number of duplicate rows: 0
 No duplicate rows found.

. Skewness Analysis (for numerical features):
no_of_employees: skewness = 12.265 (right-skewed)
yr_of_estab: skewness = -2.037 (approximately normal)
prevailing_wage: skewness = 0.756 (right-skewed)

. Correlation Analysis:
                 no_of_employees  yr_of_estab  prevailing_wage
no_of_employees         1.000000    -0.017770        -0.009523
yr_of_estab            -0.017770     1.000000         0.012342
prevailing_wage        -0.009523     0.012342         1.000000

 Data quality analysis completed successfully.


In [7]:
numerical_features = Easy_Visa.select_dtypes(include = ['int', 'float']).columns.tolist()
print(f"numerical features:\n {numerical_features}")

numerical features:
 ['no_of_employees', 'yr_of_estab', 'prevailing_wage']


In [8]:
categorical_features = Easy_Visa.select_dtypes(include = ['object']). columns.tolist()
print(f"categorical features:\n {categorical_features}")

categorical features:
 ['case_id', 'continent', 'education_of_employee', 'has_job_experience', 'requires_job_training', 'region_of_employment', 'unit_of_wage', 'full_time_position', 'case_status']


In [9]:

def preprocess_employee_data(df):
    """
    Preprocess the employee dataset by encoding categorical variables.
    - Binary categorical columns: Label encoded using custom mappings
    - Multi-category columns: One-hot encoded
    """
    # Label Encoding (binary columns)
    label_map = {
        'has_job_experience': {'Y': 1, 'N': 0},
        'requires_job_training': {'Y': 1, 'N': 0},
        'full_time_position': {'Y': 1, 'N': 0},
        'case_status': {'Certified': 1, 'Denied': 0}
    }
    for col, mapping in label_map.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)
    # One-hot Encoding (multi-category columns)
    onehot_cols = [
        'continent',
        'education_of_employee',
        'region_of_employment',
        'unit_of_wage'
    ]
    df = pd.get_dummies(df, columns=onehot_cols, drop_first=False, dtype=int)
    print("\nPreprocessing complete.")
    return df

In [10]:
df_processed = preprocess_employee_data(Easy_Visa)
print(df_processed.head())


Preprocessing complete.
  case_id  has_job_experience  requires_job_training  no_of_employees  \
0  EZYV01                   0                      0            14513   
1  EZYV02                   1                      0             2412   
2  EZYV03                   0                      1            44444   
3  EZYV04                   0                      0               98   
4  EZYV05                   1                      0             1082   

   yr_of_estab  prevailing_wage  full_time_position  case_status  \
0         2007         592.2029                   1            0   
1         2002       83425.6500                   1            1   
2         2008      122996.8600                   1            0   
3         1897       83434.0300                   1            0   
4         2005      149907.3900                   1            1   

   continent_Africa  continent_Asia  ...  education_of_employee_Master's  \
0                 0               1  ...           

In [11]:


def apply_log_transformation(df, skewed_vars):
    """
    Applies log or log1p transformation to reduce skewness in given numerical columns.
    
    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame containing the features.
    skewed_vars : list
        List of column names to apply log transformation on.
    
    Returns:
    --------
    df : pd.DataFrame
        Updated DataFrame with new log-transformed columns.
    """
    
    print("=== LOG TRANSFORMATION REPORT ===")
    
    for var in skewed_vars:
        if var in df.columns:
            min_val = df[var].min()
            
            # Handle zero or negative values safely
            if min_val <= 0:
                df[f'{var}_log'] = np.log1p(df[var])
                print(f" {var}: Applied log1p transformation (min={min_val:.3f})")
            else:
                df[f'{var}_log'] = np.log(df[var])
                print(f" {var}: Applied natural log transformation")
            
            # Compare skewness before and after
            original_skew = skew(df[var].dropna())
            transformed_skew = skew(df[f'{var}_log'].dropna())
            print(f"   Original skewness: {original_skew:.3f} â†’ Transformed skewness: {transformed_skew:.3f}\n")
        else:
            print(f" Column '{var}' not found in DataFrame.\n")
    
    print(f"Final dataset shape: {df.shape}")
    print("New log-transformed columns:", [col for col in df.columns if '_log' in col])
    print("=== Transformation Complete ===\n")
    
    return df


In [None]:
df_processed =  apply_log_transformation(df_processed, 

TypeError: apply_log_transformation() missing 1 required positional argument: 'skewed_vars'

In [None]:
# Outlier treatment based on EDA recommendations
print("OUTLIER TREATMENT (IQR-CAPPING METHOD)")
print("EDA recommended IQR-capping for extreme acidity/sulphates to preserve data points")

# Define numerical columns (excluding target)
numerical_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
if 'Case_Status' in numerical_cols:
    numerical_cols.remove('Case_Status')

    print(f"Treating outliers in {len(numerical_cols)} numerical features")

# Apply IQR-capping method
outliers_capped = 0
for col in numerical_cols:
    Q1 = df_processed[col].quantile(0.25)
    Q3 = df_processed[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Count outliers before capping
    outliers_before = ((df_processed[col] < lower_bound) | (df_processed[col] > upper_bound)).sum()
    
    if outliers_before > 0:
        # Cap outliers
        df_processed[col] = np.where(df_processed[col] < lower_bound, lower_bound, df_processed[col])
        df_processed[col] = np.where(df_processed[col] > upper_bound, upper_bound, df_processed[col])
        outliers_capped += outliers_before
        print(f" {col}: Capped {outliers_before} outliers")

print(f"\nTotal outliers capped: {outliers_capped}")
print(f"Dataset shape after outlier treatment: {df_processed.shape}")

OUTLIER TREATMENT (IQR-CAPPING METHOD)
EDA recommended IQR-capping for extreme acidity/sulphates to preserve data points


NameError: name 'df_processed' is not defined