### Install and Import necessary packages

In [0]:
pip install ucimlrepo

Python interpreter will be restarted.
Collecting ucimlrepo
  Using cached ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Python interpreter will be restarted.


In [0]:
pip install lightgbm

Python interpreter will be restarted.
Collecting lightgbm
  Using cached lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.5.0
Python interpreter will be restarted.


In [0]:
pip install ctgan

Python interpreter will be restarted.
Collecting ctgan
  Using cached ctgan-0.10.2-py3-none-any.whl (23 kB)
Collecting rdt>=1.11.0
  Using cached rdt-1.13.2-py3-none-any.whl (66 kB)
Collecting tqdm<5,>=4.29
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting torch>=1.9.0
  Using cached torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl (906.5 MB)
Collecting Faker>=17
  Using cached Faker-33.3.1-py3-none-any.whl (1.9 MB)
Collecting triton==3.1.0
  Using cached triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
Collecting nvidia-nvtx-cu12==12.4.127
  Using cached nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (99 kB)
Collecting networkx
  Using cached networkx-3.2.1-py3-none-any.whl (1.6 MB)
Collecting nvidia-curand-cu12==10.3.5.147
  Using cached nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)
Collecting nvidia-cublas-cu12==12.4.5.8
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4

In [0]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm
from sklearn.metrics import classification_report
import ctgan



### Read the Income dataset from UCI Repository

In [0]:
  
# Fetch dataset id=2 from the UCI ML repository (this call needs network access).
adult = fetch_ucirepo(id=2) 
# Start the working frame from the feature columns.
df = pd.DataFrame(adult.data.features)
# Attach the target as the 'income' column.
# NOTE(review): assigning a DataFrame to a single column presumably relies on
# `adult.data.targets` holding exactly one column - confirm.
df['income'] = pd.DataFrame(adult.data.targets)
print(df.shape)
# Unseeded random sample: the three rows displayed differ from run to run.
df.sample(3)

(48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
642,21,Private,160968,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,40,United-States,<=50K
479,23,Private,197904,HS-grad,9,Never-married,Other-service,Unmarried,White,Female,0,0,35,United-States,<=50K
23650,55,Private,82098,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,55,United-States,<=50K


### Basic formatting and minor fixes to the dataset

In [0]:
# Drop the textual 'education' column; the numeric 'education-num' column is kept.
# errors='ignore' makes this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns='education', errors='ignore')
# Strip leading/trailing whitespace from every string (object-dtype) column;
# other columns are passed through untouched.
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
# Remove trailing '.' characters from the income labels (presumably some labels
# carry a trailing period, which would otherwise create duplicate classes -
# confirm against the raw data). Non-string values such as NaN are left as-is.
df['income'] = df['income'].apply(lambda label: label.rstrip('.') if isinstance(label, str) else label)

### Define a function for binning Numerical features

In [0]:
def bin_column(df,ignore_cols,b):
    """Replace numeric columns of ``df`` with interval bins, in place.

    Every column selected by the int64/float64 dtype filter is passed through
    ``pd.cut`` with ``bins=b`` and written back as an object-dtype column,
    unless its name appears in ``ignore_cols``. One progress line is printed
    per numeric column, whether it is binned or skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated directly.
    ignore_cols : container of column names
        Numeric columns that must be left untouched.
    b : int or sequence
        Forwarded unchanged to ``pd.cut`` as ``bins``.

    Returns
    -------
    None
    """
    numeric_names = df.select_dtypes(['int64','float64','double']).columns
    for name in numeric_names:
        if name in ignore_cols:
            print(f'Skipping the {name} feature')
            continue
        print(f'Binning the {name} feature')
        # pd.cut yields a categorical; cast to plain objects so the column is
        # picked up by object-dtype selection (e.g. in encode_column).
        df[name] = pd.cut(df[name], bins=b).astype(object)

In [0]:
bin_column(df,['education-num'],4)

Binning the age feature
Binning the fnlwgt feature
Skipping the education-num feature
Binning the capital-gain feature
Binning the capital-loss feature
Binning the hours-per-week feature


### Define a function to perform label encoding and print the corresponding mapping for future reference

In [0]:
def encode_column(df):
    """Label-encode every object-dtype column of ``df`` in place.

    For each such column a ``LabelEncoder`` is fitted on the column, the
    column is overwritten with its integer codes, and the
    ``{original value: code}`` mapping is printed. Nothing is returned, so the
    printed mapping is the only record of how codes relate to the original
    values.
    """
    encoder = LabelEncoder()  # a single instance, refitted for each column
    for name in df.select_dtypes('object').columns:
        print(f'Encoding the {name} feature')
        encoder.fit(df[name])
        labels = encoder.classes_
        mapping = {label: code for label, code in zip(labels, encoder.transform(labels))}
        df[name] = encoder.transform(df[name])
        print(mapping)

In [0]:
encode_column(df)

Encoding the age feature
{Interval(16.927, 35.25, closed='right'): 0, Interval(35.25, 53.5, closed='right'): 1, Interval(53.5, 71.75, closed='right'): 2, Interval(71.75, 90.0, closed='right'): 3}
Encoding the workclass feature
{'?': 0, 'Federal-gov': 1, 'Local-gov': 2, 'Never-worked': 3, 'Private': 4, 'Self-emp-inc': 5, 'Self-emp-not-inc': 6, 'State-gov': 7, 'Without-pay': 8, nan: 9}
Encoding the fnlwgt feature
{Interval(10806.885, 381813.75, closed='right'): 0, Interval(381813.75, 751342.5, closed='right'): 1, Interval(751342.5, 1120871.25, closed='right'): 2, Interval(1120871.25, 1490400.0, closed='right'): 3}
Encoding the marital-status feature
{'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}
Encoding the occupation feature
{'?': 0, 'Adm-clerical': 1, 'Armed-Forces': 2, 'Craft-repair': 3, 'Exec-managerial': 4, 'Farming-fishing': 5, 'Handlers-cleaners': 6, 'Machine-op-inspct': 7, 'Other-serv

### Define the features and target, and perform the train-test split

In [0]:
# 'income' is the target; every other column is a predictor.
y = df['income']
x = df.drop(columns='income')
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.75,
    random_state=42,
)

### Train a LightGBM Classifier and return the feature importance

In [0]:
# Fit a LightGBM classifier with default hyper-parameters on the training split.
lgb = lightgbm.LGBMClassifier()
lgb.fit(xtrain, ytrain)
# Pair each feature name with the fitted model's importance value and rank
# them from most to least important.
gen_feature_imp = (
    pd.DataFrame({'Value': lgb.feature_importances_, 'Feature': x.columns})
    .sort_values(by='Value', ascending=False)
)
gen_feature_imp

[LightGBM] [Info] Number of positive: 8724, number of negative: 27907
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124
[LightGBM] [Info] Number of data points in the train set: 36631, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.238159 -> initscore=-1.162800
[LightGBM] [Info] Start training from score -1.162800


Unnamed: 0,Value,Feature
5,523,occupation
3,502,education-num
1,305,workclass
12,279,native-country
11,250,hours-per-week
0,234,age
6,230,relationship
4,205,marital-status
10,150,capital-loss
9,102,capital-gain


### Score the test data: build a `scores` DataFrame holding both the predicted labels and the class probabilities

In [0]:
# Work on a copy so the held-out feature frame itself is not modified.
scores = xtest.copy()
scores['income'] = ytest
# Run predict_proba once and slice both class columns from the same result,
# instead of scoring the whole test set twice.
# NOTE(review): column 0 / column 1 are presumably the encoded '<=50K' / '>50K'
# classes - confirm against lgb.classes_.
class_proba = lgb.predict_proba(xtest)
scores['p_<=50k'] = class_proba[:, 0]
scores['p_>50k'] = class_proba[:, 1]
scores['predicted_income'] = lgb.predict(xtest)
print(scores.shape)
scores.head()

(12211, 17)


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,p_<=50k,p_>50k,predicted_income
7762,0,4,1,9,4,8,1,4,1,0,0,0,39,0,0.997192,0.002808,0
23881,0,4,0,8,4,12,3,4,0,0,0,0,39,0,0.998028,0.001972,0
30507,0,2,0,9,4,6,2,2,1,0,0,1,39,0,0.995933,0.004067,0
28911,0,4,0,10,4,12,3,4,0,0,0,1,39,0,0.991473,0.008527,0
19484,1,4,0,9,4,7,4,0,1,0,0,2,33,0,0.980556,0.019444,0


In [0]:
print(classification_report(scores['income'],scores['predicted_income']))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      9248
           1       0.72      0.56      0.63      2963

    accuracy                           0.84     12211
   macro avg       0.79      0.75      0.77     12211
weighted avg       0.83      0.84      0.83     12211



### Define a function for cutoff-based predictions at evenly spaced thresholds set by the number of bands

In [0]:
def cut_off_based_prediction(df, target, num_of_bands):
    """Add one 0/1 prediction column per evenly spaced cutoff.

    For each ``score`` in ``1 .. num_of_bands - 1`` the cutoff is
    ``score / num_of_bands``. A column named ``f'cutoff_{cutoff}'`` is added,
    holding 1 where ``df[target] <= cutoff`` and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``target`` column. It is modified in place, and
        later cells read the new columns from the original frame.
    target : str
        Name of the column compared against each cutoff.
    num_of_bands : int
        Number of equal-width bands; produces ``num_of_bands - 1`` cutoffs.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cutoff columns added.
    """
    for score in range(1, num_of_bands):
        # Derive the threshold from num_of_bands so it always matches the
        # column label. It was previously fixed at score / 10, which was only
        # correct when num_of_bands == 10.
        cutoff = score / num_of_bands
        # Vectorised comparison over the whole column.
        df[f'cutoff_{cutoff}'] = (cutoff >= df[target]).astype(int)
    return df


In [0]:
scores_final = cut_off_based_prediction(scores,'p_<=50k',10)
scores_final.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,predicted_income,cutoff_0.1,cutoff_0.2,cutoff_0.3,cutoff_0.4,cutoff_0.5,cutoff_0.6,cutoff_0.7,cutoff_0.8,cutoff_0.9
4393,0,0,0,10,4,0,3,4,1,0,...,0,0,0,0,0,0,0,0,0,0
11499,3,4,0,4,6,8,1,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4248,1,4,0,7,4,8,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
36957,0,4,0,6,2,3,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0
46677,2,1,0,14,0,10,1,4,1,0,...,0,0,0,0,0,0,1,1,1,1


In [0]:
# Check if default cutoff is 0.5 
are_equal = (scores_final['predicted_income'] == scores_final['cutoff_0.5']).all()
print(are_equal)

True


## Model Optimisation
### 1. Cutoff-based Optimisation


In [0]:

print(classification_report(scores[scores['native-country'] == 19]['income'],scores[scores['native-country'] == 19]['predicted_income']))

              precision    recall  f1-score   support

           0       0.77      0.68      0.72        25
           1       0.53      0.64      0.58        14

    accuracy                           0.67        39
   macro avg       0.65      0.66      0.65        39
weighted avg       0.69      0.67      0.67        39



In [0]:

print(classification_report(scores[scores['native-country'] == 19]['income'],scores[scores['native-country'] == 19]['cutoff_0.6']))

              precision    recall  f1-score   support

           0       0.84      0.64      0.73        25
           1       0.55      0.79      0.65        14

    accuracy                           0.69        39
   macro avg       0.70      0.71      0.69        39
weighted avg       0.74      0.69      0.70        39



### 2. Building Specialised Models

In [0]:
in_data = df[df['native-country'] == 19]#India
print(in_data.shape)
in_data.head()

(151, 14)


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
11,0,7,0,13,2,10,0,1,1,0,0,1,19,1
892,0,4,0,11,2,1,5,1,0,0,0,1,19,0
968,1,4,0,13,2,4,0,1,1,0,0,1,19,1
1029,1,5,0,14,3,12,1,1,1,0,0,1,19,0
1095,0,6,0,13,4,10,3,1,1,0,0,0,19,0


In [0]:
# Train a CTGAN synthesizer on the in_data subset.
model = ctgan.CTGAN(verbose=True,batch_size=200,epochs=1700)
# Seed the synthesizer so training and the later model.sample() call are
# repeatable from run to run (as far as the compute backend allows).
model.set_random_state(42)
# Every column of in_data is passed as a discrete column; deriving the list
# from the frame keeps it in sync with the columns instead of repeating the
# names by hand.
categorical_columns = list(in_data.columns)
model.fit(in_data,categorical_columns)

  0%|          | 0/1700 [00:00<?, ?it/s]Gen. (0.00) | Discrim. (0.00):   0%|          | 0/1700 [00:00<?, ?it/s]Gen. (1.38) | Discrim. (0.01):   0%|          | 0/1700 [00:00<?, ?it/s]Gen. (1.35) | Discrim. (-0.03):   0%|          | 0/1700 [00:00<?, ?it/s]Gen. (1.30) | Discrim. (-0.05):   0%|          | 0/1700 [00:00<?, ?it/s]Gen. (1.30) | Discrim. (-0.05):   0%|          | 3/1700 [00:00<01:17, 21.76it/s]Gen. (1.31) | Discrim. (-0.10):   0%|          | 3/1700 [00:00<01:17, 21.76it/s]Gen. (1.25) | Discrim. (-0.12):   0%|          | 3/1700 [00:00<01:17, 21.76it/s]Gen. (1.27) | Discrim. (-0.15):   0%|          | 3/1700 [00:00<01:17, 21.76it/s]Gen. (1.27) | Discrim. (-0.15):   0%|          | 6/1700 [00:00<01:17, 21.82it/s]Gen. (1.22) | Discrim. (-0.18):   0%|          | 6/1700 [00:00<01:17, 21.82it/s]Gen. (1.22) | Discrim. (-0.23):   0%|          | 6/1700 [00:00<01:17, 21.82it/s]Gen. (1.32) | Discrim. (-0.24):   0%|          | 6/1700 [00:00<01:17, 21.82it/s]Gen. (1.32) | Discri

In [0]:
in_syn_data = model.sample(10000)
in_syn_data.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,1,4,0,15,3,10,3,1,1,2,0,0,19,0
1,1,4,0,14,2,10,1,1,0,0,0,0,19,0
2,0,5,0,16,2,12,1,1,1,0,0,2,19,1
3,0,7,0,13,4,10,1,3,1,0,1,1,19,0
4,0,4,0,10,4,14,3,1,0,0,0,0,19,0


In [0]:
in_data['income'].value_counts()

Out[52]: 0    89
1    62
Name: income, dtype: int64

In [0]:
in_syn_data['income'].value_counts()

Out[53]: 0    5744
1    4256
Name: income, dtype: int64

In [0]:
in_df = pd.concat([in_data,in_syn_data],ignore_index=True)
print(in_df.shape)
in_df.head()

(10151, 14)


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,7,0,13,2,10,0,1,1,0,0,1,19,1
1,0,4,0,11,2,1,5,1,0,0,0,1,19,0
2,1,4,0,13,2,4,0,1,1,0,0,1,19,1
3,1,5,0,14,3,12,1,1,1,0,0,1,19,0
4,0,6,0,13,4,10,3,1,1,0,0,0,19,0


In [0]:
# 'income' is the target; every other column is a predictor.
y = in_df['income']
x = in_df.drop(columns='income')
# in_df mixes original and synthetic rows, so both the training and the
# held-out 25% split contain synthetic samples.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.75,
    random_state=42,
)

In [0]:
# Fit a fresh LightGBM classifier with default hyper-parameters; this rebinds
# `lgb`, replacing the model trained earlier in the notebook.
lgb = lightgbm.LGBMClassifier()
lgb.fit(xtrain, ytrain)
# Pair each feature name with the fitted model's importance value and rank
# them from most to least important.
gen_feature_imp = (
    pd.DataFrame({'Value': lgb.feature_importances_, 'Feature': x.columns})
    .sort_values(by='Value', ascending=False)
)
gen_feature_imp

[LightGBM] [Info] Number of positive: 3235, number of negative: 4378
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 7613, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.424931 -> initscore=-0.302563
[LightGBM] [Info] Start training from score -0.302563


Unnamed: 0,Value,Feature
5,687,occupation
3,581,education-num
4,320,marital-status
6,311,relationship
1,280,workclass
0,219,age
11,199,hours-per-week
7,171,race
10,97,capital-loss
8,70,sex


In [0]:
# Work on a copy so the held-out feature frame itself is not modified.
scores = xtest.copy()
scores['income'] = ytest
# Run predict_proba once and slice both class columns from the same result,
# instead of scoring the whole test set twice.
# NOTE(review): column 0 / column 1 are presumably the encoded '<=50K' / '>50K'
# classes - confirm against lgb.classes_.
class_proba = lgb.predict_proba(xtest)
scores['p_<=50k'] = class_proba[:, 0]
scores['p_>50k'] = class_proba[:, 1]
scores['predicted_income'] = lgb.predict(xtest)
print(scores.shape)
scores.head()

(2538, 17)


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,p_<=50k,p_>50k,predicted_income
621,1,4,0,16,3,4,1,1,1,0,0,1,19,0,0.652905,0.347095,0
4404,2,4,0,13,2,4,0,1,1,0,0,1,19,0,0.259771,0.740229,1
6795,0,4,1,10,2,12,0,1,1,0,0,1,19,0,0.828721,0.171279,0
4607,1,7,0,15,2,10,0,1,1,0,1,1,19,1,0.020457,0.979543,1
8165,0,7,0,10,5,7,1,3,1,0,0,0,19,0,0.987351,0.012649,0


In [0]:
print(classification_report(scores['income'],scores['predicted_income']))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1455
           1       0.76      0.78      0.77      1083

    accuracy                           0.80      2538
   macro avg       0.80      0.80      0.80      2538
weighted avg       0.80      0.80      0.80      2538

