In [9]:
# main.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


In [10]:
data=pd.read_csv(r"R:\Internship #job  IT\IBM edunet Foundation\adult 3.csv")
data.head(15)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [11]:
data.shape

(48842, 15)

In [12]:
data.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [13]:
print(data.workclass.value_counts())

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64


In [14]:

data.workclass.replace({'?':'Others'},inplace=True)
print('Workclass')
print(data['workclass'].value_counts())

Workclass
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
Others               2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.workclass.replace({'?':'Others'},inplace=True)


In [15]:
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,Others,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [16]:
print(data['workclass'].value_counts())

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
Others               2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64


In [17]:
# in the above code 'without-pay and 'never-worked' having very less count as well as they are not earning
# so not required for income prediction
# so we can remove such data
data=data[data['workclass']!='Without-pay']
data=data[data['workclass']!='Never-worked']

In [18]:
print(data['workclass'].value_counts())

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
Others               2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Name: count, dtype: int64


In [19]:
data.education.value_counts()

education
HS-grad         15768
Some-college    10873
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1809
Assoc-acdm       1599
10th             1387
7th-8th           952
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

In [20]:
# here also we can eliminate some categories like '1st-4th','5th-6th','Preschool'
data=data[data['education']!='1st-4th']
data=data[data['education']!='5th-6th']
data=data[data['education']!='Preschool']

In [21]:
data.education.value_counts()

education
HS-grad         15768
Some-college    10873
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1809
Assoc-acdm       1599
10th             1387
7th-8th           952
Prof-school       834
9th               756
12th              657
Doctorate         594
Name: count, dtype: int64

In [22]:
data.shape

(47972, 15)

In [23]:
# This checks if the column exists before trying to drop it
if 'education' in data.columns:
    data.drop(columns=['education'], inplace=True)

In [24]:
data

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,Others,103497,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [25]:
data.occupation.replace({'?':'Notlisted'},inplace=True)
print('occupation')
print(data['occupation'].value_counts())

occupation
occupation
Prof-specialty       6165
Exec-managerial      6072
Craft-repair         6006
Adm-clerical         5591
Sales                5476
Other-service        4746
Machine-op-inspct    2877
Notlisted            2730
Transport-moving     2302
Handlers-cleaners    1980
Tech-support         1445
Farming-fishing      1380
Protective-serv       981
Priv-house-serv       206
Armed-Forces           15
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.occupation.replace({'?':'Notlisted'},inplace=True)


In [26]:
data.shape

(47972, 14)

In [27]:
data

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,Others,103497,10,Never-married,Notlisted,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [28]:
# spliting data in to X(independent variable) and Y (dependent variable)
X=data.drop(columns=['income'])
Y=data['income']
X

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,Others,103497,10,Never-married,Notlisted,Own-child,White,Female,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
48838,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
48839,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
48840,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [29]:
# as we know the algorithm is a mathematical expression so it cannot work on text data
# so we have to convert text data collumns in to numerical for that we will use encoder

In [31]:
# 1. Separate features (X) and target (Y)
X = data.drop('income', axis=1)
Y = data['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

In [32]:
Y

0        0
1        0
2        1
3        1
4        0
        ..
48837    0
48838    1
48839    0
48840    0
48841    1
Name: income, Length: 47972, dtype: int64

In [33]:
# 2. Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=np.number).columns

In [34]:
# 3. Create a preprocessor using a ColumnTransformer
# This is a more robust method than applying encoders one by one.
# It uses OneHotEncoder for categorical data, which is better than LabelEncoder here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


In [35]:
# 4. Split the data for training and testing
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)


In [36]:
print("Preprocessing pipeline created and data is split.")

Preprocessing pipeline created and data is split.


In [37]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=23,stratify=Y)
# here random_state=True suggest that the records getting choosen for trainning
# and testing will be in random order
# statify=Y we use only for classification task
# statify maintans the catagorical ratio in output column(Y) while getting choosen for trainning and testing

In [38]:
# 1. Create the full pipeline
# This pipeline first runs the preprocessor, then the classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', GradientBoostingClassifier(random_state=42))])

In [39]:
# 2. Define the parameter grid to search
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 4]
}

In [40]:
# 3. Set up and run the GridSearchCV
# This will find the best combination of parameters from your grid.
print("--- Starting GridSearchCV to find the best model... (This may take a few minutes) ---")
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(xtrain, ytrain)


--- Starting GridSearchCV to find the best model... (This may take a few minutes) ---


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__learning_rate': [0.05, 0.1], 'classifier__max_depth': [3, 4], 'classifier__n_estimators': [100, 200]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [41]:
# 4. Evaluate the best model found
print("\n--- Evaluation ---")
print(f"Best parameters found: {grid_search.best_params_}")



--- Evaluation ---
Best parameters found: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 4, 'classifier__n_estimators': 200}


In [42]:
best_model = grid_search.best_estimator_
ypred = best_model.predict(xtest)
final_accuracy = accuracy_score(ytest, ypred)


In [43]:
print(f"\nFinal Accuracy on the test set: {final_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(ytest, ypred))


Final Accuracy on the test set: 0.8719

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7265
           1       0.80      0.64      0.71      2330

    accuracy                           0.87      9595
   macro avg       0.84      0.79      0.81      9595
weighted avg       0.87      0.87      0.87      9595



In [44]:
# 5. Save the final, tuned model
print("\n--- Saving the model ---")
joblib.dump(best_model,'final_salary_prediction_model.pkl')
print("Model saved successfully as 'final_salary_prediction_model.pkl'")


--- Saving the model ---
Model saved successfully as 'final_salary_prediction_model.pkl'


In [45]:
model = joblib.load('final_salary_prediction_model.pkl')

In [57]:
#1st sample for Prediction the employee salary 

In [46]:
employee_data = {
    'age': 45,
    'workclass': 'Private',
    'fnlwgt': 200000,
    'educational-num': 14, # 14 = Masters
    'marital-status': 'Married-civ-spouse',
    'occupation': 'Prof-specialty',
    'relationship': 'Husband',
    'race': 'White',
    'gender': 'Male',
    'capital-gain': 0,
    'capital-loss': 0,
    'hours-per-week': 50,
    'native-country': 'United-States'}

In [49]:
new_data_df = pd.DataFrame([employee_data])
print("\n--- Predicting for the following data: ---")
print(new_data_df)



--- Predicting for the following data: ---
   age workclass  fnlwgt  educational-num      marital-status      occupation  \
0   45   Private  200000               14  Married-civ-spouse  Prof-specialty   

  relationship   race gender  capital-gain  capital-loss  hours-per-week  \
0      Husband  White   Male             0             0              50   

  native-country  
0  United-States  


In [54]:
prediction = model.predict(new_data_df)
prediction_probability = model.predict_proba(new_data_df)

In [55]:
print("\n--- Prediction Result ---")
if prediction[0] == 1:
    print("Predicted Salary: >50K")
else:
    print("Predicted Salary: <=50K")


--- Prediction Result ---
Predicted Salary: >50K


In [56]:
#2nd sample for Prediction the employee salary 

In [55]:
new_employee_data = {
    'age': 28,
    'workclass': 'Local-gov',
    'fnlwgt': 336951,
    'educational-num': 12, # 12 = Masters
    'marital-status': 'Married-civ-spouse',
    'occupation': 'Protective-serv',
    'relationship': 'Husband',
    'race': 'White',
    'gender': 'Male',
    'capital-gain': 0,
    'capital-loss': 0,
    'hours-per-week': 40,
    'native-country': 'United-States'}

In [56]:
new_data_df = pd.DataFrame([new_employee_data])
print("\n--- Predicting for the following data: ---")
print(new_data_df)



--- Predicting for the following data: ---
   age  workclass  fnlwgt  educational-num      marital-status  \
0   28  Local-gov  336951               12  Married-civ-spouse   

        occupation relationship   race gender  capital-gain  capital-loss  \
0  Protective-serv      Husband  White   Male             0             0   

   hours-per-week native-country  
0              40  United-States  


In [57]:
prediction = model.predict(new_data_df)
prediction_probability = model.predict_proba(new_data_df)

In [59]:
print("\n--- Prediction Result ---")
if prediction[0] == 1:
    print("Predicted Salary: >50K")
else:
    print("Predicted Salary: <=50K")


--- Prediction Result ---
Predicted Salary: <=50K
