In [19]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier 
import xgboost as xgb
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv("train.csv")

In [3]:
print(train_df.shape)

(13842, 16)


In [4]:
train_df.head()

Unnamed: 0,id,Age,Working_class,fnlwgt,education,education_num,marital_status,Occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,earning
0,0,37,Private,280966,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,0
1,1,41,Private,205153,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,2,23,Private,237720,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,38,United-States,1
3,3,35,Private,276153,Bachelors,13,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Female,4650,0,40,United-States,1
4,4,28,Private,216178,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,1


In [5]:
train_df.columns

Index(['id', 'Age', 'Working_class', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'Occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'earning'],
      dtype='object')

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['Encoded_Working_class'] = le.fit_transform(train_df['Working_class'])
train_df['Encoded_education'] = le.fit_transform(train_df['education'])
train_df['Encoded_race'] = le.fit_transform(train_df['race'])
train_df['Encoded_gender'] = le.fit_transform(train_df['gender'])
train_df['Encoded_native_country'] = le.fit_transform(train_df['native_country'])
train_df['Encoded_Occupation'] = le.fit_transform(train_df['Occupation'])
    
train_df.drop(['Working_class'], axis = 1, inplace=True)
train_df.drop(['education'], axis = 1, inplace=True)
train_df.drop(['race'], axis = 1, inplace=True)
train_df.drop(['gender'], axis = 1, inplace=True)
train_df.drop(['native_country'], axis = 1, inplace=True)
train_df.drop(['Occupation'], axis = 1, inplace=True)

train_df.head()

Unnamed: 0,id,Age,fnlwgt,education_num,marital_status,relationship,capital_gain,capital_loss,hours_per_week,earning,Encoded_Working_class,Encoded_education,Encoded_race,Encoded_gender,Encoded_native_country,Encoded_Occupation
0,0,37,280966,13,Married-civ-spouse,Husband,0,0,40,0,3,9,4,1,38,6
1,1,41,205153,11,Married-civ-spouse,Husband,0,0,40,1,3,8,4,1,38,4
2,2,23,237720,13,Never-married,Not-in-family,0,0,38,1,3,9,4,1,38,10
3,3,35,276153,13,Never-married,Not-in-family,4650,0,40,1,3,9,1,0,38,13
4,4,28,216178,9,Married-civ-spouse,Husband,0,0,40,1,3,11,4,1,38,12


### Dropping all the columns that contains text

In [7]:
drop_col = ['id', 'marital_status', 'relationship']

In [8]:
train_df.drop(drop_col, axis=1, inplace=True)

In [9]:
train_df.head()

Unnamed: 0,Age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,earning,Encoded_Working_class,Encoded_education,Encoded_race,Encoded_gender,Encoded_native_country,Encoded_Occupation
0,37,280966,13,0,0,40,0,3,9,4,1,38,6
1,41,205153,11,0,0,40,1,3,8,4,1,38,4
2,23,237720,13,0,0,38,1,3,9,4,1,38,10
3,35,276153,13,4650,0,40,1,3,9,1,0,38,13
4,28,216178,9,0,0,40,1,3,11,4,1,38,12


### Performing the Train Test Split

In [10]:
X = train_df.drop(['earning'], axis=1)
y = train_df['earning']

In [11]:
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.45205479, -0.60615789,  0.6       , ...,  1.        ,
         0.9       , -0.14285714],
       [-0.34246575, -0.71837021,  0.33333333, ...,  1.        ,
         0.9       , -0.42857143],
       [-0.83561644, -0.67016715,  0.6       , ...,  1.        ,
         0.9       ,  0.42857143],
       ...,
       [-0.8630137 , -0.93481849,  0.2       , ..., -1.        ,
         0.9       , -0.85714286],
       [-0.28767123, -0.5150654 ,  0.6       , ..., -1.        ,
         0.9       , -0.28571429],
       [-0.17808219, -0.69144831,  0.06666667, ..., -1.        ,
         0.9       , -0.85714286]])

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size = 0.1)

In [13]:
print(y_train.value_counts())

1    7212
0    5245
Name: earning, dtype: int64


In [14]:
y_val.value_counts()

1    787
0    598
Name: earning, dtype: int64

### Training

In [15]:
# clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=8, random_state=0)

# clf_gini.fit(X_train, y_train)

# clf_gini.score(X_train, y_train)

In [16]:
# rfc = RandomForestClassifier(n_estimators = 100, random_state = 10)

# rfc.fit(X_train, y_train)

# rfc.score(X_train, y_train)

In [17]:
# gbcl = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)
# gbcl.fit(X_train, y_train)

# gbcl.score(X_train, y_train)

0.7994701774102914

In [15]:
classifier = xgb.XGBRegressor()
regressor = xgb.XGBRegressor()

In [16]:
booster=['gbtree','gblinear']
base_score=[0.25,0.5,0.75,1]

In [17]:
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

In [21]:
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [22]:
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          validate_parameters=None,
                                          verbosity=None),
                   n_iter=50, n_jobs=4,
                   param

In [24]:
random_cv.best_estimator_

XGBRegressor(base_score=0.25, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [33]:
regressor=xgb.XGBRegressor(base_score=0.25, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=1, n_estimators=900,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
       silent=True, subsample=1)

In [34]:
regressor.fit(X_train,y_train)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.25, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=1,
             monotone_constraints='()', n_estimators=900, n_jobs=1, nthread=1,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, silent=True, subsample=1, tree_method='exact', ...)

In [18]:
# feature_scores = pd.Series(rfc.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# feature_scores

In [19]:
# lr = LogisticRegression(random_state = 0, max_iter = 1000)
# lr.fit(X_train, y_train)
# clf_gini.score(X_val, y_val)
# rfc.score(X_val, y_val)
# gbcl.score(X_val, y_val)


0.792057761732852

In [35]:
y_pred_xgb = regressor.predict(X_val)

In [None]:
# score = f1_score(y_val, y_pred_xgb)

In [None]:
# print(score)

In [None]:
# from sklearn.metrics import accuracy_score

# print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_val, y_pred_xgb)))

### Making Prediction

In [24]:
test_data = pd.read_csv("test.csv")

In [25]:
test_data1 = test_data.copy()

In [26]:
test_data.shape

(13840, 15)

In [27]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
test_data['Encoded_Working_class'] = le.fit_transform(test_data['Working_class'])
test_data['Encoded_education'] = le.fit_transform(test_data['education'])
test_data['Encoded_race'] = le.fit_transform(test_data['race'])
test_data['Encoded_gender'] = le.fit_transform(test_data['gender'])
test_data['Encoded_native_country'] = le.fit_transform(test_data['native_country'])
test_data['Encoded_Occupation'] = le.fit_transform(test_data['Occupation'])

    
test_data.drop(['Working_class'], axis = 1, inplace=True)
test_data.drop(['education'], axis = 1, inplace=True)
test_data.drop(['race'], axis = 1, inplace=True)
test_data.drop(['gender'], axis = 1, inplace=True)
test_data.drop(['native_country'], axis = 1, inplace=True)
test_data.drop(['Occupation'], axis = 1, inplace=True)

test_data.head()

Unnamed: 0,id,Age,fnlwgt,education_num,marital_status,relationship,capital_gain,capital_loss,hours_per_week,Encoded_Working_class,Encoded_education,Encoded_race,Encoded_gender,Encoded_native_country,Encoded_Occupation
0,0,34,174789,9,Married-civ-spouse,Husband,0,0,45,3,11,4,1,38,12
1,1,38,181943,13,Never-married,Not-in-family,0,0,35,3,9,4,0,38,1
2,2,45,175625,9,Separated,Unmarried,0,0,38,3,11,4,0,38,1
3,3,20,121023,10,Never-married,Own-child,0,0,15,3,15,4,0,38,1
4,4,41,81054,10,Divorced,Unmarried,0,0,25,2,15,4,0,38,4


In [28]:
test_data.drop(drop_col, axis=1, inplace=True)

In [29]:
# y_test = lr.predict(test_data)

In [30]:
scaler1 = MinMaxScaler(feature_range=(-1, 1))
test_data_scaled = scaler1.fit_transform(test_data)

In [31]:
y_pred_xgb = xgb.predict(test_data_scaled)

### Submission File

In [32]:
result = pd.DataFrame(test_data1['id'], columns = ['id'])
result['earning']= y_pred_xgb

In [33]:
result.to_csv("submission1.csv", index=False)