In [None]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier 
import warnings

warnings.filterwarnings('ignore')

train_df = pd.read_csv("train.csv")

In [3]:
print(train_df.shape)

(13842, 16)


In [4]:
train_df.head()

Unnamed: 0,id,Age,Working_class,fnlwgt,education,education_num,marital_status,Occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,earning
0,0,37,Private,280966,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,0
1,1,41,Private,205153,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,2,23,Private,237720,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,38,United-States,1
3,3,35,Private,276153,Bachelors,13,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Female,4650,0,40,United-States,1
4,4,28,Private,216178,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,1


In [5]:
train_df.columns

Index(['id', 'Age', 'Working_class', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'Occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'earning'],
      dtype='object')

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['Encoded_Working_class'] = le.fit_transform(train_df['Working_class'])
train_df['Encoded_education'] = le.fit_transform(train_df['education'])
train_df['Encoded_race'] = le.fit_transform(train_df['race'])
train_df['Encoded_gender'] = le.fit_transform(train_df['gender'])
train_df['Encoded_native_country'] = le.fit_transform(train_df['native_country'])
train_df['Encoded_Occupation'] = le.fit_transform(train_df['Occupation'])
train_df['Encoded_marital_status'] = le.fit_transform(train_df['marital_status'])
train_df['Encoded_relationship'] = le.fit_transform(train_df['relationship'])
    
train_df.drop(['Working_class'], axis = 1, inplace=True)
train_df.drop(['education'], axis = 1, inplace=True)
train_df.drop(['race'], axis = 1, inplace=True)
train_df.drop(['gender'], axis = 1, inplace=True)
train_df.drop(['native_country'], axis = 1, inplace=True)
train_df.drop(['Occupation'], axis = 1, inplace=True)
train_df.drop(['marital_status'], axis = 1, inplace=True)
train_df.drop(['relationship'], axis = 1, inplace=True)

train_df.head()

Unnamed: 0,id,Age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,earning,Encoded_Working_class,Encoded_education,Encoded_race,Encoded_gender,Encoded_native_country,Encoded_Occupation,Encoded_marital_status,Encoded_relationship
0,0,37,280966,13,0,0,40,0,3,9,4,1,38,6,2,0
1,1,41,205153,11,0,0,40,1,3,8,4,1,38,4,2,0
2,2,23,237720,13,0,0,38,1,3,9,4,1,38,10,4,1
3,3,35,276153,13,4650,0,40,1,3,9,1,0,38,13,4,1
4,4,28,216178,9,0,0,40,1,3,11,4,1,38,12,2,0


### Dropping columns

In [7]:
drop_col = ['id','fnlwgt']

In [8]:
train_df.drop(drop_col, axis=1, inplace=True)

In [9]:
train_df.head()

Unnamed: 0,Age,education_num,capital_gain,capital_loss,hours_per_week,earning,Encoded_Working_class,Encoded_education,Encoded_race,Encoded_gender,Encoded_native_country,Encoded_Occupation,Encoded_marital_status,Encoded_relationship
0,37,13,0,0,40,0,3,9,4,1,38,6,2,0
1,41,11,0,0,40,1,3,8,4,1,38,4,2,0
2,23,13,0,0,38,1,3,9,4,1,38,10,4,1
3,35,13,4650,0,40,1,3,9,1,0,38,13,4,1
4,28,9,0,0,40,1,3,11,4,1,38,12,2,0


In [10]:
# train_df.capital_loss.value_counts()

### Performing the Train Test Split

In [11]:
X = train_df.drop(['earning'], axis=1)
y = train_df['earning']

In [12]:
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.45205479,  0.6       , -1.        , ..., -0.14285714,
        -0.33333333, -1.        ],
       [-0.34246575,  0.33333333, -1.        , ..., -0.42857143,
        -0.33333333, -1.        ],
       [-0.83561644,  0.6       , -1.        , ...,  0.42857143,
         0.33333333, -0.6       ],
       ...,
       [-0.8630137 ,  0.2       , -1.        , ..., -0.85714286,
         0.33333333, -0.6       ],
       [-0.28767123,  0.6       , -1.        , ..., -0.28571429,
        -0.33333333,  1.        ],
       [-0.17808219,  0.06666667, -1.        , ..., -0.85714286,
        -1.        ,  0.6       ]])

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size = 0.1)

In [14]:
print(y_train.value_counts())

1    7204
0    5253
Name: earning, dtype: int64


In [15]:
y_val.value_counts()

1    795
0    590
Name: earning, dtype: int64

### Training

In [16]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score

# clf = SVC(kernel='linear')
# clf.fit(X_train,y_train)

# y_pred = clf.predict(X_val)


In [17]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_val)

In [18]:
# import xgboost as xgb

# clf = xgb.XGBClassifier(learning_rate=0.001, max_depth=10, n_estimators=100)

# clf.fit(X_train, y_train)

In [19]:
# clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=8, random_state=0)

# clf_gini.fit(X_train, y_train)

# clf_gini.score(X_train, y_train)

In [20]:
# clf = RandomForestClassifier(n_estimators = 100, random_state = 10)

# clf.fit(X_train, y_train)

# clf.score(X_train, y_train)
          
# y_pred = clf.predict(X_val)

In [21]:
# gbcl = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)
# gbcl.fit(X_train, y_train)

# gbcl.score(X_train, y_train)

In [22]:
clf.score(X_train, y_train)

0.8359155494902465

In [23]:
# lr = LogisticRegression(random_state = 0, max_iter = 1000)
# lr.fit(X_train, y_train)
# clf_gini.score(X_val, y_val)
# rfc.score(X_val, y_val)
# gbcl.score(X_val, y_val)
# classifier.score(X_val, y_val)
clf.score(X_val, y_val)

0.8303249097472925

In [24]:
# clf_gini.score(X_val, y_val)

In [25]:
# y_pred = clf_gini.predict(X_val)

In [26]:
score = f1_score(y_val, y_pred)

print(score)

0.8543087414755115


In [27]:
from sklearn.metrics import accuracy_score

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score : 0.8303


In [38]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X_train, y_train, cv=10, scoring = 'accuracy').mean()

0.8290101658640985

### Making Prediction

In [28]:
test_data = pd.read_csv("test.csv")

In [29]:
test_data1 = test_data.copy()

In [30]:
test_data.shape

(13840, 15)

In [31]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
test_data['Encoded_Working_class'] = le.fit_transform(test_data['Working_class'])
test_data['Encoded_education'] = le.fit_transform(test_data['education'])
test_data['Encoded_race'] = le.fit_transform(test_data['race'])
test_data['Encoded_gender'] = le.fit_transform(test_data['gender'])
test_data['Encoded_native_country'] = le.fit_transform(test_data['native_country'])
test_data['Encoded_Occupation'] = le.fit_transform(test_data['Occupation'])
test_data['Encoded_marital_status'] = le.fit_transform(test_data['marital_status'])
test_data['Encoded_relationship'] = le.fit_transform(test_data['relationship'])

    
test_data.drop(['Working_class'], axis = 1, inplace=True)
test_data.drop(['education'], axis = 1, inplace=True)
test_data.drop(['race'], axis = 1, inplace=True)
test_data.drop(['gender'], axis = 1, inplace=True)
test_data.drop(['native_country'], axis = 1, inplace=True)
test_data.drop(['Occupation'], axis = 1, inplace=True)
test_data.drop(['marital_status'], axis = 1, inplace=True)
test_data.drop(['relationship'], axis = 1, inplace=True)

test_data.head()

Unnamed: 0,id,Age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,Encoded_Working_class,Encoded_education,Encoded_race,Encoded_gender,Encoded_native_country,Encoded_Occupation,Encoded_marital_status,Encoded_relationship
0,0,34,174789,9,0,0,45,3,11,4,1,38,12,2,0
1,1,38,181943,13,0,0,35,3,9,4,0,38,1,4,1
2,2,45,175625,9,0,0,38,3,11,4,0,38,1,5,4
3,3,20,121023,10,0,0,15,3,15,4,0,38,1,4,3
4,4,41,81054,10,0,0,25,2,15,4,0,38,4,0,4


In [32]:
test_data.drop(drop_col, axis=1, inplace=True)

In [33]:
scaler1 = MinMaxScaler(feature_range=(-1, 1))
test_data_scaled = scaler1.fit_transform(test_data)
test_data_scaled

array([[-0.53424658,  0.06666667, -1.        , ...,  0.71428571,
        -0.33333333, -1.        ],
       [-0.42465753,  0.6       , -1.        , ..., -0.85714286,
         0.33333333, -0.6       ],
       [-0.23287671,  0.06666667, -1.        , ..., -0.85714286,
         0.66666667,  0.6       ],
       ...,
       [-0.23287671,  0.33333333, -1.        , ...,  0.57142857,
        -0.33333333, -1.        ],
       [-0.5890411 ,  0.73333333, -1.        , ...,  0.42857143,
        -0.33333333, -1.        ],
       [ 0.09589041,  0.6       , -1.        , ..., -0.42857143,
         0.33333333, -0.6       ]])

In [34]:
y_test = clf.predict(test_data_scaled)
y_test.sum()

8203

### Submission File

In [35]:
result = pd.DataFrame(test_data1['id'], columns = ['id'])
result['earning'] = y_test

In [36]:
result.to_csv("submission3.csv", index=False)