# 1. Model & dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load dataset & clear redundant whitespaces
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

adult_train = pd.read_csv('Census/adult.data', header = None)
adult_test  = pd.read_csv('Census/adult.test', header = None)
adult_train.columns = cols
adult_test.columns = cols

for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

adult_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Global most common substitution

numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
adult_test[['capital_loss', 'capital_gain']] = numeric_imputer.fit_transform(adult_test[['capital_loss', 'capital_gain']])
adult_train[['capital_loss', 'capital_gain']] = numeric_imputer.fit_transform(adult_train[['capital_loss', 'capital_gain']])

categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

In [5]:
# 4.2 categorical to numerical
cols_to_transform = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
test, train = pd.get_dummies(adult_test, columns=cols_to_transform), pd.get_dummies(adult_train, columns=cols_to_transform)

# transform y
train['y']= train.y.eq('>50K').mul(1)
test['y']= test.y.eq('>50K.').mul(1)

# add missing column to test dataset
test['native_country_Holand-Netherlands'] = 0

In [6]:
# 1.1
compare_clf = LogisticRegression(C=10, solver='lbfgs', max_iter = 1000)

x_train, y_train = train.loc[:, train.columns != 'y'], train['y']
x_test, y_test = test.loc[:, test.columns != 'y'], test['y']

compare_clf.fit(x_train, y_train)
orig_predicted = compare_clf.predict(x_test)

orig_train_score = compare_clf.score(x_train, y_train)
orig_test_score = accuracy_score(y_test, orig_predicted)
orig_cv_scores = cross_val_score(LogisticRegression(C=10, solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=20)
orig_train_score, orig_test_score, orig_cv_scores.mean()

(0.7728263873959645, 0.7754437688102697, 0.7799837335414785)

## 2. Misclassification noise.

In [7]:
# 2.1
n = [0.01, 0.05, 0.1, 0.2]

In [8]:
orig_col = train.y
misclf_results = {} # dict of results: keys - n(percentage of noise), values - tuple of train,test,cv scores
later_need_y = []

for perc in n:
#     invert y
    update_col = orig_col.to_frame().replace({0: 1, 1: 0})
#     insert 1-n % of NA in update column (so that 1-n% were not updated)
    update_col.loc[update_col.sample(frac=1-perc).index, 'y'] = np.nan
#     update n% of 'y' col in train
    train.update(update_col)
    
    y_train = train.y
    
    if perc == 0.1: # save that y for 5th task
        later_need_y = train.y
    
#     2.2
    clf = LogisticRegression(C=10, solver='lbfgs', max_iter = 1000)

    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)

    train_score = clf.score(x_train, y_train)
    test_score = accuracy_score(y_test, predicted)
    cv_scores = cross_val_score(LogisticRegression(C=10, solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=20)
    
    misclf_results[perc] = (train_score, test_score, cv_scores.mean())
    
train['y'] = orig_col

pd.DataFrame(data=misclf_results)

Unnamed: 0,0.01,0.05,0.1,0.2
0,0.766039,0.737784,0.680385,0.592457
1,0.775997,0.773355,0.765923,0.768687
2,0.776198,0.754659,0.714754,0.650036


In [9]:
# 2.3 show safe percentage of noise in the target col using cv_score
safe_fracts = []
for perc in n:
    if orig_cv_scores.mean() - misclf_results[perc][1] <= 0.01:
        safe_fracts.append(perc)
safe_fracts

[0.01, 0.05]

In [10]:
misclf_comp_table = pd.DataFrame(data=misclf_results)
misclf_comp_table['original'] = np.array([orig_train_score, orig_test_score, orig_cv_scores.mean()])
misclf_comp_table

Unnamed: 0,0.01,0.05,0.1,0.2,original
0,0.766039,0.737784,0.680385,0.592457,0.772826
1,0.775997,0.773355,0.765923,0.768687,0.775444
2,0.776198,0.754659,0.714754,0.650036,0.779984


## 3. Attribute noise.

In [11]:
orig_age = adult_train['age']
orig_edu_num = adult_train['education_num']
orig_race = adult_train['race']

from copy import deepcopy
orig_train = deepcopy(adult_train)
orig_test = deepcopy(adult_test)
train, test = deepcopy(adult_train), deepcopy(adult_test)

attr_results = {} # dict of results: keys - n(percentage of noise), values - tuple of train,test,cv scores

for perc in n:
    
#     3.1 Randomly negate n% of the values of the age attribute
    update_col = - train.age.to_frame()
    update_col.loc[update_col.sample(frac=1-perc).index, 'age'] = np.nan
    train.update(update_col)
    
#     3.2 Randomly replace n% of the values of education_num attribute with random large numbers in range [20,100]
    update_col = pd.DataFrame(np.random.randint(20,100,size=(train['education_num'].shape[0], 1)))
    update_col.loc[update_col.sample(frac=1-perc).index, 'education_num'] = np.nan
    train.update(update_col)
    
#     3.3 Randomly replace n% of the values of this attribue with any other random race from the set of existing races
    train.loc[train.sample(frac=perc).index, 'race'] = pd.DataFrame(np.random.choice(adult_train['race'].unique(), size=(train['education_num'].size)))
    
#     3.4 
    test, train = pd.get_dummies(test, columns=cols_to_transform), pd.get_dummies(train, columns=cols_to_transform)
    train['y']= train.y.eq('>50K').mul(1)
    test['y']= test.y.eq('>50K.').mul(1)

    clf = LogisticRegression(C=10, solver='lbfgs', max_iter = 1000)
    x_train, y_train = train.loc[:, train.columns != 'y'], train['y']
    
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)

    train_score = clf.score(x_train, y_train)
    test_score = accuracy_score(y_test, predicted)
    cv_scores = cross_val_score(LogisticRegression(C=10, solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=20)
    
    attr_results[perc] = (train_score, test_score, cv_scores.mean())
    
    train = orig_train
    test = orig_test

pd.DataFrame(data=attr_results)

Unnamed: 0,0.01,0.05,0.1,0.2
0,0.805442,0.806578,0.771045,0.771506
1,0.805847,0.809533,0.776119,0.775935
2,0.778142,0.7815,0.781337,0.780907


In [12]:
# 3.5
differences = {} # keys - percentage(n), values accuracy differences with original on train, test and cv
for perc in n:
    train_diff = orig_train_score - attr_results[perc][0]
    test_diff = orig_test_score - attr_results[perc][1]
    cv_diff = orig_cv_scores.mean() - attr_results[perc][2]
    differences[perc] = (train_diff, test_diff, cv_diff)
pd.DataFrame(data=differences)

Unnamed: 0,0.01,0.05,0.1,0.2
0,-0.032616,-0.033752,0.001781,0.001321
1,-0.030404,-0.034089,-0.000676,-0.000491
2,0.001841,-0.001516,-0.001353,-0.000923


As we can see sometimes the model performs even better with noise in data. That can be because the corrupted columns have no influence on 'y', or because the data could be already with noise, or by some different reason attribute noise got model to train better.

## 4. Impact comparison.

In [13]:
# 4.1. Build a table to compare accuracy of the model on the original dataset with models based on datasets
# with different types and levels of noise introduced.
misclf_comp_table = pd.DataFrame(data=misclf_results)
misclf_comp_table['original'] = np.array([orig_train_score, orig_test_score, orig_cv_scores.mean()])
misclf_comp_table

Unnamed: 0,0.01,0.05,0.1,0.2,original
0,0.766039,0.737784,0.680385,0.592457,0.772826
1,0.775997,0.773355,0.765923,0.768687,0.775444
2,0.776198,0.754659,0.714754,0.650036,0.779984


In [14]:
attr_comp_table = pd.DataFrame(data=attr_results)
attr_comp_table['original'] = np.array([orig_train_score, orig_test_score, orig_cv_scores.mean()])
attr_comp_table

Unnamed: 0,0.01,0.05,0.1,0.2,original
0,0.805442,0.806578,0.771045,0.771506,0.772826
1,0.805847,0.809533,0.776119,0.775935,0.775444
2,0.778142,0.7815,0.781337,0.780907,0.779984


4.2. 

Class noise influences more, because if we have a mistake in a target - the whole row is a mistake, but if we have just an attribute mistake - the model can still use other info in a row with that mistake.

4.3

Depends on the data - we should analyse it and then decide which noise is worse.
Usually misclassification noise seems to be more of a problem, because it influences target variable, which is usually most valuable in a dataset.

But it also could be that we have very valuable attributes corrupted in data and they have big negative impact on our model, so we should address this noise first in that case.

## 5. Misclassification noise elimination.

In [15]:
# 5.1
test, train = pd.get_dummies(adult_test, columns=cols_to_transform), pd.get_dummies(adult_train, columns=cols_to_transform)
train.y = later_need_y
test['y']= test.y.eq('>50K.').mul(1)
test['native_country_Holand-Netherlands'] = 0

In [16]:
# 5.2
# split dataset to Г=5 datasets
part_size = int(len(train)*0.2)
parts = [train[:part_size], train[part_size : 2*part_size], train[2*part_size : 3*part_size], train[3*part_size : 4*part_size], train[4*part_size:]]

In [17]:
#key - index of parts of data in parts[], value - list where first item is real y, and the rest - predictions
predicts = {} 

for part_num in range(len(parts)):
    predicts[part_num] = [ np.array(parts[part_num]['y'].tolist()) ]
    
for part_num in range(len(parts)):
    x_part, y_part = parts[part_num].loc[:, parts[part_num].columns != 'y'], parts[part_num]['y']
    tree = DecisionTreeClassifier().fit(x_part, y_part)
    for pred_part_num in range(len(parts)):
        if pred_part_num != part_num:
            predicts[pred_part_num].append(tree.predict(parts[pred_part_num].loc[:, parts[pred_part_num].columns != 'y']))
    
predicts

{0: [array([0., 0., 1., ..., 1., 0., 1.]),
  array([1., 1., 0., ..., 1., 1., 1.]),
  array([1., 1., 0., ..., 0., 1., 0.]),
  array([0., 1., 1., ..., 0., 1., 0.]),
  array([0., 1., 0., ..., 1., 1., 0.])],
 1: [array([1., 1., 0., ..., 0., 0., 0.]),
  array([0., 1., 0., ..., 0., 1., 1.]),
  array([1., 0., 1., ..., 1., 0., 0.]),
  array([1., 1., 1., ..., 0., 1., 0.]),
  array([0., 0., 0., ..., 0., 0., 0.])],
 2: [array([0., 1., 1., ..., 0., 1., 0.]),
  array([0., 1., 1., ..., 1., 0., 1.]),
  array([0., 1., 0., ..., 0., 0., 1.]),
  array([1., 0., 0., ..., 0., 0., 0.]),
  array([1., 0., 1., ..., 0., 0., 0.])],
 3: [array([0., 1., 1., ..., 0., 0., 0.]),
  array([1., 0., 0., ..., 1., 1., 1.]),
  array([0., 0., 0., ..., 1., 1., 0.]),
  array([1., 0., 0., ..., 1., 0., 0.]),
  array([1., 1., 0., ..., 1., 0., 1.])],
 4: [array([0., 0., 1., ..., 1., 1., 1.]),
  array([1., 0., 1., ..., 0., 1., 1.]),
  array([1., 1., 0., ..., 1., 0., 0.]),
  array([0., 0., 0., ..., 1., 0., 1.]),
  array([1., 0., 1., 

In [18]:
# create a list of votes, where vote is a label which majority of classifiers predicted
# if classifiers 'voted' 2:2 we also count a 'vote' of a real 'y'
voting = []
for npart in range(len(parts)):
    for nrow in range(parts[npart].shape[0]):
#         having sum>=3  means we have 3+(majority) predictions of 1 so we append 1 to votes, <3 - otherwise
#         we treat real 'y' as another prediction as without it we would have a possibility of even number of votes
        voting.append( int(np.sum(np.array(predicts[npart])[:,nrow]) >= 3) )
len(voting)

32561

In [19]:
# since we have binary classification case, we can use votes list as the column iself
# (it is the same as updating a column where misclassification occured)
train.y = pd.Series(voting)

compare_clf = LogisticRegression(C=10, solver='lbfgs', max_iter = 1000)

x_train, y_train = train.loc[:, train.columns != 'y'], train['y']
x_test, y_test = test.loc[:, test.columns != 'y'], test['y']

compare_clf.fit(x_train, y_train)
CVCF_predicted = compare_clf.predict(x_test)

CVCF_train_score = compare_clf.score(x_train, y_train)
CVCF_test_score = accuracy_score(y_test, CVCF_predicted)
CVCF_cv_scores = cross_val_score(LogisticRegression(C=10, solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=20)
CVCF_train_score, CVCF_test_score, CVCF_cv_scores.mean()

CVCF_results = {}
CVCF_results['CVCF_DesTrees'] = (CVCF_train_score, CVCF_test_score, CVCF_cv_scores.mean())
CVCF_results['CVCF_DesTrees']

(0.6574122416387703, 0.7686874270622198, 0.6970851265459903)

### Failure(
The accuracy is way worse than original, because untuned DecisionTreeClassifier() gives around 52% of accuracy. Now, we __try__ the same with __supposedly tuned__ xgboost.

In [20]:
# renewing data
test, train = pd.get_dummies(adult_test, columns=cols_to_transform), pd.get_dummies(adult_train, columns=cols_to_transform)
train.y = later_need_y
test['y']= test.y.eq('>50K.').mul(1)
test['native_country_Holand-Netherlands'] = 0

In [21]:
from xgboost import XGBClassifier

predicts = {} #key - index of parts of data in parts[], value - list

for part_num in range(len(parts)):
    predicts[part_num] = [ np.array(parts[part_num]['y'].tolist()) ]
    
for part_num in range(len(parts)):
    x_part, y_part = parts[part_num].loc[:, parts[part_num].columns != 'y'], parts[part_num]['y']
    tree = XGBClassifier(max_depth=6, learning_rate=0.03, n_estimators=100, n_jobs=8, gamma=0.01, min_child_weight=5, max_delta_step=4, subsample=0.3, colsample_bytree=0.7, reg_lambda=2, scale_pos_weight=1, base_score=0.5, random_state=0).fit(x_part, y_part)
    for pred_part_num in range(len(parts)):
        if pred_part_num != part_num:
            predicts[pred_part_num].append(tree.predict(parts[pred_part_num].loc[:, parts[pred_part_num].columns != 'y']))
    
predicts

{0: [array([0., 0., 1., ..., 1., 0., 1.]),
  array([0., 0., 0., ..., 0., 0., 0.]),
  array([0., 1., 0., ..., 0., 0., 0.]),
  array([0., 1., 0., ..., 0., 0., 0.]),
  array([0., 1., 0., ..., 0., 0., 0.])],
 1: [array([1., 1., 0., ..., 0., 0., 0.]),
  array([0., 0., 0., ..., 0., 0., 0.]),
  array([0., 0., 1., ..., 0., 0., 0.]),
  array([1., 0., 1., ..., 0., 0., 0.]),
  array([0., 1., 0., ..., 0., 0., 0.])],
 2: [array([0., 1., 1., ..., 0., 1., 0.]),
  array([0., 0., 1., ..., 0., 0., 0.]),
  array([0., 0., 0., ..., 0., 0., 1.]),
  array([0., 0., 0., ..., 0., 0., 0.]),
  array([0., 0., 1., ..., 0., 0., 0.])],
 3: [array([0., 1., 1., ..., 0., 0., 0.]),
  array([1., 0., 0., ..., 0., 0., 0.]),
  array([0., 0., 0., ..., 0., 0., 0.]),
  array([0., 0., 0., ..., 0., 0., 1.]),
  array([0., 0., 0., ..., 0., 0., 0.])],
 4: [array([0., 0., 1., ..., 1., 1., 1.]),
  array([0., 0., 1., ..., 0., 0., 1.]),
  array([0., 0., 0., ..., 0., 0., 1.]),
  array([0., 0., 0., ..., 0., 0., 1.]),
  array([0., 0., 0., 

In [22]:
voting = []
for npart in range(len(parts)):
    for nrow in range(parts[npart].shape[0]):
        voting.append( int(np.sum(np.array(predicts[npart])[:,nrow]) >= 3) )
len(voting)

32561

In [23]:
train.y = pd.Series(voting)

compare_clf = LogisticRegression(C=10, solver='lbfgs', max_iter = 1000)

x_train, y_train = train.loc[:, train.columns != 'y'], train['y']
x_test, y_test = test.loc[:, test.columns != 'y'], test['y']

compare_clf.fit(x_train, y_train)
CVCF_predicted = compare_clf.predict(x_test)

CVCF_train_score = compare_clf.score(x_train, y_train)
CVCF_test_score = accuracy_score(y_test, CVCF_predicted)
CVCF_cv_scores = cross_val_score(LogisticRegression(C=10, solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=20)

CVCF_results['CVCF_xgb'] = (CVCF_train_score, CVCF_test_score, CVCF_cv_scores.mean())
CVCF_results['CVCF_xgb']

(0.8760787445103038, 0.8009336035870033, 0.8186778477204155)

#### Yay! Success)

#### 5.3
What percent of mislabled records you fixed using this method? Is it possible to do better?

In [24]:
sum_cols = later_need_y.add(pd.Series(voting)).to_frame()
sum_cols[sum_cols[0] == 1].shape[0]/train.shape[0] # fraction of updated labes using CVCF with xgboost

0.34955928871963393

In the first case, when we used decision trees we actually started mislabling data even more because of the bad classification accuracy of our decisiontree model - ~52%. This led to decrease of final regression accuracy on that data.

Then we used tuned xgboost classifier with better accuracy and we got significant improvement in final regression accuracy on our data. CVCF on xgboost base updated 35.44% of noise data which is even more than we initially imputed in our data (10%). That is why we have better accuracy than on initial train dataset. If we go further with tuning we can do even better.

In [25]:
# 5.4
CVCF_comp_table = pd.DataFrame(data=CVCF_results)
CVCF_comp_table['10%noise'] = np.array(misclf_results[0.1])
CVCF_comp_table['original'] = np.array([orig_train_score, orig_test_score, orig_cv_scores.mean()])

CVCF_comp_table

Unnamed: 0,CVCF_DesTrees,CVCF_xgb,10%noise,original
0,0.657412,0.876079,0.680385,0.772826
1,0.768687,0.800934,0.765923,0.775444
2,0.697085,0.818678,0.714754,0.779984
