In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix



```
 **Target filed: Income
-- The income is divide into two classes: <=50K and >50K**
```



In [25]:
#Read CSV file
# data = pd.read_csv('/content/drive/MyDrive/Adult income/adult.csv')
data = pd.read_csv('datasets/adult_income/adult.csv')

In [26]:
#show data
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K





---


Workclass" is a categorical feature used in the
field of machine learning and data mining. It represents the type of organization or occupation that a person is employed in. The possible values for workclass include:
  Private: employee of a private company
  Self-emp-not-inc: self-employed individuals such as small business owners like a pharmacy or a small shop
  Self-emp-inc: self-employed individuals with higher income such as large corporations
  Federal-gov: federal government employee
  Local-gov: local government employee
  State-gov: state government employee
  Without-pay: individual without income
  Never-worked: individual who has never worked before

  This feature is important in income prediction models, as the type of occupation and organization that a person is employed in can have an impact on their income.





---

 Weighting data in a sample is commonly used to provide a better estimate of the population. For example, if a particular group with a smaller number of individuals from the population is underrepresented in the sample, it may also be underrepresented in the final estimate. Therefore, by assigning a weight to each individual in the sample, a better estimate for features such as income, education, age, and gender can be obtained.

In machine learning models, fnlwgt is also used as one of the input features in predicting income and other demographic features of the population.





---

Capital-gain" is a feature in financial and economic datasets that represents the profit earned by an individual or business through the sale of a capital asset such as stocks, bonds, or real estate. Capital gain is calculated as the difference between the sale price of the asset and its original purchase price.

In machine learning models, capital gain is often used as a predictor of income, as individuals with higher capital gains tend to have higher incomes. This feature can also be used to predict other financial behaviors, such as investment decisions and risk-taking behavior.Capital gain is an important feature in financial analysis and modeling, as it can provide insights into the overall financial health and performance of individuals and businesses.

---



# Get more Information

In [27]:
data.shape

(48842, 15)

In [28]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [30]:
#seperate 
non_numeric_columns = data.select_dtypes(include=['object']).columns
numeric_columns = data.select_dtypes(exclude=['object']).columns
non_numeric_columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')

In [31]:
print('predclass',data.income.unique())

predclass ['<=50K' '>50K']


In [32]:
#Get more index
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


# Preprocess

In [33]:
data["education"].unique()

array(['11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th',
       'Prof-school', '7th-8th', 'Bachelors', 'Masters', 'Doctorate',
       '5th-6th', 'Assoc-voc', '9th', '12th', '1st-4th', 'Preschool'],
      dtype=object)

In [34]:
#Limit categorization
data['education'].replace('Preschool', 'dropout',inplace=True)
data['education'].replace('10th', 'dropout',inplace=True)
data['education'].replace('11th', 'dropout',inplace=True)
data['education'].replace('12th', 'dropout',inplace=True)
data['education'].replace('1st-4th', 'dropout',inplace=True)
data['education'].replace('5th-6th', 'dropout',inplace=True)
data['education'].replace('7th-8th', 'dropout',inplace=True)
data['education'].replace('9th', 'dropout',inplace=True)
data['education'].replace('HS-Grad', 'HighGrad',inplace=True)
data['education'].replace('HS-grad', 'HighGrad',inplace=True)
data['education'].replace('Some-college', 'CommunityCollege',inplace=True)
data['education'].replace('Assoc-acdm', 'CommunityCollege',inplace=True)
data['education'].replace('Assoc-voc', 'CommunityCollege',inplace=True)
data['education'].replace('Bachelors', 'Bachelors',inplace=True)
data['education'].replace('Masters', 'Masters',inplace=True)
data['education'].replace('Prof-school', 'Masters',inplace=True)
data['education'].replace('Doctorate', 'Doctorate',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['education'].replace('Preschool', 'dropout',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['education'].replace('10th', 'dropout',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

In [36]:
data['education'].unique()

array(['dropout', 'HighGrad', 'CommunityCollege', 'Masters', 'Bachelors',
       'Doctorate'], dtype=object)

In [37]:
#Limit categorization
data['marital-status'].replace('Never-married', 'NotMarried',inplace=True)
data['marital-status'].replace(['Married-AF-spouse'], 'Married',inplace=True)
data['marital-status'].replace(['Married-civ-spouse'], 'Married',inplace=True)
data['marital-status'].replace(['Married-spouse-absent'], 'NotMarried',inplace=True)
data['marital-status'].replace(['Separated'], 'Separated',inplace=True)
data['marital-status'].replace(['Divorced'], 'Separated',inplace=True)
data['marital-status'].replace(['Widowed'], 'Widowed',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['marital-status'].replace('Never-married', 'NotMarried',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['marital-status'].replace(['Married-AF-spouse'], 'Married',inplace=True)


In [38]:
#duplicated
data.duplicated().sum()

53

In [39]:
#remove duplicated row
data=data.drop_duplicates()

In [40]:
#replace ? to nan
data.replace('?', np.nan, inplace=True)

In [41]:
data.isna().sum()

age                   0
workclass          2794
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2804
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      856
income                0
dtype: int64

In [42]:
# Number of rows that have one null values
one_null = sum(data['workclass'].isnull() & ~data['occupation'].isnull() & ~data['native-country'].isnull()) \
           + sum(~data['workclass'].isnull() & data['occupation'].isnull() & ~data['native-country'].isnull()) \
           + sum(~data['workclass'].isnull() & ~data['occupation'].isnull() & data['native-country'].isnull())

# Number of rows that have two null values
two_null = sum(data['workclass'].isnull() & data['occupation'].isnull() & ~data['native-country'].isnull()) \
           + sum(data['workclass'].isnull() & ~data['occupation'].isnull() & data['native-country'].isnull()) \
           + sum(~data['workclass'].isnull() & data['occupation'].isnull() & data['native-country'].isnull())

# Number of rows that have three null values
three_null = sum(data['workclass'].isnull() & data['occupation'].isnull() & data['native-country'].isnull())

# Print the number of rows that have one, two and three null values
print('Number of rows that have one null values:', one_null)
print('Number of rows that have two null values:', two_null)
print('Number of rows that have three null values:', three_null)

Number of rows that have one null values: 820
Number of rows that have two null values: 2748
Number of rows that have three null values: 46


In [43]:
df = data

In [44]:
#drop beacuse they have nan
df['occupation'].dropna(inplace=True)
df['workclass'].dropna(inplace=True)


In [45]:
#drop educational-num beacuse its not important
df = df.drop(['educational-num'],axis=1)

In [46]:
#Encoder cetegorical columns
lb=LabelEncoder()
df.workclass=lb.fit_transform(df.workclass)
df.education=lb.fit_transform(df.education)
df['marital-status']=lb.fit_transform(df['marital-status'])
df.occupation=lb.fit_transform(df.occupation)
df.relationship=lb.fit_transform(df.relationship)
df.race=lb.fit_transform(df.race)
df.gender=lb.fit_transform(df.gender)
df['native-country']=lb.fit_transform(df['native-country'])
df.income=lb.fit_transform(df.income)


In [47]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,3,226802,5,1,6,3,2,1,0,0,40,38,0
1,38,3,89814,3,0,4,0,4,1,0,0,50,38,0
2,28,1,336951,1,0,10,0,4,1,0,0,40,38,1
3,44,3,160323,1,0,6,0,2,1,7688,0,40,38,1
4,18,8,103497,1,1,14,3,4,0,0,0,30,38,0


In [50]:
df.to_csv("datasets/adult_income/train_clean.csv")

In [53]:
df["income"].value_counts()

income
0    37108
1    11681
Name: count, dtype: int64

In [62]:
import numpy as np

In [66]:
zero_ids = np.where(df["income"] == 0)[0]
one_ids = np.where(df["income"] == 1)[0]

In [68]:
np.random.seed(2222)
zero_ids_1000 = np.random.choice(zero_ids, size=(1000,),replace=False)
np.random.seed(2222)
one_ids_1000 = np.random.choice(one_ids, size=(1000,),replace=False)

In [74]:
ids_2000 = np.concatenate([zero_ids_1000, one_ids_1000])
np.random.seed(2222)
np.random.shuffle(ids_2000)

In [78]:
df.iloc[ids_2000,:].to_csv("datasets/adult_income/train_2000.csv")

# Model

In [79]:
#X
X = df.drop('income',axis=1)

In [80]:
#y
y = df['income']

In [81]:
#scaler 
st=StandardScaler()
X_scalered=st.fit_transform(X)

In [84]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_scalered, y, random_state=30, test_size=0.1)

In [85]:
#find best k
k_range = range(1, 11)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

print(k_scores)

[0.7387321016067303, 0.7884358759553983, 0.7628359266785945, 0.7921867664054792, 0.779294570256627, 0.7942979597863206, 0.786119886408948, 0.7954252147563493, 0.7910390070287258, 0.796941986226585]


In [86]:
#creat model
knn=KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

In [87]:
#pred
y_pred=knn.predict(X_test)

In [88]:
#Acuracy
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, knn.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Train set Accuracy:  0.8574128900022774
Test set Accuracy:  0.8356220536995286


In [89]:
#mae
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

0.1643779463004714

# another df1

In [90]:
#drop fnlwgt and native-country columns
df1 = df.drop(['fnlwgt', 'native-country'], axis=1)

In [91]:
#X and y
X1 = df1.drop('income',axis=1)
y1 = df1['income']

In [92]:
#scaler 
st=StandardScaler()
X_scalered1 = st.fit_transform(X1)

In [93]:
#split
X_train1,X_test1,y_train1,y_test1=train_test_split(X_scalered1, y1, random_state=20, test_size=0.1)

In [39]:
#find best k
k_range = range(1, 11)
k_scores1 = []

for k in k_range:
    knn1 = KNeighborsClassifier(n_neighbors=k)
    scores1 = cross_val_score(knn1, X1, y1, cv=10, scoring='accuracy')
    k_scores1.append(scores1.mean())

print(k_scores1)

[0.8170898683776754, 0.8390826177169334, 0.8369508947190312, 0.8433254122457191, 0.8413167030829971, 0.8449446553289063, 0.8433664042522778, 0.8454366140300058, 0.8450881441587524, 0.8468098336445549]


In [94]:
#creat model
knn1=KNeighborsClassifier(n_neighbors=10)
knn1.fit(X_train1, y_train1)

In [95]:
#pred
y_pred1=knn1.predict(X_test1)

from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train1, knn1.predict(X_train1)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test1, y_pred1))

Train set Accuracy:  0.8619904349806422
Test set Accuracy:  0.8423857347817175


In [96]:
confusion_matrix(y_test1,knn1.predict(X_test1))

array([[3485,  246],
       [ 523,  625]], dtype=int64)

In [97]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

In [98]:
#pred
y_pred1=rfc.predict(X_test)

from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, rfc.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Train set Accuracy:  0.9998405830107037
Test set Accuracy:  0.8356220536995286


In [1]:
import numpy as np
import copy

In [2]:
a = np.array([1,2,3])

In [3]:
b = a.copy()

In [4]:
a[0] = 121

In [5]:
a, b

(array([121,   2,   3]), array([1, 2, 3]))

In [1]:
import pandas as pd
df = pd.DataFrame(np.array([[1,2,3],[4,5,6]]))
df

ModuleNotFoundError: No module named 'pandas'

In [7]:
dfc = df.copy(deep=True)

In [8]:
df.iloc[0,0] = 121

In [9]:
df, dfc

(     0  1  2
 0  121  2  3
 1    4  5  6,
    0  1  2
 0  1  2  3
 1  4  5  6)

In [2]:
import numpy as np

In [3]:
X1 = np.arange(5000).reshape(500,10)
y1 = np.arange(500)

X2 = np.arange(10000,15000).reshape(500,10)
y2 = np.arange(1000, 1500)

In [4]:
X1.shape, y1.shape, X2.shape, y2.shape

((500, 10), (500,), (500, 10), (500,))

In [5]:
X=np.vstack([X1,X2])
X.shape

(1000, 10)

In [6]:
y=np.hstack([y1,y2])
y.shape

(1000,)

In [7]:
D = np.hstack([X, y.reshape(-1,1)])

In [8]:
Z = np.ones(D.shape[0], dtype=D.dtype)
Z[:500] = 0     

In [9]:
D

array([[    0,     1,     2, ...,     8,     9,     0],
       [   10,    11,    12, ...,    18,    19,     1],
       [   20,    21,    22, ...,    28,    29,     2],
       ...,
       [14970, 14971, 14972, ..., 14978, 14979,  1497],
       [14980, 14981, 14982, ..., 14988, 14989,  1498],
       [14990, 14991, 14992, ..., 14998, 14999,  1499]])

In [10]:
def array_mask(dataset: tuple[np.ndarray, np.ndarray], mask: np.ndarray):
    return dataset[0][mask], dataset[1][mask]

In [13]:
array_mask((D,Z), Z==1)[1].shape

(500,)