In [10]:
from collections import Counter

import pandas as pd
from imblearn.over_sampling import SMOTEN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Load the dataset; 'adult.csv' is read relative to the working directory.
data = pd.read_csv('adult.csv')

# Drop the continuous columns listed below so that only categorical features
# remain (SMOTEN is designed for nominal/categorical features).
columns_to_drop = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
data = data.drop(columns=columns_to_drop)

# Separate the features from the 'income' target column.
X = data.drop(columns='income')
y = data['income']

# Integer-encode every feature column. A separate encoder is fitted per column
# and kept in `label_encoders`, so the integer codes can later be mapped back
# to the original categories via `label_encoders[col].inverse_transform(...)`.
# (Re-fitting one shared encoder would keep only the last column's mapping.)
label_encoders = {}
for col in X.columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows for validation.
# NOTE(review): the target classes are imbalanced; consider passing
# `stratify=y` so both splits keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline on the original (un-resampled) training data -- currently disabled.
#clf = DecisionTreeClassifier(random_state=42)
#clf.fit(X_train, y_train)

# Oversample the training split only; the validation split is left untouched
# so it keeps the original class distribution.
smoten = SMOTEN(random_state=42)
X_train_resampled, y_train_resampled = smoten.fit_resample(X_train, y_train)

# Report the per-class counts before and after oversampling.
print(f"Original dataset shape {Counter(y_train)}")
print(f"Resampled dataset shape {Counter(y_train_resampled)}")

[[0.         1.08548324 4.02731675 ... 2.46935277 2.27958394 0.73983356]
 [1.08548324 0.         1.27535619 ... 0.39682517 0.82446918 0.16226952]
 [4.02731675 1.27535619 0.         ... 0.27231735 0.59810829 1.79866597]
 ...
 [2.46935277 0.39682517 0.27231735 ... 0.         0.51280374 0.84034313]
 [2.27958394 0.82446918 0.59810829 ... 0.51280374 0.         0.80943211]
 [0.73983356 0.16226952 1.79866597 ... 0.84034313 0.80943211 0.        ]]
Original dataset shape Counter({'<=50K': 29676, '>50K': 9397})
Resampled dataset shape Counter({'<=50K': 29676, '>50K': 29676})


In [9]:
# Print the last five rows of the resampled training features (X_train_resampled)
print(X_train_resampled.tail(n=5))

       workclass  education  educational-num  marital-status  occupation  \
59347          4          1                3               2          12   
59348          4         11                8               2          14   
59349          4         11                8               2           8   
59350          4         12               13               2           4   
59351          4          7               11               2          12   

       relationship  race  gender  native-country  
59347             0     4       1              39  
59348             0     4       1              39  
59349             0     4       1              39  
59350             0     4       1              39  
59351             0     4       1              39  


In [None]:

# Now you can use X_train_resampled and y_train_resampled for training your model
# and X_val, y_val for validation

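# Example of how to evaluate a model on the validation set.
# Note: `clf` must be created and fitted first (e.g. a DecisionTreeClassifier
# trained on X_train_resampled, y_train_resampled) before uncommenting these lines.
#accuracy = clf.score(X_val, y_val)
#print(f"Validation Accuracy: {accuracy:.2f}")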