In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [4]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [5]:
X.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


In [7]:
X.shape

(48842, 14)

In [8]:
X.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [9]:
X.size

683788

In [10]:
print(X.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')


In [11]:
X['workclass'] = X['workclass'].replace({'State-gov': 'Govt', 
                                         'Federal-gov': 'Govt', 
                                         'Local-gov': 'Govt','?':np.nan,
                                         'Self-emp-not-inc':'SelfEmployee',
                                         'Self-emp-inc':'SelfEmployee',
                                         'Never-worked':'Without-pay'})
X['workclass'] = X['workclass'].replace(mapping)



NameError: name 'mapping' is not defined

In [None]:
print(X['workclass'].value_counts())


workclass
Private         33906
Govt             6549
SelfEmployee     5557
WithoutPay         31
Name: count, dtype: int64


In [None]:

marital_mapping = {
    "Never-married": "Single",
    "Separated": "Single",
    "Widowed": "Single",
    "Married-spouse-absent": "Single",
    "Married-civ-spouse": "Couple",
    "Married-AF-spouse": "Couple"
}

X['marital-status'] = X['marital-status'].replace(marital_mapping)

In [None]:
print(X['marital-status'].unique())
print(X['marital-status'].value_counts())

['Single' 'Couple' 'Divorced']
marital-status
Couple      22416
Single      19793
Divorced     6633
Name: count, dtype: int64


In [None]:
X['native-country'] = X['native-country'].apply(
    lambda x: "US" if x == "United-States" else "Non-US"
)




In [None]:
print(X['native-country'].unique())
print(X['native-country'].value_counts())

['US' 'Non-US']
native-country
US        43832
Non-US     5010
Name: count, dtype: int64


In [None]:

for col in X.columns:
    if X[col].dtype == 'object':  
        X[col] = X[col].replace("?", np.nan)   
        X[col] = X[col].fillna(X[col].mode()[0])  
    else:
        X[col] = X[col].fillna(X[col].mode()[0])  


In [None]:
print(X.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64


In [None]:
X['education'] = X['education'].replace(['HS-grad','10th','9th'], "High-Scl")
X['education'] = X['education'].replace(['5th-6th','7th-8th'], "Middel-Scl")
X['education'] = X['education'].replace(['1st-4th', 'Preschool'], "Primary-Scl")
X['education'] = X['education'].replace(['11th','12th'], "Higer_sec-Scl")
X['education'] = X['education'].replace(['Some-college','Prof-school', 'Bachelors'], "College")
X['education'] = X['education'].replace(['Assoc-acdm', 'Assoc-voc'],"Other")

In [None]:
X['education'].unique()

array(['College', 'High-Scl', 'Higer_sec-Scl', 'Masters', 'Other',
       'Middel-Scl', 'Doctorate', 'Primary-Scl'], dtype=object)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
cat_col = X.select_dtypes(include='object').columns
cat_col

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [None]:
for col in cat_col:
    X[col] = le.fit_transform(X[col])

In [None]:
for col in cat_col:
    X[col] = le.fit_transform(X[col])

In [None]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,0,77516,0,13,4,1,1,4,1,2174,0,40,39
1,50,2,83311,0,13,2,4,0,4,1,0,0,13,39
2,38,1,215646,3,9,0,6,1,4,1,0,0,40,39
3,53,1,234721,2,7,2,6,0,2,1,0,0,40,39
4,28,1,338409,0,13,2,10,5,2,0,0,0,40,5


In [None]:
X = X.iloc[:,:-1]
y = X.iloc[:,-1]

In [None]:
df_encoded = pd.get_dummies(X, drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
lr = LogisticRegression()

In [None]:
kfold = KFold(n_splits= 5)

In [None]:
score = cross_val_score(lr,X_train, y_train, cv=kfold)


In [None]:
score

array([0.46461932, 0.46180422, 0.46794626, 0.47056565, 0.45674431])

In [None]:
score.mean()

np.float64(0.46433595163036434)

In [None]:
lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
lr.score(X_train, y_train)

0.4643103933662632

In [None]:
pred = lr.predict(X_test)

In [None]:
lr.score(X_test, y_test)

0.474767120483161

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.474767120483161