<a href="https://colab.research.google.com/github/Goge052215/C17-Tutorial/blob/main/Lab6_P2_George_Edmund.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression - Census

In this challenge exercise, you will work with census data and apply the K Nearest Neighbor and Logistic Regression classifiers. This census dataset contains people from both the US and abroad (http://archive.ics.uci.edu/dataset/2/adult).


The goal is to find a way to predict who will make more than $50K per year and who will not. Which features do you feel should be relevant? (Hint: If the non-numerical data is too hard to work with, consider throwing it out.) Is that what you observe? Compare and contrast the K Nearest Neighbor and Logistic Regression classifiers. Which model is a better predictor? Which features are important? What conclusions can you draw?


### Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install ucimlrepo
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression



### Import the census data set
The dataset we will be using is from the 1994 census database.

In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset (UCI repository id 2, the "Adult" census income data)
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
# Work on a copy of the feature table: later cells encode columns of X, and
# the copy keeps adult.data.features available as the untouched raw download.
X = adult.data.features.copy()
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)

# number of rows / columns in the features and in the targets
print(X.shape)
print(y.shape)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [None]:
# Show the raw feature table followed by the raw income labels.
print(X, y, sep="\n")

       age         workclass  fnlwgt  education  education-num  \
0       39         State-gov   77516  Bachelors             13   
1       50  Self-emp-not-inc   83311  Bachelors             13   
2       38           Private  215646    HS-grad              9   
3       53           Private  234721       11th              7   
4       28           Private  338409  Bachelors             13   
...    ...               ...     ...        ...            ...   
48837   39           Private  215419  Bachelors             13   
48838   64               NaN  321403    HS-grad              9   
48839   38           Private  374983  Bachelors             13   
48840   44           Private   83891  Bachelors             13   
48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation    relationship  \
0           Never-married       Adm-clerical   Not-in-family   
1      Married-civ-spouse    Exec-managerial         Husband   
2              

In [None]:
# Convert the income labels to binary: 1 for the '>50K' class, 0 otherwise.
# NOTE: the raw targets spell the positive class both as '>50K' and as '>50K.'
# (with a trailing period), so both spellings count as positive.
# The labels are read from adult.data.targets rather than from `y` itself so
# that re-running this cell gives the same result; once `y` holds the 0/1
# array, comparing it with the strings again would not reproduce the labels.
y = adult.data.targets.isin(['>50K', '>50K.']).to_numpy().ravel().astype(int)

In [None]:
# Sanity check after binarising: label values, then feature and label shapes.
print(y, X.shape, y.shape, sep="\n")

[0 0 0 ... 0 0 1]
(48842, 14)
(48842,)


In [None]:
# Pull the `age` column out as a flat sequence with one entry per row.
x_age = np.reshape(X['age'], (X.shape[0],))
print(x_age, x_age.shape, sep="\n")

[39 50 38 ... 38 44 35]
(48842,)


In [None]:
# Transform the categorical `workclass` column into integer codes.
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
workclass_codes = {
    'Private': 1,
    'Self-emp-not-inc': 2,
    'Self-emp-inc': 3,
    'Federal-gov': 4,
    'Local-gov': 5,
    'State-gov': 6,
    'Without-pay': 7,
    'Never-worked': 8,
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_workclass = X['workclass'].map(workclass_codes).fillna(0).astype(int).to_numpy()
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(workclass=x_workclass)
print(x_workclass)
print(x_workclass.shape)


[6 2 1 ... 1 1 3]
(48842,)


In [None]:
# Pull the `fnlwgt` column out as a flat sequence with one entry per row.
x_fnlwgt = np.reshape(X['fnlwgt'], (X.shape[0],))
print(x_fnlwgt, x_fnlwgt.shape, sep="\n")

[ 77516  83311 215646 ... 374983  83891 182148]
(48842,)


In [None]:
# Remove the textual `education` column; the same information is already
# available in numeric form as `education-num`.
X = X.drop(labels=['education'], axis='columns')

In [None]:
# Transform the categorical `marital-status` column into integer codes.
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
marital_status_codes = {
    'Married-civ-spouse': 1,
    'Divorced': 2,
    'Never-married': 3,
    'Separated': 4,
    'Widowed': 5,
    'Married-spouse-absent': 6,
    'Married-AF-spouse': 7,
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_marital_status = (
    X['marital-status'].map(marital_status_codes).fillna(0).astype(int).to_numpy()
)
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(**{'marital-status': x_marital_status})

print(x_marital_status)
print(x_marital_status.shape)

[3 1 2 ... 1 2 1]
(48842,)


In [None]:
# Transform the categorical `occupation` column into integer codes.
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
occupation_codes = {
    'Tech-support': 1,
    'Craft-repair': 2,
    'Other-service': 3,
    'Sales': 4,
    'Exec-managerial': 5,
    'Prof-specialty': 6,
    'Handlers-cleaners': 7,
    'Machine-op-inspct': 8,
    'Adm-clerical': 9,
    'Farming-fishing': 10,
    'Transport-moving': 11,
    'Priv-house-serv': 12,
    'Protective-serv': 13,
    'Armed-Forces': 14,
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_occupation = X['occupation'].map(occupation_codes).fillna(0).astype(int).to_numpy()
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(occupation=x_occupation)
print(x_occupation)
print(x_occupation.shape)

[9 5 7 ... 6 9 5]
(48842,)


In [None]:
# Transform the categorical `relationship` column into integer codes.
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
relationship_codes = {
    'Wife': 1,
    'Own-child': 2,
    'Husband': 3,
    'Not-in-family': 4,
    'Other-relative': 5,
    'Unmarried': 6,
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_relationship = X['relationship'].map(relationship_codes).fillna(0).astype(int).to_numpy()
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(relationship=x_relationship)
print(x_relationship)
print(x_relationship.shape)

[4 3 4 ... 3 2 3]
(48842,)


In [None]:
# Transform the categorical `race` column into integer codes.
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
race_codes = {
    'White': 1,
    'Asian-Pac-Islander': 2,
    'Amer-Indian-Eskimo': 3,
    'Other': 4,
    'Black': 5,
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_race = X['race'].map(race_codes).fillna(0).astype(int).to_numpy()
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(race=x_race)
print(x_race)
print(x_race.shape)

[1 1 1 ... 1 2 1]
(48842,)


In [None]:
# Transform the categorical `sex` column into integer codes.
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
sex_codes = {
    'Male': 1,
    'Female': 2,
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_sex = X['sex'].map(sex_codes).fillna(0).astype(int).to_numpy()
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(sex=x_sex)
print(x_sex)
print(x_sex.shape)

[1 1 1 ... 1 1 1]
(48842,)


In [None]:
# Transform the categorical `native-country` column into integer codes.
# The labels are listed in code order: the first entry gets code 1, the next
# code 2, and so on up to 41. Spellings are kept exactly as they are matched
# against the data (e.g. 'Columbia', 'Trinadad&Tobago', 'Hong', 'South').
# Code 0 is the fallback for anything not listed (missing values, unknown labels).
native_countries = (
    'United-States',               # 1
    'Cambodia',                    # 2
    'England',                     # 3
    'Puerto-Rico',                 # 4
    'Canada',                      # 5
    'Germany',                     # 6
    'Outlying-US(Guam-USVI-etc)',  # 7
    'India',                       # 8
    'Japan',                       # 9
    'Greece',                      # 10
    'South',                       # 11
    'China',                       # 12
    'Cuba',                        # 13
    'Iran',                        # 14
    'Honduras',                    # 15
    'Philippines',                 # 16
    'Italy',                       # 17
    'Poland',                      # 18
    'Jamaica',                     # 19
    'Vietnam',                     # 20
    'Mexico',                      # 21
    'Portugal',                    # 22
    'Ireland',                     # 23
    'France',                      # 24
    'Dominican-Republic',          # 25
    'Laos',                        # 26
    'Ecuador',                     # 27
    'Taiwan',                      # 28
    'Haiti',                       # 29
    'Columbia',                    # 30
    'Hungary',                     # 31
    'Guatemala',                   # 32
    'Nicaragua',                   # 33
    'Scotland',                    # 34
    'Thailand',                    # 35
    'Yugoslavia',                  # 36
    'El-Salvador',                 # 37
    'Trinadad&Tobago',             # 38
    'Peru',                        # 39
    'Hong',                        # 40
    'Holand-Netherlands',          # 41
)
native_country_codes = {
    country: code for code, country in enumerate(native_countries, start=1)
}
# Series.map yields NaN for every label absent from the table; fillna(0) turns
# those into the fallback code before the cast to int.
x_native_country = (
    X['native-country'].map(native_country_codes).fillna(0).astype(int).to_numpy()
)
# Store the codes back in X explicitly instead of relying on element writes
# passing through a NumPy view of the column: the cleaned feature matrix is
# later taken from X, so X has to carry the encoded values.
# NOTE: run this cell once per load; on an already-encoded column every value
# is unlisted and would become 0.
X = X.assign(**{'native-country': x_native_country})
print(x_native_country)
print(x_native_country.shape)

[1 1 1 ... 1 1 1]
(48842,)


In [None]:
# Cleaned data

# Record the column order and the class labels before X stops being a DataFrame.
feature_names = X.columns.tolist()
target_names = ['<=50K', '>50K']
# From here on X is a plain NumPy array; y is used as it already is.
X = X.to_numpy()

# Shapes of the feature matrix and the label vector
print("X shape:", X.shape)
print("Y shape:", y.shape)

# Names of the features
print("Feature names:", feature_names)

# Names of the target classes
print("Target names:", target_names)

# Feature values, then target values
print(X, y, sep="\n")

X shape: (48842, 13)
Y shape: (48842,)
Feature names: ['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
Target names: ['<=50K', '>50K']
[[39 6 77516 ... 0 40 1]
 [50 2 83311 ... 0 13 1]
 [38 1 215646 ... 0 40 1]
 ...
 [38 1 374983 ... 0 50 1]
 [44 1 83891 ... 0 40 1]
 [35 3 182148 ... 0 60 1]]
[0 0 0 ... 0 0 1]


### Your exploration
This is where you will start your work ... Start by looking at what the features are in X and deciding which ones you want

In [None]:
# Assemble the cleaned features and labels into one DataFrame for exploration.
# The frame is built from `X` (the cleaned feature array) and `y` (the 0/1
# labels). Passing `adult.data` (the dataset container, not a table) and
# reading `adult.data.target` gave a frame with column headers but no rows.
# infer_objects() lets pandas pick numeric dtypes if X arrives as an
# object-dtype array; it leaves already-numeric columns unchanged.
df = pd.DataFrame(data=X, columns=feature_names).infer_objects()

# 1 = income above 50K, 0 = otherwise
df['target'] = y
# Human-readable class label: 0 -> '<=50K', 1 -> '>50K'
df['target_name'] = df['target'].map(dict(enumerate(target_names)))

In [None]:
# Preview the first five rows of the exploration frame.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,target_name
