<a href="https://colab.research.google.com/github/HiwaTase/Machine-Learning/blob/main/Project3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Load, clean, and preprocess the data**

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, accuracy_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the balanced health-insurance dataset from the project repository
url = 'https://raw.githubusercontent.com/HiwaTase/Machine-Learning/main/healthINS_balanced.csv'
df = pd.read_csv(url)

# Check for missing
print("Missing values in each column:")
print(df.isnull().sum())

# Drop the columns that are not used as model features.
# Only WKSWORK2, classwkr, empstat and vetstat actually contain missing
# values; marst, school, cit2, metro and region are complete but are
# excluded from the feature set as well.
df = df.drop(
    columns=["marst", "WKSWORK2", "classwkr", "empstat", "vetstat",
             "school", "cit2", "metro", "region"]
)

# Clamp 'inctot' to the range [0, 500000]: negative incomes are floored at 0
# and incomes above 500,000 are top-coded. Series.clip is the vectorised
# equivalent of applying min(max(x, 0), 500000) to every row.
df['inctot'] = df['inctot'].clip(lower=0, upper=500000)

# Verify the remaining missing values
print("Missing values after cleaning:")
print(df.isnull().sum())

# Split the data into features and target variable
X = df.drop('nohealthins', axis=1)
y = df['nohealthins']

# Split the data into training and testing sets (80/20), stratified on the
# target so both sets keep the same class proportions; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Verify the split is balanced
print("Training set target class distribution:")
print(y_train.value_counts())
print("\nTesting set target class distribution:")
print(y_test.value_counts())

Missing values in each column:
nohealthins        0
marst              0
race2              0
WKSWORK2       24857
classwkr       16731
empstat          944
inctot             0
uhrswork           0
age                0
school             0
cit2               0
educ_att           0
female             0
metro              0
region             0
vetstat         1978
dtype: int64
Missing values after cleaning:
nohealthins    0
race2          0
inctot         0
uhrswork       0
age            0
educ_att       0
female         0
dtype: int64
Training set target class distribution:
nohealthins
1    29704
0    26184
Name: count, dtype: int64

Testing set target class distribution:
nohealthins
1    7427
0    6546
Name: count, dtype: int64


In [6]:
# Requires the cleaned `df` produced by the previous cell (run that cell first)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Define feature categories
categorical_features = ['race2', 'educ_att']
numerical_features = ['inctot', 'age', 'uhrswork']

# Column transformer setup:
#   - numerical columns are standardised,
#   - categorical columns are one-hot encoded,
#   - every other column of X (here: 'female') is passed through unchanged.
# handle_unknown='ignore' keeps transform() from raising if the test set
# contains a category that never appeared in the training set.
# sparse_threshold=0 forces a dense array as output: the PCA step applied to
# these features later (n_components given as a variance fraction) cannot be
# fitted on a sparse matrix.
column_transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough',
    sparse_threshold=0
)

# The train/test split (X_train, X_test, y_train, y_test) created in the
# previous cell is reused here; calling train_test_split again with the same
# arguments and seed would only rebuild identical frames.

# Apply transformations: fit on the training set only, then apply the fitted
# transformer to the test set so no test-set statistics leak into training.
X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)


**Step 2: Tune a random forest with grid search, then evaluate it on PCA-reduced features**

In [None]:
# Define parameter grids for grid search
# NOTE: knn_param_grid is only defined in this cell; no KNN search is run here.
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None]
}

# Perform grid search for Random Forest: 5-fold cross-validation over every
# combination in rf_param_grid, selecting the combination with the best F1.
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)
rf_grid_search.fit(X_train_transformed, y_train)

# Get the best Random Forest model (refit on the full training set, no PCA)
best_rf = rf_grid_search.best_estimator_

# Initialize PCA and the best classifier.
# n_components=0.95 keeps as many components as needed to explain 95% of the
# variance. The classifier reuses the tuned hyperparameters and the same seed
# as the grid search so that the reported scores are reproducible.
# NOTE(review): the hyperparameters were tuned on the features *without* PCA;
# they are not guaranteed to be the best choice after dimensionality reduction.
pca = PCA(n_components=0.95)
classifier = RandomForestClassifier(random_state=42, **rf_grid_search.best_params_)

# Create a pipeline
pipeline = Pipeline(steps=[('pca', pca), ('classifier', classifier)])

# Fit the model
pipeline.fit(X_train_transformed, y_train)

# Evaluate the model: total variance retained by the kept components
print("Explained Variance Ratio:", pca.explained_variance_ratio_.sum())

# Predict on the held-out test set and report accuracy, plus F1 because that
# is the metric the grid search optimised.
y_pred = pipeline.predict(X_test_transformed)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))