In [1]:
import json
import os
import pandas as pd

# Location of the preprocessed dataset; the JSON is expected to hold the
# table under a top-level 'public' key.
file_path = r"C:\Users\kyram\OneDrive\School\Prepossesed Data.json"

# Fail fast when the file is missing. Printing a message and carrying on would
# leave `df` undefined, and the notebook would only break later with an
# unrelated-looking NameError in the cell that builds X and y.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file does not exist at the specified path: {file_path}")

# Read as UTF-8 explicitly: without `encoding`, open() uses the platform
# default (e.g. cp1252 on Windows), which can mis-decode non-ASCII JSON text.
with open(file_path, 'r', encoding='utf-8') as file:
    preprocessed_data = json.load(file)

# Show the top-level keys so the reader can see what the file contains
print(preprocessed_data.keys())

# Same reasoning as above: a missing key must stop the run here, not later.
if 'public' not in preprocessed_data:
    raise KeyError("'public' key not found in the JSON file")

# Convert the stored dictionary back to a DataFrame
public_data = preprocessed_data['public']
df = pd.DataFrame.from_dict(public_data)
print(df)


dict_keys(['public'])
       Age   AsAm  AmerInd  AvgDrinksPerWeek  Black  DrinkDaysPerWeek  \
0     83.0  False      0.0               0.0    0.0               0.0   
1     58.0  False      0.0              35.0    0.0               7.0   
2     40.0  False      0.0               0.0    0.0               0.0   
3     66.0  False      0.0               0.0    0.0               0.0   
4     39.0  False      0.0               0.0    1.0               0.0   
...    ...    ...      ...               ...    ...               ...   
6247  48.0  False      0.0               4.0    0.0               2.0   
6248  50.0  False      0.0               0.0    0.0               0.0   
6249  55.0  False      0.0               3.0    1.0               1.0   
6250  62.0  False      0.0              84.0    0.0               7.0   
6251  57.0  False      0.0               0.0    0.0               0.0   

      DrinksOneOccasion  DrinksPerDay  Education  EverHadCancer  ...  \
0                   0.0      

In [2]:
# Import the necessary libraries
import imblearn
import numpy as np 
import sklearn 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Target: the 'EverHadCancer' column. Features: every other column of df.
y = df['EverHadCancer']

# drop() returns a new frame, so df itself is left unmodified.
X = df.drop(columns=['EverHadCancer'])

In [4]:
# Hold out 20% of the rows as a test set; stratify on y and fix the seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# Oversample the training split only; X_test / y_test are not touched here.
# NOTE: X_train and y_train are rebound to the resampled versions.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X=X_train, y=y_train)


In [6]:
# Fit a depth-limited random forest on the (resampled) training data
model = RandomForestClassifier(random_state=42, max_depth=10)
model.fit(X_train, y_train)
print("Model training complete")

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# View classification metrics (precision and recall are support-weighted averages)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "Precision:",
    precision_score(y_test, y_pred, average='weighted'),
)
print(
    "Recall:",
    recall_score(y_test, y_pred, average='weighted'),
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred),
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred))

Model training complete
Accuracy: 0.7410071942446043
Precision: 0.8233834891017981
Recall: 0.7410071942446043
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.77      0.84      1071
         1.0       0.29      0.56      0.38       180

    accuracy                           0.74      1251
   macro avg       0.60      0.67      0.61      1251
weighted avg       0.82      0.74      0.77      1251

Confusion Matrix: [[826 245]
 [ 79 101]]


In [8]:
# Importance scores read from the fitted model
importances = model.feature_importances_

# Pair each column name of X with its score, then order the table
# from highest to lowest importance.
feature_importance_df = pd.DataFrame(
    {'feature': X.columns, 'importance': importances}
).sort_values(by='importance', ascending=False)

print(feature_importance_df)


                                              feature  importance
0                                                 Age    0.246302
10                                     FreqGoProvider    0.075496
41                         DocTellColorectalTests_Yes    0.052696
39  DocTellColorectalTests_I have never discussed ...    0.036784
15                                       IncomeRanges    0.030436
17                               MedConditions_HighBP    0.029299
29                                              White    0.027029
8                                           Education    0.025222
3                                    AvgDrinksPerWeek    0.024165
9                                FamilyEverHadCancer2    0.022531
28                                        WhenPapTest    0.022377
14                                               Hisp    0.021751
11                                   HealthInsurance2    0.019903
5                                    DrinkDaysPerWeek    0.019038
26        

In [9]:
# Destination file for the ranked feature importances
save_path = r"C:\Users\kyram\OneDrive\School\RF3_importance.json"

# Wrap the table (as a plain dictionary) under a 'public' key
RF_importance = {'public': feature_importance_df.to_dict()}

# Create the parent directory if it does not exist yet
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Serialize the dictionary to the JSON file
with open(save_path, 'w') as file:
    json.dump(RF_importance, file)

print(f"Data successfully saved to {save_path}")

Data successfully saved to C:\Users\kyram\OneDrive\School\RF3_importance.json
