In [14]:
import json
import os
import pandas as pd

# Path to the preprocessed JSON data file.
# NOTE(review): the file name is spelled "Prepossesed"; confirm it matches the
# file on disk before correcting the spelling here.
file_path = r"C:\Users\kyram\OneDrive\School\Prepossesed Data.json"

# Confirm the file exists before trying to read it
if not os.path.exists(file_path):
    print(f"The file does not exist at the specified path: {file_path}")
else:
    # Load the JSON. The encoding is passed explicitly because open() otherwise
    # uses the locale's preferred encoding (often cp1252 on Windows), which can
    # mis-decode or reject non-ASCII characters in a UTF-8 encoded JSON file.
    with open(file_path, 'r', encoding='utf-8') as file:
        preprocessed_data = json.load(file)

    # Print the top-level keys so the structure of the file is visible
    print(preprocessed_data.keys())

    # The tabular data is read from the 'public' key
    if 'public' in preprocessed_data:
        public_data = preprocessed_data['public']
        # Convert the dictionary back to a DataFrame
        df = pd.DataFrame.from_dict(public_data)
        print(df)
    else:
        # Without this key `df` is never created, so later cells that use it will fail
        print("'public' key not found in the JSON file")


dict_keys(['public'])
       Age   AsAm  AmerInd  AvgDrinksPerWeek  Black  DrinkDaysPerWeek  \
0     83.0  False      0.0               0.0    0.0               0.0   
1     58.0  False      0.0              35.0    0.0               7.0   
2     40.0  False      0.0               0.0    0.0               0.0   
3     66.0  False      0.0               0.0    0.0               0.0   
4     39.0  False      0.0               0.0    1.0               0.0   
...    ...    ...      ...               ...    ...               ...   
6247  48.0  False      0.0               4.0    0.0               2.0   
6248  50.0  False      0.0               0.0    0.0               0.0   
6249  55.0  False      0.0               3.0    1.0               1.0   
6250  62.0  False      0.0              84.0    0.0               7.0   
6251  57.0  False      0.0               0.0    0.0               0.0   

      DrinksOneOccasion  DrinksPerDay  Education  EverHadCancer  ...  \
0                   0.0      

In [15]:
# Import the necessary libraries
import imblearn
import numpy as np 
import sklearn 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [16]:
# Separate the target from the predictors.
# 'FamilyEverHadCancer2' is the label; both it and 'EverHadCancer' are left out
# of the feature matrix. DataFrame.drop returns a new frame, so `df` is untouched.
y = df['FamilyEverHadCancer2']
X = df.drop(columns=['EverHadCancer', 'FamilyEverHadCancer2'])

In [17]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [18]:
# Balance the classes in the training split with random oversampling.
# Only X_train / y_train are replaced here; the test split is not resampled.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X=X_train, y=y_train)


In [19]:
# Fit a random forest (trees capped at depth 10, fixed seed) on the training data.
# fit() returns the estimator itself, so the call can be chained onto the constructor.
model = RandomForestClassifier(max_depth=10, random_state=42).fit(X_train, y_train)

print("Model training complete")

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# View classification metrics; precision and recall are support-weighted
# averages across the classes.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Precision:", precision_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:", confusion_matrix(y_true=y_test, y_pred=y_pred))



Model training complete
Accuracy: 0.7210231814548361
Precision: 0.713779038984668
Recall: 0.7210231814548361
Classification Report:
               precision    recall  f1-score   support

         0.0       0.64      0.53      0.58       452
         1.0       0.76      0.83      0.79       799

    accuracy                           0.72      1251
   macro avg       0.70      0.68      0.69      1251
weighted avg       0.71      0.72      0.71      1251

Confusion Matrix: [[240 212]
 [137 662]]


In [20]:
# Pull the per-feature importance scores from the fitted model
importances = model.feature_importances_

# Pair each score with its column name and rank from most to least important.
# NOTE(review): this assumes X.columns is in the same order as the columns the
# model was fitted on — confirm if the training frame is ever reordered.
feature_importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print(feature_importance_df)

                                              feature  importance
0                                                 Age    0.078918
28                                              White    0.057697
29                                 BirthGender_Female    0.053425
11                                   HeardHPVVaccine2    0.050192
14                                       IncomeRanges    0.050024
9                                      FreqGoProvider    0.040416
8                                           Education    0.037003
47                                SmokeNow_Not at all    0.035554
12                            HPVCauseCancer_Cervical    0.033775
3                                    AvgDrinksPerWeek    0.031694
27                                        WhenPapTest    0.028564
5                                    DrinkDaysPerWeek    0.025421
13                                               Hisp    0.024972
36                                     DocTalkLDCT_No    0.024786
25        

In [21]:
# Destination file for the exported feature-importance table
save_path = r"C:\Users\kyram\OneDrive\School\RF2_importance.json"

# Wrap the DataFrame, converted to a nested dictionary, under a 'public' key
RF_importance = {'public': feature_importance_df.to_dict()}

# Create the destination directory if it is missing
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Serialize the dictionary to the JSON file
with open(save_path, mode='w') as file:
    json.dump(obj=RF_importance, fp=file)

print(f"Data successfully saved to {save_path}")

Data successfully saved to C:\Users\kyram\OneDrive\School\RF2_importance.json
