In [3]:
!pip3 install path

Collecting path
  Downloading https://files.pythonhosted.org/packages/cb/81/b9090d24e60369fd9413b92fcd87e13a37bf43dad3427d35e09915f788ac/path-15.0.0-py3-none-any.whl
Installing collected packages: path
Successfully installed path-15.0.0


In [7]:

from path import Path
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# Import our input dataset
from google.colab import files
uploaded = files.upload()

Saving bcsc_risk_factors_summarized1_092020.csv to bcsc_risk_factors_summarized1_092020.csv
Saving bcsc_risk_factors_summarized3_092020.csv to bcsc_risk_factors_summarized3_092020.csv
Saving bcsc_risk_factors_summarized2_092020.csv to bcsc_risk_factors_summarized2_092020.csv


In [8]:
import io

risk_factor1= pd.read_csv(io.BytesIO(uploaded['bcsc_risk_factors_summarized1_092020.csv']))
risk_factor2= pd.read_csv(io.BytesIO(uploaded['bcsc_risk_factors_summarized3_092020.csv']))
risk_factor3= pd.read_csv(io.BytesIO(uploaded['bcsc_risk_factors_summarized2_092020.csv']))


In [9]:
merged_df= pd.concat([risk_factor1,risk_factor2,risk_factor3])

In [10]:
merged_df.columns

Index(['year', 'age_group_5_years', 'race_eth', 'first_degree_hx',
       'age_menarche', 'age_first_birth', 'BIRADS_breast_density',
       'current_hrt', 'menopaus', 'bmi_group', 'biophx',
       'breast_cancer_history', 'count'],
      dtype='object')

In [11]:
merged_df.dtypes

year                     int64
age_group_5_years        int64
race_eth                 int64
first_degree_hx          int64
age_menarche             int64
age_first_birth          int64
BIRADS_breast_density    int64
current_hrt              int64
menopaus                 int64
bmi_group                int64
biophx                   int64
breast_cancer_history    int64
count                    int64
dtype: object

In [12]:
#eliminate breast cancer diagnosis unknown 

merged_df= merged_df.loc[merged_df['breast_cancer_history'] != 9]

In [13]:
merged_df['breast_cancer_history'].nunique()

2

In [14]:
#define the feature set

X= merged_df.copy()
X= X.drop(['breast_cancer_history','year'], axis=1)
X.head()

Unnamed: 0,age_group_5_years,race_eth,first_degree_hx,age_menarche,age_first_birth,BIRADS_breast_density,current_hrt,menopaus,bmi_group,biophx,count
0,7,1,0,9,3,1,1,2,3,0,7
1,7,1,0,9,3,1,1,2,3,1,3
2,7,1,0,9,3,1,1,2,4,0,6
3,7,1,0,9,3,1,1,2,4,1,1
4,7,1,0,9,3,1,1,2,4,1,1


In [15]:
#define target set
y= merged_df['breast_cancer_history'].ravel()

In [16]:
# Splitting into Tran and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create a rando forest classifier
rf_model= RandomForestClassifier(n_estimators= 128)

In [19]:
#fitting the model 
rf_model= rf_model.fit(X_train_scaled, y_train)

In [21]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [22]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,223292,19533
Actual 1,24641,30394


In [23]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [24]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,223292,19533
Actual 1,24641,30394


Accuracy Score : 0.8516954273819916
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.92      0.91    242825
           1       0.61      0.55      0.58     55035

    accuracy                           0.85    297860
   macro avg       0.75      0.74      0.74    297860
weighted avg       0.85      0.85      0.85    297860



In [25]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.14135996, 0.08303595, 0.04577236, 0.06699229, 0.13447084,
       0.11037339, 0.04979091, 0.03590715, 0.09274862, 0.14743048,
       0.09211805])

In [26]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.14743048145978196, 'biophx'),
 (0.14135995845524404, 'age_group_5_years'),
 (0.13447083623718967, 'age_first_birth'),
 (0.11037339310704757, 'BIRADS_breast_density'),
 (0.092748621656338, 'bmi_group'),
 (0.09211805035404136, 'count'),
 (0.08303594748328931, 'race_eth'),
 (0.06699228937540021, 'age_menarche'),
 (0.04979091360512541, 'current_hrt'),
 (0.045772355922886324, 'first_degree_hx'),
 (0.035907152343656205, 'menopaus')]