In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
### read the dataset from the CSV file located next to the notebook
df = pd.read_csv(filepath_or_buffer='./bc_dataset.csv')
### preview the first five rows
df.head(n=5)

In [None]:
### list the columns available in the loaded frame
clms = df.columns
print(clms)

### remove id column and unnamed column
### errors='ignore' keeps the cell re-runnable once the columns are already gone
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

### checking null values
### printed explicitly: a bare expression in the middle of a cell is never displayed
print(df.isnull().sum())

### checking info 
df.info()

In [None]:
### summary statistics of the frame, as produced by DataFrame.describe
summary_stats = df.describe()
summary_stats

In [None]:
### rename target column
df = df.rename(columns={'diagnosis': 'target'})
### target column replace with 0 and 1
df['target'] = df['target'].replace({'B': 0, 'M': 1})

In [None]:
### Define a color palette (you can customize this list of colors)
colors = ["#1f77b4", "#ff7f0e"]

### Create the plot with different colors for each bar
ax = sns.countplot(x='target', data=df, palette=colors)

### Add count on each bar
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), 
                textcoords='offset points')

### Show the plot
plt.show()

In [None]:
### check correlation with target
corr_metrix = df.corr()
corr_metrix['target']*100

In [None]:
### correlation matrix in heatmap
plt.figure(figsize=(20,15))
sns.heatmap(corr_metrix, annot=True,cmap='RdYlGn')

In [None]:
### divide dependent and independent variables
x = df.drop(["target"], axis = 1)
y = df.target

In [None]:
### let's divide train and test samples
### The split happens BEFORE oversampling: SMOTE builds synthetic rows by interpolating
### between existing rows, so resampling the full data set first would let information
### from test rows leak into the training set and inflate the test accuracy.
### stratify keeps the class ratio equal in both parts; random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

### oversample the minority class of the training part only; the test part stays untouched
### (n_jobs is not passed: recent imbalanced-learn releases no longer accept it for SMOTE)
from imblearn.over_sampling import SMOTE
oversample = SMOTE(sampling_strategy="minority", random_state=42, k_neighbors=10)
X_train, y_train = oversample.fit_resample(X_train, y_train)

print(len(X_train), len(X_test))
print(len(X_train)+len(X_test))

In [None]:
### class balance of y: l0 counts the labels equal to 0, l1 counts all remaining labels
l0 = sum(1 for label in y if label == 0)
l1 = len(y) - l0
print(l0,l1)

In [None]:
### Random Forest classification
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, criterion='gini', max_features='sqrt')
rf.fit(X_train,y_train)
rf_train_acc = rf.score(X_train,y_train)*100
rf_test_acc = rf.score(X_test,y_test)*100
print('Training Acc-> ',rf_train_acc)
print('Testing Acc -> ', rf_test_acc)

In [None]:
### manual search over a small random-forest grid
### every candidate is scored with 5-fold cross-validation on the training data only,
### so the held-out test set is never used to pick hyper-parameters and its accuracy
### stays an unbiased estimate
from itertools import product
from sklearn.model_selection import cross_val_score

Score = {}
n_estimators = [10,20,30,50,75]
criterion = ['gini', 'entropy', 'log_loss']
max_features = ['sqrt', 'log2']
for n, c, mf in product(n_estimators, criterion, max_features):
    ### fixed seed: differences between candidates come from the parameters, not from chance
    model = RandomForestClassifier(n_estimators=n, criterion=c, max_features=mf, random_state=42)
    acc = cross_val_score(model, X_train, y_train, cv=5).mean()*100
    Score[n,c,mf] = acc

### best (n_estimators, criterion, max_features) combination and its mean CV accuracy in percent
tunedVal = max(Score,key=Score.get)
tunedVal,Score[tunedVal]

In [None]:
# # Hyperparameter tuning of the random forest with GridSearchCV (disabled: this whole cell is commented out)

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# import matplotlib.pyplot as plt

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)


# from sklearn.model_selection import GridSearchCV
# grid_param = {
#     'n_estimators' : [10,20,30,50,75],
#     'criterion' : ['gini', 'entropy', 'log_loss'],
#     'max_depth' : [7, 10, 13, 15],
#     'class_weight': ['balanced', 
#                     #  'balanced_subsample'
#                      ],
#     'min_samples_leaf' : [1, 3, 5, 7],
#     'min_samples_split' : [2, 3, 5, 7],
#     'max_features' : ['sqrt', 'log2']
# }

# grid_search_rf = GridSearchCV(rf, grid_param, cv = 5, n_jobs = -1, verbose = 3)
# grid_search_rf.fit(X_train, y_train)



# # best parameters and best score
# print(grid_search_rf.best_params_)
# print(grid_search_rf.best_score_)
# print(grid_search_rf.best_estimator_)

In [None]:
### Random Forest classification after imbalance handling and grid search
from sklearn.ensemble import RandomForestClassifier
### random_state pins the bootstrap samples and feature draws, so the accuracies printed
### below (and the model pickled afterwards) are the same on every run
rf = RandomForestClassifier(class_weight='balanced',n_estimators=50,criterion='gini',max_depth=10,max_features='sqrt',
                            # min_samples_leaf=3,# min_samples_split=2,
                            random_state=42,
                        )
rf.fit(X_train,y_train)
### accuracy on both splits, expressed as a percentage
rf_train_acc = rf.score(X_train,y_train)*100
rf_test_acc = rf.score(X_test,y_test)*100
print('Training Acc-> ',rf_train_acc)
print('Testing Acc -> ', rf_test_acc)

In [None]:
### metric helpers used by this cell and by the evaluation cells that follow
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
)

### predicted labels for the held-out split
y_pred = rf.predict(X_test)
print(y_pred)

### confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
### confusion matrix as a heatmap, followed by the per-class text report
plt.figure(figsize=(10,7))
### fmt='d' prints the cell counts as plain integers; the default format switches to
### scientific notation (e.g. 1e+02) as soon as a count reaches three digits
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test, y_pred))

In [None]:
### F1 score computed from the actual predictions rather than from precision/recall
### figures typed in by hand, which go stale whenever the model or the split changes
### (imported under an alias so the result can keep the variable name f1_score)
from sklearn.metrics import f1_score as compute_f1
### with its default settings this is the F1 of the class labelled 1
### NOTE(review): confirm this is the class the hand-typed 0.97 / 1 figures referred to
f1_score = compute_f1(y_test, y_pred)
f1_score

In [None]:
### error metrics between the true and the predicted labels
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
### root of the mean squared error
rmse = np.sqrt(mse)

for label, value in (('mae => ', mae), ('mse => ', mse), ('rmse => ', rmse)):
    print(label, value)

In [None]:
# PERSIST THE TRAINED MODEL WITH THE PICKLE PACKAGE
import pickle

# destination file for the fitted random forest (rf)
model_pkl_file = "./bc-rf.pkl"

# pickle data is binary, hence the 'wb' mode
with open(model_pkl_file, mode='wb') as model_out:
    pickle.dump(rf, model_out)

In [None]:
# RELOAD THE SAVED MODEL WITH THE PICKLE PACKAGE AND EVALUATE IT
# NOTE: unpickling can execute arbitrary code, so only load files written by this notebook
with open(model_pkl_file, mode='rb') as model_in:
    loaded_rf = pickle.load(model_in)

# the file is only needed for loading; predictions and scoring happen after it is closed
y_pred = loaded_rf.predict(X_test)
# check results
pred = loaded_rf.score(X_test, y_test)
print(f"Accuracy : {pred * 100}%")