In [1]:
# Import the useful libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import sys
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer
import warnings

# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so every warning category is hidden —
# consider narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")
# Add the parent directory to the module search path so that the project-local
# `scripts` package imported in the next cell can be found.
sys.path.append('..')

In [None]:
# Importing the useful modules from the scripts folder

from scripts import vizualization as viz
from scripts import utils

In [None]:
# Loading the raw CSV (path is relative to the notebook's directory) into `df`
# and displaying its column labels

df = pd.read_csv("../data/data.csv")
df.columns

In [None]:
# Dropping the unwanted 'Unnamed: 32' column.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes a repeated run of this cell a no-op rather than a
# KeyError, so the cell is safe to re-execute.

df = df.drop(columns=['Unnamed: 32'], errors='ignore')
df.columns

In [None]:
# Printing the dataframe summary (column names, non-null counts, dtypes, memory usage)

df.info()

In [None]:
# Showing descriptive statistics (count, mean, std, min, quartiles, max) of the data

df.describe()

In [None]:
# Counting the missing values in each column

df.isna().sum()

In [None]:
# Checking the number of rows and columns of the data frame

df.shape

In [None]:
# Counting how many rows fall under each distinct value of the diagnosis column

df["diagnosis"].value_counts()

In [None]:
# Visualizing the diagnosis column with the project's count-plot helper

viz.count_plot(df,'diagnosis')

In [None]:
# Encoding the string-valued attributes, working on a copy of `df`.
# NOTE(review): the return value of utils.encoding_data is not assigned, so the
# helper presumably encodes `encoded_df` in place — confirm against scripts/utils.

encoded_df = df.copy()
utils.encoding_data(encoded_df)

In [None]:
# Finding the correlation of all the features in the encoded dataframe.
# The second argument is passed as the plot title and the third as a file name
# (presumably where the figure is saved — verify in scripts/utils).

utils.corr_matrix(encoded_df,'Correlation matrix of all the dataset','general_correlation.jpg')

In [None]:
# Correlation matrix restricted to the columns that the project helper
# reports as highly correlated

columns_for_analysis = utils.find_high_corr(encoded_df)
high_corr_df = encoded_df[columns_for_analysis]
utils.corr_matrix(
    high_corr_df,
    'Correlation matrix for highly related features',
    'higher_correlation.jpg',
)

In [None]:
# Distribution plots for the ten "*_mean" measurement columns

mean_col = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

viz.plot_ditribution(df, mean_col)

In [None]:
# Base measurement names and the three suffixes they come in.
# NOTE(review): presumably the helper combines them as "<column>_<field>" and
# plots each resulting feature against the diagnosis — verify in scripts/vizualization.
columns = ["radius", "texture", "perimeter", "area", "smoothness", "compactness", "concavity", "concave points", "symmetry", "fractal_dimension"]
fields = ["mean", "se", "worst"]
viz.feature_vs_target(df,columns,fields)

In [None]:
# Visualizing outliers in two columns of the raw dataframe.
# Note: `columns` is rebound here (it held the base measurement names before)
# and is reused by the later outlier plot on `df_clean`.

columns = ['texture_mean', 'radius_mean']
viz.plot_outlier(df,columns,'Outliers in texture_mean and radius_mean \n')

In [None]:
# Fixing the outliers with the project helper.
# NOTE(review): `df_clean` is only used by the outlier plot in the next cell;
# the scaling and modelling cells below start from `encoded_df`, which was
# copied from `df` before this step, so the outlier fix does not reach the
# model — confirm this is intended.

df_clean = utils.fix_outlier(df)

In [None]:
# Re-plotting the same two columns on the outlier-fixed dataframe to check
# whether any outliers remain

viz.plot_outlier(df_clean,columns, 'Outliers fixed in texture_mean and radius_mean \n')

In [None]:
# Scaling the encoded dataframe with the project helper and previewing 3 rows.
# NOTE(review): the whole frame is passed in, including `id` (dropped only
# afterwards) and the `diagnosis` target — verify that utils.scaler leaves the
# target usable as class labels.

scaled_df = utils.scaler(encoded_df)
scaled_df.head(3)

In [None]:
# Remove the identifier column, which is not a predictive feature,
# and preview the result

new_df = scaled_df.drop(columns=['id'])
new_df.head()

In [None]:
# Split the frame into the target vector (diagnosis) and the feature matrix
# (every remaining column)

y = new_df["diagnosis"]
X = new_df.drop(columns=["diagnosis"])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable

split = train_test_split(X, y, test_size=0.2, random_state=10)
X_train, X_test, y_train, y_test = split

In [None]:
# Model definition
# random_state is fixed (same seed as the train/test split) so that the
# reported accuracy and the feature-importance ranking are reproducible
# across Restart & Run All instead of changing on every run.

classifier = RandomForestClassifier(n_estimators=100, random_state=10)

In [None]:
# Fitting the random forest on the training split

classifier.fit(X_train, y_train)

In [None]:
# Predicting the diagnosis for the held-out test features

y_pred = classifier.predict(X_test)

In [None]:
# Finding out the model accuracy on the held-out test split.
# The message now names the model correctly: it is a RandomForestClassifier,
# not a regressor.

print('Accuracy Score of the random forest classifier is :', accuracy_score(y_test, y_pred))

In [None]:
# Finding the feature importance: pair every feature name with the importance
# score reported by the fitted forest, rank them in descending order and keep
# the ten highest-ranked rows.

importances = classifier.feature_importances_
labels = X.columns
feature_df = pd.DataFrame(list(zip(labels, importances)), columns=["feature", "importance"])
feature_df = feature_df.sort_values(by='importance', ascending=False)
significant_features_df = feature_df.head(10)
# Only the last expression of a cell is displayed, so just the preview is kept.
significant_features_df.head()

In [None]:
# Visualizing the top 10 features as a bar plot; the last argument is a file
# name (presumably where the figure is saved — verify in scripts/vizualization)

viz.plot_bar(significant_features_df,'Top 10 features', 'top_10_features.jpg')

In [None]:
# Feature subset kept for the final dataset, chosen after the analysis and
# feature-importance steps above

significant_feature_columns = [
    'concave points_mean',
    'radius_worst',
    'concave points_worst',
    'perimeter_worst',
    'area_mean',
    'perimeter_mean',
    'radius_mean',
]
final_df = new_df.loc[:, significant_feature_columns]
final_df.head()

In [None]:
# Normalizing the selected features for better modeling.
# NOTE(review): sklearn's Normalizer rescales each *row* (sample) to unit norm,
# not each column — confirm that per-sample normalisation is what is wanted.
# The transformed array is wrapped back into a DataFrame that keeps the row
# index of `final_df`, so later index-aligned operations (such as re-attaching
# the diagnosis column from `new_df`) match the correct rows even when the
# index is not the default 0..n-1 range.

norm = Normalizer()
clean_df = pd.DataFrame(
    norm.fit_transform(final_df),
    columns=significant_feature_columns,
    index=final_df.index,
)

In [None]:
# Adding the diagnosis column after normalizing the dataset.
# Assigning a Series aligns on index labels, so rows are matched only where
# `clean_df` and `new_df` share the same labels; unmatched rows would get NaN.

clean_df['diagnosis'] = new_df['diagnosis']
clean_df.head()

In [None]:
# Writing the processed dataframe to a CSV file (path relative to the
# notebook's directory), without the index column

clean_df.to_csv('../data/cleaned_data.csv',index=False)