## Data Wrangling in Pandas
 ![text_here](images/img_one.jpg)  




## Data Exploration

In [1]:
import pandas as pd
# Display settings: never truncate columns, cap printed rows at 40.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)

# Relative path of the CSV file that is loaded into `df`.
filename = 'data/breast_cancer_data.csv'

df = pd.read_csv(filename)

In [2]:
# Start off by actually looking at your data set
df

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
5,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith
6,1018099,1.0,,1,1,2,10,3.0,1.0,1,benign,Dr. Doe
7,1018561,2.0,1.0,2,1,2,1,3.0,1.0,1,benign,Dr. Smith
8,1033078,2.0,1.0,1,1,2,1,1.0,1.0,5,benign,Dr. Smith
9,1033078,4.0,2.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe


In [3]:
# What is the size of our dataset? `shape` is a (rows, columns) tuple.
df.shape

(699, 12)

In [4]:
# List every column name together with its data type
df.dtypes

patient_id                 int64
clump_thickness          float64
cell_size_uniformity     float64
cell_shape_uniformity      int64
marginal_adhesion          int64
single_ep_cell_size        int64
bare_nuclei               object
bland_chromatin          float64
normal_nucleoli          float64
mitoses                    int64
class                     object
doctor_name               object
dtype: object

In [5]:
# It's good to inspect your unique key identifier: count the distinct values in each column
df.nunique()

patient_id               645
clump_thickness           10
cell_size_uniformity      10
cell_shape_uniformity     10
marginal_adhesion         10
single_ep_cell_size       10
bare_nuclei               11
bland_chromatin           10
normal_nucleoli           10
mitoses                    9
class                      2
doctor_name                4
dtype: int64

In [6]:
# List all column names
df.columns

Index([u'patient_id', u'clump_thickness', u'cell_size_uniformity',
       u'cell_shape_uniformity', u'marginal_adhesion', u'single_ep_cell_size',
       u'bare_nuclei', u'bland_chromatin', u'normal_nucleoli', u'mitoses',
       u'class', u'doctor_name'],
      dtype='object')

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.describe()

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bland_chromatin,normal_nucleoli,mitoses
count,699.0,698.0,698.0,699.0,699.0,699.0,695.0,698.0,699.0
mean,1071704.0,4.416905,3.137536,3.207439,2.793991,3.216023,3.447482,2.868195,1.589413
std,617095.7,2.817673,3.052575,2.971913,2.843163,2.2143,2.441191,3.055647,1.715078
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0
75%,1238298.0,6.0,5.0,5.0,3.5,4.0,5.0,4.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [8]:
# Group the rows by class and doctor, then count the rows in each group (size = row count).
df.groupby(['class', 'doctor_name']).size()

class      doctor_name
benign     Dr. Doe        127
           Dr. Lee        121
           Dr. Smith      102
           Dr. Wong       108
malignant  Dr. Doe         58
           Dr. Lee         60
           Dr. Smith       74
           Dr. Wong        49
dtype: int64

## Data Preprocessing

In [9]:
# Dealing with missing values: how many np.nan entries does each column have?

df.isna().sum() 

patient_id               0
clump_thickness          1
cell_size_uniformity     1
cell_shape_uniformity    0
marginal_adhesion        0
single_ep_cell_size      0
bare_nuclei              2
bland_chromatin          4
normal_nucleoli          1
mitoses                  0
class                    0
doctor_name              0
dtype: int64

In [10]:
# Alternative approach (left disabled): replace missing values with zero.
# df = df.fillna(0) 

In [11]:
# Drop COLUMNS whose values are all missing (axis=1, how='all'). The per-column
# missing counts shown earlier are at most 4, so no column qualifies and df keeps its shape.
# NOTE(review): to drop ROWS containing any NaN, use dropna(axis=0, how='any') — confirm intent.
df = df.dropna(axis = 1, how = 'all')

# Rename demo: index=str converts the index labels to strings, and the columns
# mapping renames 'patient_id' to itself. The result is displayed but not
# assigned, so df is left unchanged.
df.rename(index =str, columns = {'patient_id':'patient_id'})

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
5,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith
6,1018099,1.0,,1,1,2,10,3.0,1.0,1,benign,Dr. Doe
7,1018561,2.0,1.0,2,1,2,1,3.0,1.0,1,benign,Dr. Smith
8,1033078,2.0,1.0,1,1,2,1,1.0,1.0,5,benign,Dr. Smith
9,1033078,4.0,2.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe


In [12]:
# It's good to inspect unique key identifiers: count the distinct values in each column
df.nunique()

patient_id               645
clump_thickness           10
cell_size_uniformity      10
cell_shape_uniformity     10
marginal_adhesion         10
single_ep_cell_size       10
bare_nuclei               11
bland_chromatin           10
normal_nucleoli           10
mitoses                    9
class                      2
doctor_name                4
dtype: int64

In [13]:
# Rows that are exact copies (every column equal) of a row that appears later in the frame.
df.loc[df.duplicated(keep='last')]

# # To list every row whose patient_id occurs more than once (other columns may differ):
# df[df.duplicated(subset = 'patient_id', keep =False)].sort_values('patient_id')

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
168,1198641,3.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Lee


In [14]:
# Duplicated rows were found, so remove them: rows that are identical in every
# column are collapsed to their first occurrence.
df = df.drop_duplicates(keep='first')

In [15]:
# Number of rows per patient_id, largest counts first.
repeat_patients = (
    df.groupby('patient_id')
    .size()
    .sort_values(ascending=False)
)
repeat_patients

patient_id
1182404    6
1276091    5
1105524    2
1299596    2
385103     2
734111     2
411453     2
1143978    2
1218860    2
822829     2
1240603    2
1070935    2
466906     2
1299924    2
1238777    2
1321942    2
1114570    2
798429     2
493452     2
1115293    2
          ..
1206841    1
1207986    1
1208301    1
1210963    1
1217051    1
1216947    1
1216694    1
1214966    1
1214556    1
1214092    1
1213784    1
1213383    1
1213375    1
1213273    1
1212251    1
1212232    1
1211594    1
1211265    1
1211202    1
61634      1
Length: 645, dtype: int64

In [16]:
# How to reverse (negate) a condition?
# For a plain Python boolean use `not`. The `~` operator is bitwise NOT and binds
# tighter than `==`: `~1 == 1` is evaluated as `(-2) == 1`, which is False only by
# coincidence, not because the comparison was negated. `~` is the right operator
# for negating pandas boolean masks (element-wise), as in the filtering cells.
print(1 == 1)
print(not (1 == 1))

True
False


In [17]:
# Patients that appear in more than two rows, as a frame with a 'patient_id' column.
filtered_patients = repeat_patients.loc[repeat_patients > 2].to_frame().reset_index()

# Keep only the rows whose patient_id is NOT among those patients (`~` negates the mask).
filtered_df = df.loc[~df['patient_id'].isin(filtered_patients['patient_id'])]
filtered_df

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
5,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith
6,1018099,1.0,,1,1,2,10,3.0,1.0,1,benign,Dr. Doe
7,1018561,2.0,1.0,2,1,2,1,3.0,1.0,1,benign,Dr. Smith
8,1033078,2.0,1.0,1,1,2,1,1.0,1.0,5,benign,Dr. Smith
9,1033078,4.0,2.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe


In [18]:
# All rows belonging to the patients that appear in more than two rows.

df.loc[df['patient_id'].isin(filtered_patients['patient_id'])]

Unnamed: 0,patient_id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
136,1182404,4.0,1.0,1,1,2,1,2.0,1.0,1,benign,Dr. Lee
241,1276091,3.0,1.0,1,3,1,1,3.0,1.0,1,benign,Dr. Wong
256,1182404,3.0,1.0,1,1,2,1,1.0,1.0,1,benign,Dr. Wong
257,1182404,3.0,1.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe
265,1182404,5.0,1.0,4,1,2,1,3.0,2.0,1,benign,Dr. Lee
429,1276091,2.0,1.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe
430,1276091,1.0,3.0,1,1,2,1,2.0,2.0,1,benign,Dr. Wong
431,1276091,5.0,1.0,1,3,4,1,3.0,2.0,1,benign,Dr. Wong
448,1182404,1.0,1.0,1,1,1,1,1.0,1.0,1,benign,Dr. Lee
462,1276091,6.0,1.0,1,3,2,1,1.0,1.0,1,benign,Dr. Lee


In [None]:
# How to view the data by aggregating on more than one column:
# each dict key is a column, each value the aggregation(s) applied to it per class.

df.groupby('class').agg(
    {
        'cell_size_uniformity': ['min', 'max'],
        'normal_nucleoli': 'mean',
        'class': 'count',
    }
)

### One-Hot Encoding Categorical Data

In [None]:
# Take an explicit copy of the two columns so that adding a column to
# categorical_df later modifies an independent frame rather than a slice of df
# (avoids pandas' SettingWithCopyWarning).
categorical_df = df[['patient_id', 'doctor_name']].copy()

In [None]:
# Add a constant column 'doctor_count' holding 1 in every row.
categorical_df['doctor_count'] = 1

In [None]:
# Inspect categorical_df
categorical_df

In [None]:
# Pivot to one row per index label and one column per doctor name, filled from
# 'doctor_count'. Passing `values` as a list yields two-level column labels
# (value name, doctor name).
doctors_one_hot_encoded = categorical_df.pivot_table(
    index=categorical_df.index,
    columns=['doctor_name'],
    values=['doctor_count'],
)

In [None]:
# Replace the missing entries of the pivoted frame with 0.
doctors_one_hot_encoded = doctors_one_hot_encoded.fillna(0)
doctors_one_hot_encoded

In [None]:
# Drop the outer level of the column labels so only the inner level remains.
# Guarded on nlevels so that re-running this cell is a no-op instead of raising
# once the columns have already been flattened to a single level.
if doctors_one_hot_encoded.columns.nlevels > 1:
    doctors_one_hot_encoded.columns = doctors_one_hot_encoded.columns.droplevel()
doctors_one_hot_encoded

In [None]:
# Left-join the per-doctor columns onto df, matching rows by index label.
combined_df = df.merge(
    doctors_one_hot_encoded,
    how='left',
    left_index=True,
    right_index=True,
)
combined_df

## Making new columns and conducting element-wise operations

In [None]:
# Randomly sample 10 rows. The fixed random_state makes the same rows come back
# on every run, so the displayed output is reproducible.
combined_df.sample(n=10, random_state=42)

In [None]:
# Display combined_df without the 'doctor_name' column. The result is not
# assigned, so combined_df itself still contains 'doctor_name'.
combined_df.drop(columns=['doctor_name'])

In [None]:
# Make a new column on df as the element-wise product of two existing numerical columns.
df['new_col_name'] = df['clump_thickness'].mul(df['cell_size_uniformity'])


In [None]:
# How to convert benign & malignant to 0 and 1.
# NOTE(review): this overwrites 'class' in place; running the cell a second time
# maps the already-numeric values through the dictionary and yields NaN.
class_to_numerical_dictionary = {
    'benign': 0,
    'malignant': 1,
}

combined_df['class'] = combined_df['class'].map(class_to_numerical_dictionary)

combined_df


In [None]:
# Feature building:

def celltypelabel(x):
    """Label one row from its two uniformity scores.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide 'cell_size_uniformity' and 'cell_shape_uniformity'.

    Returns
    -------
    str
        'normal' when both scores are greater than 5, otherwise 'abnormal'.
    """
    # NOTE(review): both scores being high is labelled 'normal' — confirm the
    # two labels are not swapped.
    size_above_5 = x['cell_size_uniformity'] > 5
    shape_above_5 = x['cell_shape_uniformity'] > 5
    # Both comparisons are evaluated before being combined with `&`.
    return 'normal' if (size_above_5 & shape_above_5) else 'abnormal'


# Apply the labelling function to every row (axis=1) and store the result as a new column.
combined_df['cell_type_label'] = combined_df.apply(celltypelabel, axis=1)

        

In [None]:
# Show each patient_id next to its cell_type_label
combined_df[['patient_id', 'cell_type_label']]

In [None]:
# Rows where cell_size_uniformity is NOT greater than 5 while cell_shape_uniformity is.
# NOTE(review): `~` negates only the first condition. If the goal is the opposite
# of "both greater than 5", the whole conjunction must be wrapped: ~((a) & (b)).
combined_df.loc[
    (~combined_df['cell_size_uniformity'].gt(5))
    & combined_df['cell_shape_uniformity'].gt(5)
]