## Data Wrangling in Pandas
 ![text_here](images/img_one.jpg)  




## Data Exploration

In [None]:
import pandas as pd

# Notebook display limits: show every column, cap printed rows at 40
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)

# Load the breast cancer dataset into a DataFrame
filename = 'data/breast_cancer_data.csv'
df = pd.read_csv(filename)

In [None]:
# Start off by actually looking at your data set
df

In [None]:
# What is the size of our dataset?
df.shape

In [None]:
# Over here we see the columns names and their data types
df.dtypes

In [None]:
# It's good to inspect your unique key identifier
df.nunique()

In [None]:
# Here we list all columns
df.columns

In [None]:
# This provides some statistics on the numerical data
df.describe()

In [None]:
# This aggregates the data by the given column names, then applies the aggregation function (size = row count per group)
df.groupby(by =['class', 'doctor_name']).size()

## Data Preprocessing

In [None]:
#Dealing with missing values? How many np.nan per column?

df.isna().sum() 

In [None]:
# # fill with zero
# df = df.fillna(0) 

In [None]:
# Drop columns (axis=1) whose values are all NaN (how='all'); rows that contain NaN values are kept.
# NOTE(review): if the intent is to drop rows containing any NaN, that would be
# dropna(axis=0, how='any') -- confirm which is wanted.
df = df.dropna(axis = 1, how = 'all')  # drops columns that are entirely NaN

# Rename columns
# NOTE: rename() returns a new DataFrame and the result is not assigned, so df itself is unchanged here.
# The mapping sends 'patient_id' to itself, so this only illustrates the syntax.
df.rename(index =str, columns = {'patient_id':'patient_id'})

In [None]:
# It's good to inspect unique key identifiers
df.nunique()

In [None]:
# This shows rows that appear more than once with exactly the same column values (keep='last' leaves the final occurrence of each unmarked).
df[df.duplicated(keep = 'last')]

# # This shows all instances where patient_id shows up more than once, but may have varying column values
# df[df.duplicated(subset = 'patient_id', keep =False)].sort_values('patient_id')

In [None]:
# Exact duplicate rows exist in the data, so keep only the first occurrence of each
# (drop_duplicates defaults: compare all columns, keep='first')
df = df.drop_duplicates()

In [None]:
# Count rows per patient_id, most frequent patients first
repeat_patients = (
    df.groupby('patient_id')
    .size()
    .sort_values(ascending=False)
)
repeat_patients

In [None]:
# How to reverse conditionality?
# For plain Python booleans use `not`. The bitwise `~` binds tighter than `==`,
# so `~1==1` is really `(~1) == 1`, i.e. `-2 == 1`: it prints False only by
# coincidence, and `~True` itself is -2 (truthy), not False.
# `~` is the right operator for element-wise negation of pandas boolean masks.
is_equal = 1 == 1
is_not_equal = not is_equal
print(is_equal)
print(is_not_equal)

In [None]:
# Patients with more than two rows, as a frame with patient_id as a regular column
filtered_patients = repeat_patients.loc[repeat_patients > 2].to_frame().reset_index()

# Keep only the rows of df whose patient_id is NOT among those patients
is_repeat_patient = df['patient_id'].isin(filtered_patients['patient_id'])
filtered_df = df.loc[~is_repeat_patient]
filtered_df

In [None]:
# Details of all the repeating patients (those listed in filtered_patients)

df[df.patient_id.isin(filtered_patients.patient_id)]

In [None]:
# How to view the data by aggregating on more than one column

df.groupby('class').agg({'cell_size_uniformity': ['min', 'max'], 'normal_nucleoli': 'mean', 'class': 'count'})

### One-Hot Encoding Categorical Data

In [None]:
# Take an explicit copy so that adding columns to categorical_df does not
# operate on a slice of df (avoids pandas' SettingWithCopyWarning)
categorical_df = df[['patient_id', 'doctor_name']].copy()

In [None]:
# This specifies all rows (':') and column name 'doctor_count'
categorical_df.loc[:,'doctor_count'] = 1

In [None]:
categorical_df

In [None]:
# One column per doctor_name, one row per original row index;
# cells hold doctor_count (aggregated with pivot_table's default, the mean)
doctors_one_hot_encoded = pd.pivot_table(
    categorical_df,
    index=categorical_df.index,
    columns=['doctor_name'],
    values=['doctor_count'],
)

In [None]:
doctors_one_hot_encoded = doctors_one_hot_encoded.fillna(0)
doctors_one_hot_encoded

In [None]:
doctors_one_hot_encoded.columns = doctors_one_hot_encoded.columns.droplevel()
doctors_one_hot_encoded

In [None]:
combined_df = pd.merge(df, doctors_one_hot_encoded, left_index = True,right_index =True, how ='left')
combined_df

## Making new columns and conducting element-wise operations

In [None]:
#Randomly sampling 10 rows
combined_df.sample(n=10)

In [None]:
# drop() returns a new DataFrame without 'doctor_name'; the result is not assigned, so combined_df is unchanged
combined_df.drop(columns=['doctor_name'])

In [None]:
# Making a new column based on a numerical calculation of other columns in the df
df['new_col_name'] = df['clump_thickness']*df['cell_size_uniformity']


In [None]:
# How to convert benign & malignant to 0 and 1

class_to_numerical_dictionary = {'benign':0, 'malignant':1}

combined_df['class'] = combined_df['class'].map(class_to_numerical_dictionary)

combined_df


In [None]:
# Feature building: 

def celltypelabel(x):
    """Label a row 'normal' or 'abnormal' from its two uniformity scores.

    Returns 'normal' only when both x['cell_size_uniformity'] and
    x['cell_shape_uniformity'] are greater than 5; otherwise 'abnormal'.
    """
    size_above_5 = x['cell_size_uniformity'] > 5
    shape_above_5 = x['cell_shape_uniformity'] > 5
    return 'normal' if (size_above_5 & shape_above_5) else 'abnormal'


# Apply the labelling rule to every row (axis=1 passes each row to celltypelabel)
combined_df['cell_type_label'] = combined_df.apply(celltypelabel, axis=1)

        

In [None]:
combined_df[['patient_id', 'cell_type_label']]

In [None]:
# `~` binds tighter than `&`, so only the first comparison is negated:
# this selects rows where cell_size_uniformity is NOT > 5 while cell_shape_uniformity IS > 5
size_not_above_5 = ~(combined_df.cell_size_uniformity > 5)
shape_above_5 = combined_df.cell_shape_uniformity > 5
combined_df[size_not_above_5 & shape_above_5]