# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [9]:
# load the cancer dataset into a DataFrame with pandas
import pandas as pd
df = pd.read_csv('cancer_data_means.csv')
# flag each column that holds at least one missing value
columns_with_missing = df.isnull().any()
print(columns_with_missing)

id                        False
diagnosis                 False
radius_mean               False
texture_mean               True
perimeter_mean            False
area_mean                 False
smoothness_mean            True
compactness_mean          False
concavity_mean            False
concave_points_mean       False
symmetry_mean              True
fractal_dimension_mean    False
dtype: bool


In [11]:
# use each column's mean to fill in its missing values
# numeric_only=True keeps the non-numeric `diagnosis` column out of the mean
# computation; without it, recent pandas versions raise on string columns and
# older versions emit a deprecation warning.
# fillna(..., inplace=True) modifies df and returns None, so its result is
# not printed (printing it only ever showed "None").
df.fillna(df.mean(numeric_only=True), inplace=True)
print('-------------------------------------')
# confirm your correction: every column should now report False
print(df.isnull().any())

None
-------------------------------------
id                        False
diagnosis                 False
radius_mean               False
texture_mean              False
perimeter_mean            False
area_mean                 False
smoothness_mean           False
compactness_mean          False
concavity_mean            False
concave_points_mean       False
symmetry_mean             False
fractal_dimension_mean    False
dtype: bool


  print(df.fillna(df.mean(), inplace=True))


In [12]:
# count the rows that are exact repeats of an earlier row
duplicate_count = df.duplicated().sum()
print(duplicate_count)

5


In [13]:
# remove repeated rows in place, keeping the first occurrence of each
# (keep="first" is the pandas default, spelled out here for clarity)
df.drop_duplicates(keep="first", inplace=True)

In [14]:
# re-count duplicated rows to confirm the cleanup; the expected result is 0
remaining_duplicates = df.duplicated().sum()
print(remaining_duplicates)

0


## Renaming Columns
Since we previously reduced our dataset to include only the means of the tumor features, the "_mean" suffix at the end of each feature name is unnecessary; it just takes extra time to type in our analysis later. Rename the columns of the dataframe to remove "_mean".

In [15]:
# build an old-name -> new-name mapping that strips "_mean" from each column
# label (labels that do not contain "_mean" map to themselves), then apply it
stripped_names = {label: label.replace("_mean", "") for label in df.columns}
df.rename(columns=stripped_names, inplace=True)

In [16]:
# show the first five rows to confirm the new column names
first_rows = df.head(5)
print(first_rows)

         id diagnosis  radius    texture  perimeter    area  smoothness  \
0    842302         M   17.99  19.293431     122.80  1001.0    0.118400   
1    842517         M   20.57  17.770000     132.90  1326.0    0.084740   
2  84300903         M   19.69  21.250000     130.00  1203.0    0.109600   
3  84348301         M   11.42  20.380000      77.58   386.1    0.096087   
4  84358402         M   20.29  14.340000     135.10  1297.0    0.100300   

   compactness  concavity  concave_points  symmetry  fractal_dimension  
0      0.27760     0.3001         0.14710    0.2419            0.07871  
1      0.07864     0.0869         0.07017    0.1812            0.05667  
2      0.15990     0.1974         0.12790    0.2069            0.05999  
3      0.28390     0.2414         0.10520    0.2597            0.09744  
4      0.13280     0.1980         0.10430    0.1809            0.05883  


In [17]:
# save the cleaned data for later as "cancer_data_edited.csv"
# index=False leaves the DataFrame's row index out of the file; otherwise it
# would be written as an extra unnamed first column and come back as a
# spurious column when the CSV is read again
output_file = "cancer_data_edited.csv"
df.to_csv(output_file, index=False)