In [47]:
import numpy as np 
import matplotlib as plt
import pandas as pd
import seaborn as sns

In [48]:
# Read the three noisy CSV datasets; paths are relative to the notebook's working directory.
penguins_df, diamonds_df, epicurious_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "noisy_datasets/penguins.csv",
        "noisy_datasets/diamond.csv",
        "noisy_datasets/epicurious.csv",
    )
)

# Preview the first rows of the penguins data.
penguins_df.head()

Unnamed: 0,species,island,calorie requirement,average sleep duration,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,gender,year
0,Adelie,Torgersen,6563,11,39.1,18.7,181.0,3750.0,male,2007.0
1,Adelie,Torgersen,4890,14,39.5,17.4,186.0,3800.0,female,2007.0
2,Adelie,Torgersen,7184,11,40.3,18.0,195.0,3250.0,female,2007.0
3,Adelie,Torgersen,4169,8,,,,992.0,,2007.0
4,Adelie,Torgersen,4774,8,36.7,19.3,193.0,3450.0,female,2007.0


In [49]:
# Report the frame's shape, discard every row containing at least one missing
# value, then report the shape again so the number of removed rows is visible.
print("Shape of diamonds dataset before dropping na values: ", diamonds_df.shape)
diamonds_df = diamonds_df.dropna(axis=0, how="any")
print("Shape of diamonds dataset after dropping na values: ", diamonds_df.shape)

Shape of diamonds dataset before dropping na values:  (53940, 13)
Shape of diamonds dataset after dropping na values:  (49075, 13)


In [50]:
# Columns that are expected to hold numbers. Any value in them that cannot be
# parsed as a number is coerced to NaN, and rows containing NaN are then dropped.
num_cols = [
    'Unnamed: 0',
    'carat',
    'average us salary',
    'number of diamonds mined (millions)',
    'depth',
    'table',
    'price',
    'x',
    'y',
    'z',
]
diamonds_df = diamonds_df.assign(
    **{name: pd.to_numeric(diamonds_df[name], errors='coerce') for name in num_cols}
).dropna()
print("Shape of diamonds dataset after dropping na values: ", diamonds_df.shape)

Shape of diamonds dataset after dropping na values:  (48822, 13)


In [51]:
# Per-column count of missing values (the recorded output shows zero for every column).
print(diamonds_df.isnull().sum())

# Names of the columns stored with the object dtype.
string_cols = [name for name in diamonds_df.columns if diamonds_df[name].dtype == 'object']

# Lower-case every value in the object-dtype columns.
diamonds_df[string_cols] = diamonds_df[string_cols].apply(lambda column: column.str.lower())
diamonds_df

Unnamed: 0                             0
carat                                  0
cut                                    0
color                                  0
clarity                                0
average us salary                      0
number of diamonds mined (millions)    0
depth                                  0
table                                  0
price                                  0
x                                      0
y                                      0
z                                      0
dtype: int64


Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,average us salary,number of diamonds mined (millions),depth,table,price,x,y,z
0,1.0,0.23,ideal,e,si2,31282,5.01,61.5,55.0,326.0,3.95,3.98,2.43
1,2.0,0.21,premium,e,si1,40049,1.69,59.8,61.0,326.0,3.89,3.84,2.31
2,3.0,0.23,good,e,vs1,33517,3.85,56.9,65.0,327.0,4.05,4.07,2.31
3,4.0,0.29,premium,i,vs2,38495,3.49,62.4,58.0,334.0,4.20,4.23,2.63
4,5.0,0.31,good,j,si2,34178,4.70,63.3,58.0,335.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,53936.0,0.72,ideal,d,si1,43892,3.54,60.8,57.0,2757.0,5.75,5.76,3.50
53936,53937.0,0.72,good,d,si1,35532,3.51,63.1,55.0,2757.0,5.69,5.75,3.61
53937,53938.0,0.70,very good,d,si1,32133,3.27,62.8,60.0,2757.0,5.66,5.68,3.56
53938,53939.0,0.86,premium,h,si2,44248,4.03,61.0,58.0,2757.0,6.15,6.12,3.74


In [52]:
# Detect outliers in each listed column with the 1.5 * IQR rule and replace them
# with that column's median. Columns are processed one at a time, so every
# column's bounds and median are computed from its own values only.
diamond_df_cols = ['carat', 'average us salary', 'number of diamonds mined (millions)', 'depth', 'table',
                   'price', 'x', 'y', 'z']

for col in diamond_df_cols:
    column = diamonds_df[col]
    # Object-dtype columns are skipped; the IQR rule only applies to numeric data.
    if column.dtype == 'object':
        continue

    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lb = q1 - 1.5 * iqr
    ub = q3 + 1.5 * iqr

    # Build the boolean mask once and reuse it for both the report and the
    # replacement. NaN compares False against both bounds, so it is never flagged.
    is_outlier = (column < lb) | (column > ub)
    outliers = diamonds_df[is_outlier]
    print(f"Outliers of Column {col}: ", outliers)

    if not outliers.empty:
        # The median is taken over the whole column (outliers included).
        # Series.mask swaps the flagged entries in one vectorized step instead
        # of re-testing the bounds row by row in a Python lambda.
        diamonds_df[col] = column.mask(is_outlier, column.median())
print('Diamonds datset after replacing outliers with median values: ', diamonds_df.head(10))

Outliers of Column carat:         Unnamed: 0  carat      cut color clarity  average us salary  \
12246     12247.0   2.06  premium     j      i1              40497   
13002     13003.0   2.14     fair     j      i1              31965   
13118     13119.0   2.15     fair     j      i1              42063   
13757     13758.0   2.22     fair     j      i1              46284   
13991     13992.0   2.01     fair     i      i1              46942   
...           ...    ...      ...   ...     ...                ...   
27741     27742.0   2.15    ideal     g     si2              31876   
27742     27743.0   2.04  premium     h     si1              32337   
27744     27745.0   2.29  premium     i     si1              44106   
27746     27747.0   2.07    ideal     g     si2              38530   
27749     27750.0   2.29  premium     i     vs2              39679   

       number of diamonds mined (millions)  depth  table    price     x     y  \
12246                                 0.90   61.2  