In [5]:
pip install scikit-learn


Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/4e/ba/ce9bd1cd4953336a0e213b29cb80bb11816f2a93de8c99f88ef0b446ad0c/scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/81/d7/d2537d51efb692d0c411e64267ba349e7668d40f5bc73cefe78ccd650dcd/scipy-1.11.3-cp311-cp311-win_amd64.whl.metadata
  Downloading scipy-1.11.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.4 kB ? eta -:--:--
     ------------------- ------------------ 30.7/60.4 kB 435.7 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/60.4 kB 326.8 kB/s eta 0:00:01
     -------------------------------------- 60.4/60

In [3]:
pip install pandas


Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/db/3e/db3e98911b5da217d1e3f85b6e091448cb8f8be674bdaff3c0ec0dd855e0/pandas-2.1.2-cp311-cp311-win_amd64.whl.metadata
  Downloading pandas-2.1.2-cp311-cp311-win_amd64.whl.metadata (18 kB)
Collecting numpy<2,>=1.23.2 (from pandas)
  Obtaining dependency information for numpy<2,>=1.23.2 from https://files.pythonhosted.org/packages/82/0f/3f712cd84371636c5375d2dd70e7514d264cec6bdfc3d7997a4236e9f948/numpy-1.26.1-cp311-cp311-win_amd64.whl.metadata
  Downloading numpy-1.26.1-cp311-cp311-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.2 kB ? eta -:--:--
     ------ --------------------------------- 10.2/61.2 kB ? eta -:--:--
     ------------------- ------------------ 30.7/61.2 kB 330.3 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.2 kB 327.7 kB/s eta 0:00:01
     -------------------------------------- 61.2/61.2 kB 364.0 kB/s eta 0

In [7]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder 
import numpy as np 

In [8]:


def drop_duplicates(df, subset_name):
    """Return ``df`` without rows that repeat a value of one column.

    Args:
        df: DataFrame to de-duplicate. It is not modified.
        subset_name: Label of the single column compared to detect duplicates.

    Returns:
        A new DataFrame in which only the first row for each distinct
        ``subset_name`` value is kept; surviving rows keep their index labels.
    """
    # Deliberately not inplace=True: mutating the argument silently changes
    # the caller's frame and makes re-running a notebook cell non-idempotent.
    return df.drop_duplicates(subset=[subset_name])

def encode(df, column_to_encode):
    """Return a copy of ``df`` with one column replaced by its label codes.

    Args:
        df: DataFrame holding the column to encode. It is not modified.
        column_to_encode: Label of the column to pass through ``LabelEncoder``.

    Returns:
        A new DataFrame identical to ``df`` except that ``column_to_encode``
        holds the output of ``LabelEncoder().fit_transform`` for that column.
    """
    # Work on a copy so the caller's frame keeps its original values and the
    # function can be re-run on the same input with the same result.
    encoded = df.copy()
    encoded[column_to_encode] = LabelEncoder().fit_transform(encoded[column_to_encode])
    return encoded

def outlier_handling(df, column_with_outliers, iqr_multiplier=1.5):
    """Drop rows whose value in one column lies outside the IQR fences.

    The fences are ``q1 - iqr_multiplier * iqr`` and ``q3 + iqr_multiplier * iqr``
    where ``q1``/``q3`` are the 25th/75th percentiles of the column and
    ``iqr = q3 - q1``.

    Args:
        df: DataFrame to filter. It is not modified.
        column_with_outliers: Label of the numeric column to test.
        iqr_multiplier: How many IQRs beyond the quartiles a value may lie
            before it is treated as an outlier. Defaults to 1.5.

    Returns:
        The rows of ``df`` whose value is within the fences (bounds included).
        Rows where the column is missing are dropped, because a missing value
        never compares as inside the range.
    """
    values = df[column_with_outliers]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    reach = iqr_multiplier * (q3 - q1)
    # Inclusive bounds: with strict comparisons a column whose IQR is 0
    # (e.g. all values equal) has q1 == q3 and every row would be discarded,
    # and values sitting exactly on a fence would be misclassified as outliers.
    within_fences = values.between(q1 - reach, q3 + reach)
    return df[within_fences]

def date_formatting(df, column_with_date, date_format='%m/%d/%Y'):
    """Return a copy of ``df`` with one column parsed into datetimes.

    Args:
        df: DataFrame holding the date strings. It is not modified.
        column_with_date: Label of the column to parse.
        date_format: ``strftime``-style pattern passed to ``pd.to_datetime``.
            Defaults to ``'%m/%d/%Y'`` (month/day/four-digit year).

    Returns:
        A new DataFrame identical to ``df`` except that ``column_with_date``
        holds the result of ``pd.to_datetime`` for that column.
    """
    # Copy first: the pipeline passes in a filtered frame, and assigning a
    # column on it directly raises SettingWithCopyWarning and can leak the
    # change back into the caller's data.
    formatted = df.copy()
    formatted[column_with_date] = pd.to_datetime(formatted[column_with_date],
                                                 format=date_format)
    return formatted

def remove_missing_values(df):
    """Drop every row that contains a missing value and report the count.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame without the rows that had at least one missing cell.

    Side effects:
        Prints the number of missing cells found in ``df`` (a cell count,
        which can differ from the number of rows dropped).
    """
    # Per-column null counts, summed into one total over the whole frame.
    null_cell_total = df.isna().sum().sum()
    complete_rows = df.dropna()
    print("Removed {} missing values".format(null_cell_total))
    return complete_rows


def data_cleaning_pipeline(df_path,
                           duplication_subset,
                           column_to_encode,
                           column_with_outliers,
                           column_with_date):
    """Load a CSV file and run it through every cleaning step in order.

    Steps: drop duplicates, label-encode, remove outliers, parse dates,
    drop rows with missing values.

    Args:
        df_path: Path of the CSV file to read.
        duplication_subset: Column passed to ``drop_duplicates``.
        column_to_encode: Column passed to ``encode``.
        column_with_outliers: Column passed to ``outlier_handling``.
        column_with_date: Column passed to ``date_formatting``.

    Returns:
        The DataFrame returned by the final step, ``remove_missing_values``.
    """
    # Each .pipe(step, arg) call is step(frame, arg); the order is significant.
    return (
        pd.read_csv(df_path)
        .pipe(drop_duplicates, duplication_subset)
        .pipe(encode, column_to_encode)
        .pipe(outlier_handling, column_with_outliers)
        .pipe(date_formatting, column_with_date)
        .pipe(remove_missing_values)
    )

# Build a small sample DataFrame that exercises every cleaning step:
# a repeated Name ('John'), a missing Age, a categorical Gender column,
# a numeric Income column and Birthdate strings in month/day/year form.
data = {'Name': ['John', 'Jane', 'Bob', 'John', 'Alice'],
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        'Age': [30, 25, 40, 30, np.nan],
        'Gender': ['Male', 'Female', 'Male', 'Male', 'Female'],
        'Income': [50000, 60000, 70000, 45000, 80000],
        'Birthdate': ['01/01/1990', '02/14/1996', '03/15/1981',
                      '01/01/1990', '06/30/1986'],
        'Married': [True, False, True, False, True],
        'Children': [2, 0, 1, 0, 3]}
df = pd.DataFrame(data)
print('Before Preprocessing:\n',df)
# Save DataFrame as CSV file; the pipeline reads its input from disk
df.to_csv('my_data.csv', index=False)

clean_df = data_cleaning_pipeline('my_data.csv',
                                  'Name',
                                  'Gender',
                                  'Income',
                                  'Birthdate')

print('\nAfter preprocessing')
# Last expression of the cell, so the notebook renders it as a table
clean_df.head()


Before Preprocessing:
     Name   Age  Gender  Income   Birthdate  Married  Children
0   John  30.0    Male   50000  01/01/1990     True         2
1   Jane  25.0  Female   60000  02/14/1996    False         0
2    Bob  40.0    Male   70000  03/15/1981     True         1
3   John  30.0    Male   45000  01/01/1990    False         0
4  Alice   NaN  Female   80000  06/30/1986     True         3
Removed 1 missing values

After preprocessing


Unnamed: 0,Name,Age,Gender,Income,Birthdate,Married,Children
0,John,30.0,1,50000,1990-01-01,True,2
1,Jane,25.0,0,60000,1996-02-14,False,0
2,Bob,40.0,1,70000,1981-03-15,True,1


In [9]:
def clean_data(dataframe):
    """Return ``dataframe`` without duplicate rows or rows with missing values.

    Args:
        dataframe: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame: fully identical rows are collapsed to their first
        occurrence, then any row with a missing cell is dropped.
    """
    deduplicated = dataframe.drop_duplicates()
    return deduplicated.dropna()
