# Step - 1 :- Data Cleaning

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [4]:
# Creating a manual dataset
data = pd.DataFrame({
    'name': ['John', 'Jane', 'Jack', 'John', None],
    'age': [28, 34, None, 28, 22],
    'purchase_amount': [100.5, None, 85.3, 100.5, 50.0],
    'date_of_purchase': ['2023/12/01', '2023/12/02', '2023/12/01', '2023/12/01', '2023/12/03']
	})

# Handling missing values using mean imputation for 'age' and 'purchase_amount'
imputer = SimpleImputer(strategy='mean')
data[['age', 'purchase_amount']] = imputer.fit_transform(data[['age', 'purchase_amount']])

# Removing duplicate rows
data = data.drop_duplicates()

# Correcting inconsistent date formats
data['date_of_purchase'] = pd.to_datetime(data['date_of_purchase'], errors='coerce')
print("/* ----------------------------------- Data Cleaning -----------------------------------*/")
print(data)

/* ----------------------------------- Data Cleaning -----------------------------------*/
   name   age  purchase_amount date_of_purchase
0  John  28.0          100.500       2023-12-01
1  Jane  34.0           84.075       2023-12-02
2  Jack  28.0           85.300       2023-12-01
4  None  22.0           50.000       2023-12-03


# Step 2: Data integration

In [6]:
data1  = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, None, 40],
    'purchase_amount': [200.0, None, 150.0, 300.0, 250.0],
    'date_of_purchase': ['2023/12/01', '2023/12/02', '2023/12/03', '2023/12/04', '2023/12/05']
})

data2 = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, None, 40],
    'purchase_amount': [200.0, None, 150.0, 300.0, 250.0],
    'date_of_purchase': ['2023/12/01', '2023/12/02', '2023/12/03', '2023/12/04', '2023/12/05']
})

# Merging two datasets
merged_data  = pd.merge(data1,data2,on ='customer_id', how='inner')
print("/* ----------------------------------- Merged Data -----------------------------------*/")
print(merged_data)

/* ----------------------------------- Merged Data -----------------------------------*/
   customer_id   name_x  age_x  purchase_amount_x date_of_purchase_x   name_y  \
0            1    Alice   25.0              200.0         2023/12/01    Alice   
1            2      Bob   30.0                NaN         2023/12/02      Bob   
2            3  Charlie   35.0              150.0         2023/12/03  Charlie   
3            4    David    NaN              300.0         2023/12/04    David   
4            5      Eve   40.0              250.0         2023/12/05      Eve   

   age_y  purchase_amount_y date_of_purchase_y  
0   25.0              200.0         2023/12/01  
1   30.0                NaN         2023/12/02  
2   35.0              150.0         2023/12/03  
3    NaN              300.0         2023/12/04  
4   40.0              250.0         2023/12/05  


# Step 3: Data transformation


In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Creating a manual dataset
data = pd.DataFrame({
    'category': ['A', 'B', 'A', 'C', 'B'],
    'numeric_column': [10, 15, 10, 20, 15]
	})

# Scaling numeric data
scaler = StandardScaler()
data['scaled_numeric_column'] = scaler.fit_transform(data[['numeric_column']])

# Encoding categorical variables using one-hot encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_data = pd.DataFrame(encoder.fit_transform(data[['category']]),
                            columns=encoder.get_feature_names_out(['category']))

# Concatenating the encoded data with the original dataset
data = pd.concat([data, encoded_data], axis=1)
print("/* ----------------------------------- Encoded Data -----------------------------------*/")
print(data)

  category  numeric_column  scaled_numeric_column  category_A  category_B  \
0        A              10              -1.069045         1.0         0.0   
1        B              15               0.267261         0.0         1.0   
2        A              10              -1.069045         1.0         0.0   
3        C              20               1.603567         0.0         0.0   
4        B              15               0.267261         0.0         1.0   

   category_C  
0         0.0  
1         0.0  
2         0.0  
3         1.0  
4         0.0  
