In [2]:
import pandas as pd


transactions = pd.DataFrame({
    'TransactionID': [1, 2, 3, 4, 5],
    'CustomerID': [101, 102, 103, 104, 105],
    'ProductID': [1001, 1002, 1001, 1003, 1002],
    'Quantity': [1, 2, 1, 4, 2],
    'Price': [20.0, 15.0, 20.0, 30.0, 15.0],
    'Order Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']),
    'CustomerLocation': ['NY', 'LA', 'NY', 'SF', 'LA']
})

products = pd.DataFrame({
    'ProductID': [1001, 1002, 1003],
    'ProductName': ['ProductA', 'ProductB', 'ProductC'],
    'Category': ['Electronics', 'Groceries', 'Clothing']
})

merged_data = pd.merge(transactions, products, on='ProductID')
print(merged_data.head())

duplicates = merged_data[merged_data.duplicated()]
print("Duplicate Records:\n", duplicates)


merged_data_cleaned = merged_data.drop_duplicates()
print("Data after removing duplicates:\n", merged_data_cleaned.head())

Q1 = merged_data_cleaned['Price'].quantile(0.25)
Q3 = merged_data_cleaned['Price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

merged_data_no_outliers = merged_data_cleaned[(merged_data_cleaned['Price'] >= lower_bound) &
                                              (merged_data_cleaned['Price'] <= upper_bound)]
print("Data after removing outliers:\n", merged_data_no_outliers.head())

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

merged_data_no_outliers[['Quantity', 'Price']] = scaler.fit_transform(merged_data_no_outliers[['Quantity', 'Price']])
print("Data after Min-Max normalization:\n", merged_data_no_outliers.head())

merged_data_encoded = pd.get_dummies(merged_data_no_outliers, columns=['Category'])
print("Data after one-hot encoding:\n", merged_data_encoded.head())


   TransactionID  CustomerID  ProductID  Quantity  Price Order Date  \
0              1         101       1001         1   20.0 2023-01-01   
1              2         102       1002         2   15.0 2023-01-02   
2              3         103       1001         1   20.0 2023-01-03   
3              4         104       1003         4   30.0 2023-01-04   
4              5         105       1002         2   15.0 2023-01-05   

  CustomerLocation ProductName     Category  
0               NY    ProductA  Electronics  
1               LA    ProductB    Groceries  
2               NY    ProductA  Electronics  
3               SF    ProductC     Clothing  
4               LA    ProductB    Groceries  
Duplicate Records:
 Empty DataFrame
Columns: [TransactionID, CustomerID, ProductID, Quantity, Price, Order Date, CustomerLocation, ProductName, Category]
Index: []
Data after removing duplicates:
    TransactionID  CustomerID  ProductID  Quantity  Price Order Date  \
0              1         101 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_no_outliers[['Quantity', 'Price']] = scaler.fit_transform(merged_data_no_outliers[['Quantity', 'Price']])
