<a href="https://colab.research.google.com/github/NicKostii/UniProjects/blob/main/DataCleaner_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Instructions:

Fill in the methods of the DataCleaner class to produce the same printed results
as in the comments below. Good luck, and have fun!
"""
from __future__ import annotations
import os
import numpy as np
import pandas as pd
import pickle

class DataCleaner:
    def __init__(self, df: pd.DataFrame):
        self.history = [("Initial df", df.copy())]
        self.current = df.copy()

    def __del__(self):
        if os.path.exists("transactions.pkl"):
                os.remove("transactions.pkl")

    def adjust_dtype(self, dtypes: dict):
        for column, dtype in dtypes.items():
            if dtype == np.datetime64:
                self.current[column] = pd.to_datetime(self.current[column])
            else:
                self.current[column] = self.current[column].astype(dtype)
        self.history.append((f"Adjusted dtypes using {dtypes}", self.current.copy()))

    def impute_missing(self, columns: list[str]):
        for column in columns:
            mean_value = self.current[column].mean()
            self.current[column].fillna(mean_value, inplace=True)
        self.history.append((f"Imputed missing in {columns}", self.current.copy()))


    def save(self, filename: str):
        with open(f"{filename}.pkl", "wb") as f:
            pickle.dump(self, f)

    def load(filename: str) -> DataCleaner:
        with open(f"{filename}.pkl", "rb") as f:
            return pickle.load(f)

    def revert(self):
        if len(self.history) > 1:
            self.history.pop()
            self.current = self.history[-1][1].copy()


transactions = pd.DataFrame(
    {
        "customer_id": [10, 10, 13, 10, 11, 11, 10],
        "amount": [1.00, 1.31, 20.5, 0.5, 0.2, 0.2, np.nan],
        "timestamp": [
            "2020-10-08 11:32:01",
            "2020-10-08 13:45:00",
            "2020-10-07 05:10:30",
            "2020-10-08 12:30:00",
            "2020-10-07 01:29:33",
            "2020-10-08 13:45:00",
            "2020-10-09 02:05:21",
        ]
    }
)
transactions_dc = DataCleaner(transactions)

In [2]:
print(f"Current dataframe:\n{transactions_dc.current}")

Current dataframe:
   customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21


In [3]:
print(f"Current dtypes:\n{transactions_dc.current.dtypes}")

Current dtypes:
customer_id      int64
amount         float64
timestamp       object
dtype: object


In [4]:
transactions_dc.adjust_dtype({"timestamp": np.datetime64})

print(f"Changed dtypes to:\n{transactions_dc.current.dtypes}")

Changed dtypes to:
customer_id             int64
amount                float64
timestamp      datetime64[ns]
dtype: object


In [5]:
transactions_dc.impute_missing(columns=["amount"])

print(f"Imputed missing as overall mean:\n{transactions_dc.current}")

Imputed missing as overall mean:
   customer_id     amount           timestamp
0           10   1.000000 2020-10-08 11:32:01
1           10   1.310000 2020-10-08 13:45:00
2           13  20.500000 2020-10-07 05:10:30
3           10   0.500000 2020-10-08 12:30:00
4           11   0.200000 2020-10-07 01:29:33
5           11   0.200000 2020-10-08 13:45:00
6           10   3.951667 2020-10-09 02:05:21


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.current[column].fillna(mean_value, inplace=True)


In [7]:
print(f"History of changes:\n{transactions_dc.history}")

History of changes:
[('Initial df',    customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21), ("Adjusted dtypes using {'timestamp': <class 'numpy.datetime64'>}",    customer_id  amount           timestamp
0           10    1.00 2020-10-08 11:32:01
1           10    1.31 2020-10-08 13:45:00
2           13   20.50 2020-10-07 05:10:30
3           10    0.50 2020-10-08 12:30:00
4           11    0.20 2020-10-07 01:29:33
5           11    0.20 2020-10-08 13:45:00
6           10     NaN 2020-10-09 02:05:21)]


In [9]:
transactions_dc.save("transactions")
loaded_dc = DataCleaner.load("transactions")
print(f"Loaded DataCleaner current df:\n{loaded_dc.current}")

Loaded DataCleaner current df:
   customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21


In [11]:
transactions_dc.revert()
print(f"Reverting missing value imputation:\n{transactions_dc.current}")

Reverting missing value imputation:
   customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21


In [12]:
def solution(transactions):
    """Do not change. Could be:
    "timestamp": np.datetime64 or "timestamp": "datetime64[ns]"
    depends on realisation
    """
    rez = []

    transactions_dc = DataCleaner(transactions)
    rez.append(f"Current dataframe:\n{transactions_dc.current}")
    rez.append(f"Current dtypes:\n{transactions_dc.current.dtypes}")
    # transactions_dc.adjust_dtype({"timestamp": np.datetime64})
    transactions_dc.adjust_dtype({"timestamp": "datetime64[ns]"})
    rez.append(f"Changed dtypes to:\n{transactions_dc.current.dtypes}")

    transactions_dc.impute_missing(columns=["amount"])
    rez.append(f"Changed dtypes to:\n{transactions_dc.current.dtypes}")
    rez.append(f"Imputed missing as overall mean:\n{transactions_dc.current}")
    rez.append(f"History of changes:\n{transactions_dc.history}")

    transactions_dc.save("transactions")
    loaded_dc = DataCleaner.load("transactions")
    rez.append(f"Loaded DataCleaner current df:\n{loaded_dc.current}")

    transactions_dc.revert()
    rez.append(
        f"Reverting missing value imputation:\n{transactions_dc.current}"
    )
    return rez


if __name__ == "__main__":
    pass