In [239]:
import pandas as pd
import numpy as np
from datetime import date

In [542]:
class Mining:
    """Load the Lifetime CSV export and group its rows by subscription status.

    Attributes set by the constructor:
        data: the path that was passed in.
        raw_df: the CSV exactly as pandas read it.
        df: ``raw_df`` after the age and employee filters (see ``load_csv``).
        sub_df: copy of ``df`` with the added "Subscribed" and
            "Subscribed Cat" columns.
        sub_status: dict mapping "Both" / "Phone" / "Email" / "Neither" to the
            matching rows of ``sub_df``.
        contact_list: the first eight columns of ``df``.
    """

    # Rows delivered by these employees are dropped by the row filter.
    _EXCLUDED_EMPLOYEES = (
        "Bryan Hill",
        "Shane Haberkorn",
        "Marissa Matthies",
        "Katherine Delamore",
        "Steven Pauka",
    )
    # Labels written to the "Subscribed" column; also the keys of sub_status.
    _SUB_TYPE_NAMES = ("Both", "Phone", "Email", "Neither")

    def __init__(self, data: str):
        """Read ``data`` (a CSV path) once and build every derived frame."""
        self.data: str = data
        self.raw_df = pd.read_csv(data)
        # Filter the frame that was just read instead of parsing the file again.
        self.df: pd.DataFrame = self._filter_rows(self.raw_df)
        self._sub_types = np.array(self._SUB_TYPE_NAMES)
        # Placeholder only: _sub_status() assigns the labelled frame below.
        # (The previous placeholder was IPython's `_`, which is undefined
        # outside an interactive session.)
        self.sub_df = None
        self.sub_status: dict = self._sub_status()
        self.contact_list = self._contact_list()

    def __str__(self):
        return f"Mining data class for the Lifetime dataset, '{self.data}'"

    def __repr__(self):
        # !r quotes the path so the repr reads like the constructor call.
        return f"Mining(data={self.data!r})"

    @staticmethod
    def _get_rows(dataframe: pd.DataFrame, sub_type: str) -> pd.DataFrame:
        """Return the rows of ``dataframe`` whose "Subscribed" equals ``sub_type``."""
        return dataframe.loc[(dataframe["Subscribed"] == sub_type)]

    @staticmethod
    def _filter_rows(frame: pd.DataFrame) -> pd.DataFrame:
        """Keep rows with 0 < MemberAge < 13 not delivered by an excluded employee.

        Rows with a missing DeliveredEmployee are kept, because a missing
        value is not one of the excluded names.
        """
        age_ok = (frame["MemberAge"] > 0) & (frame["MemberAge"] < 13)
        employee_ok = ~frame["DeliveredEmployee"].isin(list(Mining._EXCLUDED_EMPLOYEES))
        return frame[age_ok & employee_ok]

    def load_csv(self, filename: str) -> pd.DataFrame:
        """Read ``filename`` as CSV and return it with the row filter applied."""
        return self._filter_rows(pd.read_csv(filename))

    def unsubscribed(self, sub_type: str) -> pd.DataFrame:
        """Return a copy of the rows for ``sub_type`` (matched after ``capitalize()``).

        Raises:
            KeyError: if the capitalised name is not a subscription type.
        """
        key = sub_type.capitalize()
        if key not in self.sub_status:
            raise KeyError(
                f"there is no subscription type {key!r}; "
                f"use one of {list(self.sub_status)}"
            )
        return self.sub_status[key].copy()

    def family(self, name: str) -> pd.DataFrame:
        """Return the filtered rows whose ParentName equals ``name``."""
        return self.df.loc[self.df["ParentName"] == name]

    def _sub_status(self) -> dict:
        """Label every filtered row with its subscription status and group by it.

        Stores the labelled frame on ``self.sub_df`` and returns a dict with
        one frame per subscription type.
        """
        df = self.df.copy()

        # A channel counts as opted out only when the cell is exactly
        # "Unsubscribed"; anything else, including a missing value, counts as
        # subscribed.
        phone_off = df["MembershipPhone"] == "Unsubscribed"
        email_off = df["MembershipEmail"] == "Unsubscribed"

        df["Subscribed"] = "Both"
        df.loc[phone_off & email_off, "Subscribed"] = "Neither"
        df.loc[~phone_off & email_off, "Subscribed"] = "Phone"
        df.loc[phone_off & ~email_off, "Subscribed"] = "Email"

        # factorize numbers the labels in order of first appearance, so the
        # integer codes depend on row order.
        df["Subscribed Cat"] = pd.factorize(df["Subscribed"])[0]

        df_dict: dict = {
            name: self._get_rows(df, name) for name in self._SUB_TYPE_NAMES
        }
        self.sub_df = df
        return df_dict

    def _contact_list(self):
        """Return the first eight columns of the filtered frame."""
        return self.df.iloc[:, :8]

    def sub_info(self, sub_type: str):
        """Return ``describe()`` for one subscription group.

        For an unknown ``sub_type`` an error message is printed and ``None``
        is returned.
        """
        # TODO: Add more information.
        if sub_type in self._sub_types:
            return self.sub_status[sub_type].describe()
        print(f"Error: there is no subscription type, '{sub_type}'. You must use one of the following:\n{self._sub_types}. ")
        return None
            
# Build the Mining instance used by the cells below. The constructor reads
# "data.csv" (a relative path) and derives the filtered frame, the
# subscription groups and the contact list.
x = Mining("data.csv")

In [467]:
# Display the contact list: the first eight columns of the filtered frame.
# NOTE(review): this renders every row, and those columns include member and
# parent names, phone and e-mail; consider x.contact_list.head() and clearing
# the output before sharing the notebook.
x.contact_list

# Future Work

## Class methods

In [None]:
# TODO: Alter dataframe column datatypes.
    # def _correct_dataframe(self):
    #     types = np.array([
    #         "str", "str", "str", "int8", "date", 
    #         "str", "str", "str", "int64", "date",
    #         "cat", "date", "cat", "str", "int64",
    #         "cat", "date", "cat", "cat", "float32",
    #         "date", "date", "date", "date", "date",
    #         "date", "date", "int8", "boolean", "boolean",
    #         "cat", "cat", "cat", "cat"
    #     ]).T
    #     
    #     col_names = np.array(self.df.columns)
    #     
    #     dtype_df = pd.DataFrame([col_names, types]).T
    #     for row in dtype_df.values:
    #         col_name, col_type = row
    #         
    #     return
    # 
    # def basic_series(self, column_name: str, series_type: str):
    #     series = self.df[column_name]
    #     if series_type == "cat":
    #         cat_arr = get_categories(series)
    #         series = pd.Categorical(series, cat_arr[:,0], ordered=False).codes
    #        
    #     return series.astype(series_type)
    # 
    # def get_categories(self, series: pd.Series) -> np.ndarray:
    #     categories = list(series.unique())
    #     is_nan = []
    #     for cat in categories:
    #         try:
    #             answer = np.isnan(cat)
    #         except TypeError:
    #             answer = False
    #         is_nan.append(answer)
    #     array = np.transpose(np.hstack((categories, is_nan)).reshape(2,len(categories)))
    #     return array

## Functions

In [None]:
def convert_to_date(string: str) -> date:
    """Turn a "YYYY-MM-DD" string into a ``datetime.date``.

    The string is split on "-" and must yield exactly three integer parts;
    anything else raises ``ValueError`` from the unpacking, ``int`` or
    ``date``.
    """
    year, month, day = map(int, string.split("-"))
    return date(year, month, day)


def date_series(series: pd.Series, col_name: str) -> pd.Series:
    """Convert a column of "YYYY-MM-DD" strings into ``datetime.date`` objects.

    Args:
        series: values to convert; missing entries (None / NaN) are allowed.
        col_name: name given to the returned Series.

    Returns:
        An object-dtype Series named ``col_name``. When ``series`` is a pandas
        Series its index is preserved, so the result aligns with the frame it
        came from even after rows were filtered out. Missing inputs become
        ``pd.NaT``.
    """
    converted = [
        pd.NaT if pd.isna(value) else convert_to_date(value)
        for value in series
    ]
    # Other iterables (e.g. a plain list) have no pandas index to carry over.
    index = series.index if isinstance(series, pd.Series) else None
    return pd.Series(converted, index=index, name=col_name, dtype=object)

def basic_series(series: pd.Series, series_type: str):
    """Cast ``series`` to ``series_type`` and return the result.

    Args:
        series: the column to convert.
        series_type: a dtype string accepted by ``Series.astype``, or the
            shorthand "cat" for a categorical column.

    Returns:
        A Series of the same length and index as ``series`` with the
        requested dtype.
    """
    # "cat" is this notebook's shorthand; pandas spells the dtype "category".
    # Casting the whole column keeps one value per row. Missing values stay
    # missing and are not turned into a category of their own.
    target = "category" if series_type == "cat" else series_type
    return series.astype(target)

def get_categories(series: pd.Series) -> np.ndarray:
    """List the unique values of ``series`` next to a "is NaN" flag for each.

    Returns a two-column array with one row per unique value: column 0 is the
    value, column 1 the flag. Both columns pass through a single
    ``np.hstack``, so they share one dtype.
    NOTE(review): with string categories the flags appear to come back as
    text ("True"/"False") - confirm before comparing them as booleans.
    """
    def _nan_flag(value):
        # np.isnan rejects non-numeric input such as str with TypeError;
        # those values are treated as "not NaN".
        try:
            return np.isnan(value)
        except TypeError:
            return False

    uniques = list(series.unique())
    flags = [_nan_flag(value) for value in uniques]
    # Lay values and flags end to end, fold into two rows, then flip so each
    # row pairs one value with its flag.
    stacked = np.hstack((uniques, flags))
    return stacked.reshape(2, len(uniques)).T

## Preparation for type casting

In [461]:
# Target type label for each column of the CSV, in column order.
# NOTE(review): "cat" and "date" are this notebook's own labels, not dtype
# strings that pandas accepts; they need translating before any astype call.
types = np.array([
    "str", "str", "str", "int8", "date",
    "str", "str", "str", "int64", "date",
    "cat", "date", "cat", "str", "int64",
    "cat", "date", "cat", "cat", "float32",
    "date", "date", "date", "date", "date",
    "date", "date", "int8", "boolean", "boolean",
    "cat", "cat", "cat", "cat"
])
# Take the names from the loaded frame. `dtype_df`, which this cell used to
# read, is never defined in the notebook, so the cell failed on a fresh kernel.
col_names = np.array(x.df.columns)
# Every column needs exactly one label; fail early if the file layout changed.
assert len(types) == len(col_names), (
    f"{len(types)} type labels for {len(col_names)} columns"
)

In [545]:
# Show the column names that the type labels above are matched against.
col_names

array(['MembershipID', 'MemberID', 'MemberName', 'MemberAge',
       'MemberJoinDate', 'ParentName', 'MembershipPhone',
       'MembershipEmail', 'PackageID', 'PackageLoadDate',
       'PackageLoadClub', 'DeliveredDate', 'DeliveredClub',
       'DeliveredEmployee', 'R_MembershipID', 'Description', 'SaleDate',
       'SaleClub', 'Product', 'Price', 'AddDate', 'ActivationDate',
       'HoldBeginDate', 'HoldEndDate', 'CancelRequestDate',
       'TerminationDate', 'LastUpdatedDate', 'Opportunity', 'Conversion',
       'Close', 'Club', 'Area', 'RM', 'State Abbreviation'], dtype=object)