In [115]:
import pandas as pd
import glob
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import manipulating_data as md
from scipy.stats import iqr
import numpy as np
from dateutil import parser

In [116]:
path = glob.glob('**/*dengue_features_train.csv', recursive=True) [0]

In [117]:
deng_train = pd.read_csv(path)

In [118]:
deng_train.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [119]:
class DataSet() :

    """ 
    A class used to represents tabular data.
    which can be used in down-stream predictive modelling.

    Attributes
    ----------
    path : str
        The path to the data file.
    data : pandas DataFrame.
        The data loaded from the given path.

    """
    def __init__(self, path = None, data=None) :

        """ 
        Loads the data from the given path
        and stores path and data as instance attributes.
        The data attribute is of type pandas DataFrame.

        Parameters
        ----------
        path : str
            The path to the data file.
        """
        if (data is not None) and (path is None):
            self.data = data
        elif (data is None) and (path is not None):
            self.path = path
            self.data = pd.read_csv(path)
        else :
            print(f'The path is {path}')
            print(f'The data is {repr(data)}')
            raise ValueError("Either path or data should be given, but not both.")

In [167]:
data = DataSet(data = pd.DataFrame([1, 2, 3]))
data.data.head()

Unnamed: 0,0
0,1
1,2
2,3


In [121]:
class CategoricalVariable() :

    """ 
    A class used to represent a categorical variable from some dataset.

    Attributes
    ----------
    column : pandas Series.
        A column from a pandas DataFrame or a standalone column.
    
    Methods
    -------
    ordinal_encode(column: pd.Series, show_mapping=False) -> pd.Series
        Encodes the given column using ordinal encoding.
    
    one_hot_encode(column: pd.Series, show_mapping=False) -> pd.DataFrame
        Encodes the given column using one-hot encoding.
    """


    def __init__(self, column : pd.Series) :
        self.column = column
    

        
    @staticmethod
    def ordinal_encode(column: pd.Series, show_mapping=False) -> pd.Series:
        encoder = OrdinalEncoder()
        encoder_fitted = encoder.fit(pd.DataFrame(column))
        encoded_data = encoder_fitted.transform(pd.DataFrame(column))
        inverse_transformation = encoder_fitted.inverse_transform(encoded_data)
            
        if show_mapping:
            values_mapping = { e.tolist()[0] : t.tolist() for t, e in\
                                        zip(encoded_data, inverse_transformation) }
            return values_mapping
            
        return pd.Series(encoded_data.flatten(), index=column.index, name=column.name)
        

    @staticmethod
    def one_hot_encode(column: pd.Series, show_mapping = False) -> pd.DataFrame:

        encoder = OneHotEncoder(sparse=False)
        fitted_encoder = encoder.fit(pd.DataFrame(column))
        transformed = fitted_encoder.transform(pd.DataFrame(column))
        inverse_transformation = encoder.inverse_transform(transformed)

        if show_mapping :

            values_to_representations = { i.tolist()[0] : t.tolist() for t, i in\
                                        zip(transformed, inverse_transformation) }
            
            return values_to_representations
        
        
        return pd.DataFrame(transformed, columns=fitted_encoder.get_feature_names_out())

    
    def encode_data(self, method, show_mapping=False) -> pd.DataFrame:
        if method == 'ordinal':
            encoded_df = CategoricalVariable.ordinal_encode(self.column, show_mapping=show_mapping)
        elif method == 'one_hot':
            encoded_df = CategoricalVariable.one_hot_encode(self.column, show_mapping=show_mapping)
        else:
            raise ValueError(f"Encoding method {method} not recognized.")
        return encoded_df 



In [122]:
categorical_col = CategoricalVariable(deng_train['city'])

In [123]:
categorical_col.column

0       sj
1       sj
2       sj
3       sj
4       sj
        ..
1451    iq
1452    iq
1453    iq
1454    iq
1455    iq
Name: city, Length: 1456, dtype: object

In [124]:
encoded_data = categorical_col.encode_data('ordinal', show_mapping = True)

In [125]:
encoded_data

{'sj': [1.0], 'iq': [0.0]}

In [126]:
categorical_col.encode_data('ordinal')

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
1451    0.0
1452    0.0
1453    0.0
1454    0.0
1455    0.0
Name: city, Length: 1456, dtype: float64

In [127]:
class DateVariable() :

    """
    A class used to represent a date variable from some dataset.
    
    Attributes
    ----------
    column : pandas Series.
        A column from a pandas DataFrame or a standalone column.
    
    Static Methods
    -------
    parse_string_date(string_date : str) -> pd.Timestamp
        Parses a string date into a pandas Timestamp object.
    
    Methods
    -------
    encode_as_number() -> pd.Series
        Encodes the date as a number, where each day is represented by a unique number.
    """
    
    def __init__(self, column : pd.Series) :
        self.column = column
    
    @staticmethod
    def parse_string_date(string_date : str) :

        date = pd.to_datetime(parser.parse(string_date, fuzzy=True).strftime('%Y-%m-%d'))
        
        return date
        
        
    def encode_as_number(self) :

        years, months, days = 0, 0, 0

        if self.column.dtype == 'datetime64[ns]' :
            pass

        else :
            self.column = self.column.apply(lambda x : DateVariable.parse_string_date(x) if not np.isnan(x) else x)
        
        years, months, days = self.column.dt.year, self.column.dt.month, self.column.dt.day
            
        
        return years * 365 + (months - 1) * 30 + days
    

In [128]:
class NumericVariable() :

    def __init__(self, column : pd.Series) :
        self.column = column
    
    def detect_outlier_iqr(self) :

        q1 = np.percentile(self.column, 25)
        q3 = np.percentile(self.column, 75)
        span = iqr(self.column)
        too_big = span * 1.5 + q3
        too_small = q1 - span * 1.5
        small_outliers_indices = self.column[self.column < too_small].index.tolist()
        big_outliers_indices = self.column[self.column > too_big].index.tolist()
        outliers_indices = small_outliers_indices + big_outliers_indices

        return outliers_indices

In [129]:
class CategoricalData(DataSet) :
    
    def __init__(self, path = None, data = None, max_uniq_vals=10) :
        super().__init__(path, data)
        self.cat_data = self.data.select_dtypes(include='object')
        self.unique_values = self.cat_data.nunique()
        self.cols_to_encode = self.unique_values[self.unique_values <= max_uniq_vals].index.tolist()
        self.cat_data = self.data[self.cols_to_encode]
    
    def encode_data(self, method, show_mapping=False) -> pd.DataFrame:
        
        encoded_data = {}
        
        for column in self.cat_data.columns:
            categorical_col = CategoricalVariable(self.cat_data[column])
            encoded_data[column] = categorical_col.encode_data(method, show_mapping)
        
        df = pd.DataFrame()

        for k, v in encoded_data.items():
            if method == 'ordinal':
                df[k] = v
            elif method == 'one_hot':
                df = pd.concat([df, v], axis=1)
            else:
                raise ValueError(f"Encoding method {method} not recognized.")
        
        return df

In [130]:
class NumericData(DataSet) :
    
    def __init__(self, path = None, data=None) :
        
        super().__init__(path, data)
        self.num_data = self.data.select_dtypes(include='number')
    
    def detect_outliers(self, method = 'iqr', by_column = False) :
            
        outliers = {}
            
        for c in self.num_data.columns :
            
            numeric_col = NumericVariable(self.data[c])
            
            if method == 'iqr' :
                outliers[c] = numeric_col.detect_outlier_iqr()
                
            elif method == 'z_score' :
                pass
            else :
                raise ValueError(f"Outlier detection method {method} not recognized.")
            
        if by_column :
            return outliers

        outlier_indices = []

        for v in outliers.values() :
            outlier_indices += v
            
        return outlier_indices

In [131]:
class DataImputation() :

    def mode_imputation(self, column : pd.Series) :
        
        mode = column.mode()[0]
        
        return column.fillna(mode)
    
    def mean_imputation(self, column : pd.Series) :
            
        mean = column.mean()
            
        return column.fillna(mean)

    def median_imputation(self, column : pd.Series) :
            
        median = column.median()
            
        return column.fillna(median)



In [132]:
deng_train['week_start_date'].head()

0    1990-04-30
1    1990-05-07
2    1990-05-14
3    1990-05-21
4    1990-05-28
Name: week_start_date, dtype: object

In [133]:
daty = DateVariable(pd.to_datetime(deng_train['week_start_date']))
daty.encode_as_number()

0       726470
1       726477
2       726484
3       726491
4       726498
         ...  
1451    733798
1452    733804
1453    733811
1454    733818
1455    733825
Name: week_start_date, Length: 1456, dtype: int64

In [134]:
daty = DateVariable(pd.to_datetime(deng_train['week_start_date']))
daty.encode_as_number()

0       726470
1       726477
2       726484
3       726491
4       726498
         ...  
1451    733798
1452    733804
1453    733811
1454    733818
1455    733825
Name: week_start_date, Length: 1456, dtype: int64

In [135]:
cat_data = CategoricalData(path)

In [136]:
cat_data.unique_values

city                  2
week_start_date    1049
dtype: int64

In [137]:
cat_data.encode_data('one_hot')

Unnamed: 0,city_iq,city_sj
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
1451,1.0,0.0
1452,1.0,0.0
1453,1.0,0.0
1454,1.0,0.0


In [138]:
NumericData(path).data.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [139]:
numeric_data = NumericData(data = pd.DataFrame([*range(10)] + [10 ** 9] + [10 ** 11]))
numeric_data.detect_outliers()

[10, 11]

In [140]:
NumericData(path).data.columns

Index(['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
       'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm'],
      dtype='object')

In [141]:
#self.cat_data = CategoricalData(data = self.data).cat_data
  #      self.num_data = NumericData(data = self.data).num_data

In [176]:
class CleanDataset(CategoricalData, NumericData) :

    def __init__(self, path = None, data = None, date_col_name = None) :
        
        CategoricalData.__init__(self, path = path, data = data)       
        NumericData.__init__(self, path = path, data = data)

        if date_col_name is not None :
            self.date_data = self.data[date_col_name]
    
    def prepare_categoricl_data(self, method = 'one_hot') :
        
        return self.encode_data(method = method) 

    def prepare_numeric_data(self, method = 'iqr', remove_outliers = False) :
        
        if remove_outliers :
            outliers = self.detect_outliers(method)
            indices_keep = [i for i in self.num_data.index if i not in outliers]
            self.num_data = self.num_data.iloc[indices_keep]

            return self.num_data
        
        else :
            
            return self.num_data

    def prepare_date_data(self) :
        
        return DateVariable(self.date_data).encode_as_number()

    
    
    

In [161]:
cnt = True
cnt += 1

In [162]:
cnt

2

In [143]:
path

'datasets\\drivendata\\deng_ai\\dengue_features_train.csv'

In [180]:
CleanDataset(data = deng_train, date_col_name= 'week_start_date').prepare_date_data()

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''