In [1]:
import pandas as pd 
import numpy as np
import sweetviz as sv

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from challenge_tn.data_factory.feature_calendar_transformer import FeatureCalendarTransformer


pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
X_train = pd.read_csv("../../data/Xtrain_hgcGIrA.csv")
y_train = pd.read_csv("../../data/Ytrain_yL5OjS4.csv").filter(["p0q0"])
X_test = pd.read_csv("../../data/Xtest.csv")

print(X_train.shape, X_test.shape)

train_df = pd.concat([X_train, y_train], axis=1)

(31119, 12) (13752, 12)


In [3]:
train_df

Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
0,2019-01-07,1,0,AD,06:00:00,2,,,,0.201,0.138,0.091,0.216
1,2019-01-08,1,0,AD,06:00:00,2,,,,0.204,0.152,0.106,0.216
2,2019-01-10,1,0,AD,06:00:00,2,,,,0.213,0.153,0.111,0.227
3,2019-01-11,1,0,AD,06:00:00,2,,,,0.213,0.152,0.108,0.229
4,2019-01-14,1,0,AD,06:00:00,2,,,,0.210,0.147,0.096,0.225
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31114,2019-05-13,9,0,BE,08:00:00,2,0.152,0.18860,0.157000,0.080,0.100,,0.111
31115,2019-05-14,9,0,BE,08:00:00,2,0.153,0.18040,0.191000,0.089,0.121,,0.143
31116,2019-05-15,9,0,BE,08:00:00,2,0.166,0.14900,0.168000,0.099,0.129,,0.139
31117,2019-03-21,9,0,BE,08:00:00,2,0.182,0.19300,0.162000,0.074,0.101,,0.117


# NaN in hour 

In [4]:
class CleanHourNaNTransformer(BaseEstimator, TransformerMixin):
    
    HOUR_PERIMETER = ["06:00:00", "07:00:00", "08:00:00", "09:00:00"]

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X["hour_float"] = X["hour"].str[:2].astype(float)
        train_hour_min_df = X.groupby(["date", "train"])["hour_float"].min().apply(lambda x: f"{str(int(x)).zfill(2)}:00:00").reset_index(name="hour_float_min")
        X = X.merge(train_hour_min_df, on=["date", "train"], how="left")
        where_first_station = X["p0q1"].isnull() & X["p0q2"].isnull() & X["p0q3"].isnull()
        X.loc[X["hour"].isnull() & where_first_station, "hour"] = X.loc[X["hour"].isnull() & where_first_station, "hour_float_min"]
        X = X[X["hour"].isin(self.HOUR_PERIMETER)]
        X.dropna(subset=["hour"], inplace=True)
        X.drop(columns=["hour_float", "hour_float_min"], inplace=True)
        return X

In [5]:
train_df = CleanHourNaNTransformer().fit_transform(train_df)

# Feature Calendar

In [6]:
from typing import List

class ExtractFeatureCalendarTransformer(FeatureCalendarTransformer):

    CREATED_COLUMNS_TO_DROP: List[str] = [
            "unix_second", "year", "french_holiday_zone_a", "french_holiday_zone_b", "french_holiday_zone_c", 
            "is_weekend", "french_bank_holiday", "french_holiday_zone_at_least_in_one_zone", 
            "days_since_previous_french_bank_holiday", "hour_of_the_week", "hour_of_the_year"
        ]
    
    def __init__(self, include_cyclic_transform: bool = False):
        super().__init__("full_date", True, include_cyclic_transform)

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        self.date_to_use = "full_date"
        self.create_hour_features = True
        X = X.assign(
            full_date=lambda x: x["date"] + " " + x["hour"],
        )
        X["full_date"] = pd.to_datetime(X["full_date"], format="%Y-%m-%d %H:%M:%S")
        X = self.fit(X).transform(X)
        X = X.drop(columns=self.CREATED_COLUMNS_TO_DROP)
        return X

In [7]:
train_df = ExtractFeatureCalendarTransformer().fit_transform(train_df)

# Variation feature 

In [8]:
import itertools

class PqFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    
    P_COLUMN_NAMES: List = ["p0q3", "p0q1", "p0q2"]
    Q_COLUMN_NAMES: List = ["p3q0", "p1q0", "p2q0"]
    PQ_COLUMN_NAMES: List = ["p0q3", "p0q1", "p0q2", "p3q0", "p1q0", "p2q0"]

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X = self.build_variation_pq_feature(X, "p0")
        X = self.build_variation_pq_feature(X, "q0")
        X = self.build_variation_pq_per_day_station(X)
        X = self.build_stats_features_on_pq(X)
        X = self.extract_nan_information_from_pq_features(X)
        X = self.replace_nan_by_zero(X)
        return X

    def build_variation_pq_feature(self, df: pd.DataFrame, which_target: str="p0")->pd.DataFrame: 
        if which_target == "p0":
            n1, n2, n3 = f"{which_target}q1", f"{which_target}q2", f"{which_target}q3"
        elif which_target == "q0":
            n1, n2, n3 = f"p1{which_target}", f"p2{which_target}", f"p3{which_target}"
        else:
            raise ValueError("'q0' or 'p0'")
        df[f"{which_target}_list"] = list(zip(df[n1].fillna(0), df[n2].fillna(0), df[n3].fillna(0)))
    
        df[f"{which_target}_combination_list"] = [
            list(itertools.combinations(elt, 2))
            for elt in df[f"{which_target}_list"]
        ]
        df[f"{which_target}_variation_max"] = [
            np.max([abs(a - b) for a, b in elt])
            for elt in df[f"{which_target}_combination_list"]
        ]
        df[f"{which_target}_variation_min"] = [
            np.min([abs(a - b) for a, b in elt])
            for elt in df[f"{which_target}_combination_list"]
        ]
        df.drop(columns=[f"{which_target}_list", f"{which_target}_combination_list"], inplace=True)
        return df
    
    def build_variation_pq_per_day_station(self, df: pd.DataFrame)->pd.DataFrame:
        variation_per_day_station = df.groupby(["station", "day_of_the_week", "hour"]).agg({"q0_variation_max" : np.mean, "q0_variation_min" : np.mean, "p0_variation_max" : np.mean, "p0_variation_min": np.mean}).reset_index()
        variation_per_day_station["station_variation_p0"] = variation_per_day_station[["p0_variation_max", "p0_variation_min"]].mean(axis=1)
        variation_per_day_station["station_variation_q0"] = variation_per_day_station[["q0_variation_max", "q0_variation_min"]].mean(axis=1)
        variation_per_day_station = variation_per_day_station[["station", "day_of_the_week", "hour", "station_variation_p0", "station_variation_q0"]]
        df = df.merge(variation_per_day_station, on=["station", "day_of_the_week", "hour"], how="left")
        return df
    
    def build_stats_features_on_pq(self, df: pd.DataFrame)->pd.DataFrame:
        df["mean_p0"] = df[self.P_COLUMN_NAMES].mean(axis=1, skipna=True).fillna(0)
        df["mean_q0"] = df[self.Q_COLUMN_NAMES].mean(axis=1, skipna=True).fillna(0)
        df["median_p0"] = df[self.P_COLUMN_NAMES].median(axis=1, skipna=True).fillna(0)
        df["median_q0"] = df[self.Q_COLUMN_NAMES].median(axis=1, skipna=True).fillna(0)
        df["std_p0"] = df[self.P_COLUMN_NAMES].std(axis=1, skipna=True).fillna(0)
        df["std_q0"] = df[self.Q_COLUMN_NAMES].std(axis=1, skipna=True).fillna(0)
        return df
    
    def extract_nan_information_from_pq_features(self, df: pd.DataFrame)->pd.DataFrame:
        df["info_missing_p0"] = df["p0q1"].isnull().astype(int) + df["p0q2"].isnull().astype(int) + df["p0q3"].isnull().astype(int)
        df["info_missing_q0"] = df["p1q0"].isnull().astype(int) + df["p2q0"].isnull().astype(int) + df["p3q0"].isnull().astype(int)
        df["start_q0"] = (df["info_missing_q0"] == 3).astype(int)
        df["start_p0"] = (df["info_missing_p0"] == 3).astype(int)
        return df
    
    def replace_nan_by_zero(self, df: pd.DataFrame)->pd.DataFrame:
        for pq_col in self.PQ_COLUMN_NAMES:
            df[pq_col] = df[pq_col].fillna(0)
        return df

In [9]:
train_df = PqFeatureEngineeringTransformer().fit_transform(train_df)

In [10]:
print(train_df.shape)
train_df.head()

(30832, 41)


Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,full_date,day_of_the_week,day_of_the_month,day_of_the_year,week_of_the_year,month_of_the_year,quarter,european_bank_holiday_target2,days_until_next_french_bank_holiday,distance_in_days_from_french_bank_holiday,hour_of_the_day,hour_of_the_month,p0_variation_max,p0_variation_min,q0_variation_max,q0_variation_min,station_variation_p0,station_variation_q0,mean_p0,mean_q0,median_p0,median_q0,std_p0,std_q0,info_missing_p0,info_missing_q0,start_q0,start_p0
0,2019-01-07,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.201,0.138,0.091,0.216,2019-01-07 06:00:00,1.0,7.0,7.0,2.0,1.0,1.0,0.0,105.0,6.0,6.0,174.0,0.11,0.047,0.0,0.0,0.07963,0.037304,0.143333,0.0,0.138,0.0,0.055194,0.0,0,3,1,0
1,2019-01-08,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.204,0.152,0.106,0.216,2019-01-08 06:00:00,2.0,8.0,8.0,2.0,1.0,1.0,0.0,104.0,7.0,6.0,198.0,0.098,0.046,0.0,0.0,0.0839,0.053212,0.154,0.0,0.152,0.0,0.049031,0.0,0,3,1,0
2,2019-01-10,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.213,0.153,0.111,0.227,2019-01-10 06:00:00,4.0,10.0,10.0,2.0,1.0,1.0,0.0,102.0,9.0,6.0,246.0,0.102,0.042,0.0,0.0,0.078783,0.048217,0.159,0.0,0.153,0.0,0.051264,0.0,0,3,1,0
3,2019-01-11,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.213,0.152,0.108,0.229,2019-01-11 06:00:00,5.0,11.0,11.0,2.0,1.0,1.0,0.0,101.0,10.0,6.0,270.0,0.105,0.044,0.0,0.0,0.074761,0.059761,0.157667,0.0,0.152,0.0,0.052729,0.0,0,3,1,0
4,2019-01-14,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.21,0.147,0.096,0.225,2019-01-14 06:00:00,1.0,14.0,14.0,3.0,1.0,1.0,0.0,98.0,13.0,6.0,342.0,0.114,0.051,0.0,0.0,0.07963,0.037304,0.151,0.0,0.147,0.0,0.057105,0.0,0,3,1,0


# Process 

In [11]:
class StationFeatureTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X = self.build_nb_distinct_station_per_train(X)
        return X
    
    def build_nb_distinct_station_per_train(self, df: pd.DataFrame) -> pd.DataFrame:
        station_per_train = df.groupby(["train"]).agg({"station": "nunique"}).reset_index().rename(columns={"station" : "nb_distinct_station_per_train"})
        df = df.merge(station_per_train, on=["train"], how="left")
        return df

In [12]:
train_df = StationFeatureTransformer().fit_transform(train_df)

In [13]:
print(train_df.shape)
train_df.tail()

(30832, 42)


Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,full_date,day_of_the_week,day_of_the_month,day_of_the_year,week_of_the_year,month_of_the_year,quarter,european_bank_holiday_target2,days_until_next_french_bank_holiday,distance_in_days_from_french_bank_holiday,hour_of_the_day,hour_of_the_month,p0_variation_max,p0_variation_min,q0_variation_max,q0_variation_min,station_variation_p0,station_variation_q0,mean_p0,mean_q0,median_p0,median_q0,std_p0,std_q0,info_missing_p0,info_missing_q0,start_q0,start_p0,nb_distinct_station_per_train
30827,2019-05-13,9,0,BE,08:00:00,2,0.152,0.1886,0.157,0.08,0.1,0.0,0.111,2019-05-13 08:00:00,1.0,13.0,133.0,20.0,5.0,2.0,0.0,17.0,5.0,8.0,320.0,0.1,0.02,0.0366,0.005,0.07399,0.030115,0.09,0.165867,0.09,0.157,0.014142,0.019846,1,0,0,0,11
30828,2019-05-14,9,0,BE,08:00:00,2,0.153,0.1804,0.191,0.089,0.121,0.0,0.143,2019-05-14 08:00:00,2.0,14.0,134.0,20.0,5.0,2.0,0.0,16.0,6.0,8.0,344.0,0.121,0.032,0.038,0.0106,0.075556,0.031389,0.105,0.1748,0.105,0.1804,0.022627,0.019609,1,0,0,0,11
30829,2019-05-15,9,0,BE,08:00:00,2,0.166,0.149,0.168,0.099,0.129,0.0,0.139,2019-05-15 08:00:00,3.0,15.0,135.0,20.0,5.0,2.0,0.0,15.0,7.0,8.0,368.0,0.129,0.03,0.019,0.002,0.074711,0.024195,0.114,0.161,0.114,0.166,0.021213,0.01044,1,0,0,0,11
30830,2019-03-21,9,0,BE,08:00:00,2,0.182,0.193,0.162,0.074,0.101,0.0,0.117,2019-03-21 08:00:00,4.0,21.0,80.0,12.0,3.0,1.0,0.0,32.0,32.0,8.0,512.0,0.101,0.027,0.031,0.011,0.080326,0.033712,0.0875,0.179,0.0875,0.182,0.019092,0.015716,1,0,0,0,11
30831,2019-01-30,9,0,BE,08:00:00,1,0.173,0.16928,0.187717,0.289,0.354,0.0,0.416,2019-01-30 08:00:00,3.0,30.0,30.0,5.0,1.0,1.0,0.0,82.0,29.0,8.0,728.0,0.354,0.065,0.018437,0.00372,0.074711,0.024195,0.3215,0.176666,0.3215,0.173,0.045962,0.00975,1,0,0,0,11


# Anomalic loading

In [14]:
class AnomalicLoadingTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        mini = np.mean(X["p0q0"]) - np.std(X["p0q0"]) * 1.25
        maxi = np.mean(X["p0q0"]) + np.std(X["p0q0"]) * 1.5
        X["superior_anomalic_loading"] = (X["p0q0"] > maxi).astype(int) # 9 %
        X["inferior_anomalic_loading"] = (X["p0q0"] < mini).astype(int) # 7.6 %
        return X 

In [15]:
train_df = AnomalicLoadingTransformer().fit_transform(train_df)

In [16]:
train_df.head()

Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,full_date,day_of_the_week,day_of_the_month,day_of_the_year,week_of_the_year,month_of_the_year,quarter,european_bank_holiday_target2,days_until_next_french_bank_holiday,distance_in_days_from_french_bank_holiday,hour_of_the_day,hour_of_the_month,p0_variation_max,p0_variation_min,q0_variation_max,q0_variation_min,station_variation_p0,station_variation_q0,mean_p0,mean_q0,median_p0,median_q0,std_p0,std_q0,info_missing_p0,info_missing_q0,start_q0,start_p0,nb_distinct_station_per_train,superior_anomalic_loading,inferior_anomalic_loading
0,2019-01-07,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.201,0.138,0.091,0.216,2019-01-07 06:00:00,1.0,7.0,7.0,2.0,1.0,1.0,0.0,105.0,6.0,6.0,174.0,0.11,0.047,0.0,0.0,0.07963,0.037304,0.143333,0.0,0.138,0.0,0.055194,0.0,0,3,1,0,11,0,0
1,2019-01-08,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.204,0.152,0.106,0.216,2019-01-08 06:00:00,2.0,8.0,8.0,2.0,1.0,1.0,0.0,104.0,7.0,6.0,198.0,0.098,0.046,0.0,0.0,0.0839,0.053212,0.154,0.0,0.152,0.0,0.049031,0.0,0,3,1,0,11,0,0
2,2019-01-10,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.213,0.153,0.111,0.227,2019-01-10 06:00:00,4.0,10.0,10.0,2.0,1.0,1.0,0.0,102.0,9.0,6.0,246.0,0.102,0.042,0.0,0.0,0.078783,0.048217,0.159,0.0,0.153,0.0,0.051264,0.0,0,3,1,0,11,0,0
3,2019-01-11,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.213,0.152,0.108,0.229,2019-01-11 06:00:00,5.0,11.0,11.0,2.0,1.0,1.0,0.0,101.0,10.0,6.0,270.0,0.105,0.044,0.0,0.0,0.074761,0.059761,0.157667,0.0,0.152,0.0,0.052729,0.0,0,3,1,0,11,0,0
4,2019-01-14,1,0,AD,06:00:00,2,0.0,0.0,0.0,0.21,0.147,0.096,0.225,2019-01-14 06:00:00,1.0,14.0,14.0,3.0,1.0,1.0,0.0,98.0,13.0,6.0,342.0,0.114,0.051,0.0,0.0,0.07963,0.037304,0.151,0.0,0.147,0.0,0.057105,0.0,0,3,1,0,11,0,0


# Process before split columns to use 

In [17]:
from sklearn.preprocessing import LabelEncoder

class ProcessingBeforeSplitTransformer(BaseEstimator, TransformerMixin):
    
    COLUMNS_TO_KEEP = []
    # COLUMNS_TO_DROP = ["full_date", "composition", "way", "hour"]
    COLUMNS_TO_DROP = ["date", "full_date", "composition", "way", "hour"]

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    # TODO : 
    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X = X.drop(columns=self.COLUMNS_TO_DROP)
        self.station_le = LabelEncoder()
        X["station"] = self.station_le.fit_transform(X["station"])
        self.station_le_mapping = {elt : idx for idx, elt in enumerate(self.station_le.classes_)}
        return X

In [18]:
train_df = ProcessingBeforeSplitTransformer().fit_transform(train_df)

In [19]:
train_df.head()

Unnamed: 0,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,day_of_the_week,day_of_the_month,day_of_the_year,week_of_the_year,month_of_the_year,quarter,european_bank_holiday_target2,days_until_next_french_bank_holiday,distance_in_days_from_french_bank_holiday,hour_of_the_day,hour_of_the_month,p0_variation_max,p0_variation_min,q0_variation_max,q0_variation_min,station_variation_p0,station_variation_q0,mean_p0,mean_q0,median_p0,median_q0,std_p0,std_q0,info_missing_p0,info_missing_q0,start_q0,start_p0,nb_distinct_station_per_train,superior_anomalic_loading,inferior_anomalic_loading
0,1,3,0.0,0.0,0.0,0.201,0.138,0.091,0.216,1.0,7.0,7.0,2.0,1.0,1.0,0.0,105.0,6.0,6.0,174.0,0.11,0.047,0.0,0.0,0.07963,0.037304,0.143333,0.0,0.138,0.0,0.055194,0.0,0,3,1,0,11,0,0
1,1,3,0.0,0.0,0.0,0.204,0.152,0.106,0.216,2.0,8.0,8.0,2.0,1.0,1.0,0.0,104.0,7.0,6.0,198.0,0.098,0.046,0.0,0.0,0.0839,0.053212,0.154,0.0,0.152,0.0,0.049031,0.0,0,3,1,0,11,0,0
2,1,3,0.0,0.0,0.0,0.213,0.153,0.111,0.227,4.0,10.0,10.0,2.0,1.0,1.0,0.0,102.0,9.0,6.0,246.0,0.102,0.042,0.0,0.0,0.078783,0.048217,0.159,0.0,0.153,0.0,0.051264,0.0,0,3,1,0,11,0,0
3,1,3,0.0,0.0,0.0,0.213,0.152,0.108,0.229,5.0,11.0,11.0,2.0,1.0,1.0,0.0,101.0,10.0,6.0,270.0,0.105,0.044,0.0,0.0,0.074761,0.059761,0.157667,0.0,0.152,0.0,0.052729,0.0,0,3,1,0,11,0,0
4,1,3,0.0,0.0,0.0,0.21,0.147,0.096,0.225,1.0,14.0,14.0,3.0,1.0,1.0,0.0,98.0,13.0,6.0,342.0,0.114,0.051,0.0,0.0,0.07963,0.037304,0.151,0.0,0.147,0.0,0.057105,0.0,0,3,1,0,11,0,0


# Split 

In [20]:
from challenge_tn.modules.models.split import SplitTransformer

In [21]:
cols_for_split = ["train"]

res_df = SplitTransformer(
    train_size = 0.85,
    splitting_keys = cols_for_split, 
    random_state = 555,
).fit_transform(train_df.sort_values(cols_for_split))

In [22]:
res_df.keys()

dict_keys(['train_sets', 'valid_sets'])

In [23]:
from typing import List, Optional
from challenge_tn.modules.models.split import SplitTransformer

class OptimizationSplitTransformer(SplitTransformer):
    
    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X = X.sort_values(self.splitting_keys)
        sets_dict_first_split = self.fit(X).transform(X)
        valid_df = sets_dict_first_split["valid_sets"].copy()
        self.init_train_size = self.train_size
        self.train_size = 0.5
        sets_dict_second_split = self.fit(valid_df).transform(valid_df)
        self.train_size = self.init_train_size
        return {
            "train_sets" : sets_dict_first_split["train_sets"],
            "valid_sets" : sets_dict_second_split["train_sets"],
            "test_sets" : sets_dict_second_split["valid_sets"],
        }

In [63]:
cols_for_split = ["train"]

res_df = OptimizationSplitTransformer(
    train_size = 0.7,
    splitting_keys = cols_for_split, 
    random_state = 555,
).fit_transform(train_df)

In [64]:
res_df["train_sets"]["p0q0"]
res_df["train_sets"].drop(columns=["p0q0"])

Unnamed: 0,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,day_of_the_week,day_of_the_month,day_of_the_year,week_of_the_year,month_of_the_year,quarter,european_bank_holiday_target2,days_until_next_french_bank_holiday,distance_in_days_from_french_bank_holiday,hour_of_the_day,hour_of_the_month,p0_variation_max,p0_variation_min,q0_variation_max,q0_variation_min,station_variation_p0,station_variation_q0,mean_p0,mean_q0,median_p0,median_q0,std_p0,std_q0,info_missing_p0,info_missing_q0,start_q0,start_p0,nb_distinct_station_per_train,superior_anomalic_loading,inferior_anomalic_loading
0,1,3,0.000000,0.0000,0.00000,0.201,0.138,0.091,1.0,7.0,7.0,2.0,1.0,1.0,0.0,105.0,6.0,6.0,174.0,0.110,0.047,0.000000,0.000000,0.079630,0.037304,0.143333,0.000000,0.138,0.0000,0.055194,0.000000,0,3,1,0,11,0,0
421,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,5.0,1.0,32.0,5.0,2.0,1.0,0.0,80.0,31.0,6.0,30.0,0.000,0.000,0.000000,0.000000,0.000000,0.020368,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1
422,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,1.0,4.0,35.0,6.0,2.0,1.0,0.0,77.0,34.0,6.0,102.0,0.000,0.000,0.000000,0.000000,0.000000,0.019347,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1
423,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,2.0,5.0,36.0,6.0,2.0,1.0,0.0,76.0,35.0,6.0,126.0,0.000,0.000,0.000000,0.000000,0.000000,0.020387,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1
424,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,3.0,6.0,37.0,6.0,2.0,1.0,0.0,75.0,36.0,6.0,150.0,0.000,0.000,0.000000,0.000000,0.000000,0.022441,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27297,54,6,0.375000,0.3720,0.44600,0.271,0.228,0.204,5.0,11.0,11.0,2.0,1.0,1.0,0.0,101.0,10.0,8.0,272.0,0.067,0.024,0.074000,0.003000,0.069111,0.086544,0.234333,0.397667,0.228,0.3750,0.033946,0.041885,0,0,0,0,9,0,0
27294,54,6,0.423000,0.3810,0.52191,0.275,0.238,0.203,2.0,8.0,8.0,2.0,1.0,1.0,0.0,104.0,7.0,8.0,200.0,0.072,0.035,0.140910,0.042000,0.081412,0.099386,0.238667,0.441970,0.238,0.4230,0.036005,0.072345,0,0,0,0,9,0,0
27293,54,2,0.250000,0.3520,0.48300,0.244,0.132,0.208,4.0,21.0,80.0,12.0,3.0,1.0,0.0,32.0,32.0,8.0,512.0,0.112,0.036,0.233000,0.102000,0.059480,0.112328,0.194667,0.361667,0.208,0.3520,0.057178,0.116800,0,0,0,0,9,0,0
27292,54,2,0.270954,0.3614,0.46080,0.214,0.100,0.184,3.0,17.0,107.0,16.0,4.0,2.0,0.0,5.0,5.0,8.0,416.0,0.114,0.030,0.189846,0.090446,0.050705,0.102882,0.166000,0.364385,0.184,0.3614,0.059093,0.094958,0,0,0,0,9,0,0


# Create basic regression feature for PQ

In [65]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

def run_metrics(y_test, y_pred):
    return {  
        "MeanAE" : mean_absolute_error(y_test, y_pred),
        "MedAE" : median_absolute_error(y_test, y_pred),
        "R2" : r2_score(y_test, y_pred),
        "MSE" : mean_squared_error(y_test, y_pred),
        "RMSE" : np.sqrt(mean_squared_error(y_test, y_pred)),
    }
    
class SimpleRegressionFeaturesTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, res_df: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, res_df: pd.DataFrame, y=None) -> pd.DataFrame:
        self.retrive_datasets_for_training(res_df)
        models_informations = {}
        for idx in range(1, 4):
            lreg = LinearRegression()
            model_name = f"p0q0_hat_from_var_{idx}"
            lreg = lreg.fit(self.X_train[[f"p{idx}q0", f"p0q{idx}"]], self.y_train)
            self.X_train[model_name] = lreg.predict(self.X_train[[f"p{idx}q0", f"p0q{idx}"]])
            self.X_valid[model_name] = lreg.predict(self.X_valid[[f"p{idx}q0", f"p0q{idx}"]])
            self.X_test[model_name] = lreg.predict(self.X_test[[f"p{idx}q0", f"p0q{idx}"]])
            models_informations[model_name] = {
                "model" : lreg,
                "performance_valid" : run_metrics(self.y_valid, self.X_valid[model_name]),
                "performance_test" : run_metrics(self.y_test, self.X_test[model_name]),
            }
        return {
            "train_sets" : pd.concat([self.X_train, self.y_train], axis=1),
            "valid_sets" : pd.concat([self.X_valid, self.y_valid], axis=1),
            "test_sets" : pd.concat([self.X_test, self.y_test], axis=1),
            "simple_regression" : models_informations,
        } 
    
    def retrive_datasets_for_training(self, res_df):
        self.y_train = res_df["train_sets"]["p0q0"]
        self.X_train = res_df["train_sets"].drop(columns=["p0q0"])
        self.y_valid = res_df["valid_sets"]["p0q0"]
        self.X_valid = res_df["valid_sets"].drop(columns=["p0q0"])
        self.y_test = res_df["test_sets"]["p0q0"]
        self.X_test = res_df["test_sets"].drop(columns=["p0q0"])


In [66]:
final_res_df = SimpleRegressionFeaturesTransformer().fit_transform(res_df)

In [67]:
final_res_df.keys()

dict_keys(['train_sets', 'valid_sets', 'test_sets', 'simple_regression'])

In [68]:
final_res_df["train_sets"]

Unnamed: 0,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,day_of_the_week,day_of_the_month,day_of_the_year,week_of_the_year,month_of_the_year,quarter,european_bank_holiday_target2,days_until_next_french_bank_holiday,distance_in_days_from_french_bank_holiday,hour_of_the_day,hour_of_the_month,p0_variation_max,p0_variation_min,q0_variation_max,q0_variation_min,station_variation_p0,station_variation_q0,mean_p0,mean_q0,median_p0,median_q0,std_p0,std_q0,info_missing_p0,info_missing_q0,start_q0,start_p0,nb_distinct_station_per_train,superior_anomalic_loading,inferior_anomalic_loading,p0q0_hat_from_var_1,p0q0_hat_from_var_2,p0q0_hat_from_var_3,p0q0
0,1,3,0.000000,0.0000,0.00000,0.201,0.138,0.091,1.0,7.0,7.0,2.0,1.0,1.0,0.0,105.0,6.0,6.0,174.0,0.110,0.047,0.000000,0.000000,0.079630,0.037304,0.143333,0.000000,0.138,0.0000,0.055194,0.000000,0,3,1,0,11,0,0,0.248465,0.189154,0.152601,0.216
421,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,5.0,1.0,32.0,5.0,2.0,1.0,0.0,80.0,31.0,6.0,30.0,0.000,0.000,0.000000,0.000000,0.000000,0.020368,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1,0.051669,0.071772,0.087517,0.044
422,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,1.0,4.0,35.0,6.0,2.0,1.0,0.0,77.0,34.0,6.0,102.0,0.000,0.000,0.000000,0.000000,0.000000,0.019347,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1,0.051669,0.071772,0.087517,0.043
423,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,2.0,5.0,36.0,6.0,2.0,1.0,0.0,76.0,35.0,6.0,126.0,0.000,0.000,0.000000,0.000000,0.000000,0.020387,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1,0.051669,0.071772,0.087517,0.047
424,1,23,0.000000,0.0000,0.00000,0.000,0.000,0.000,3.0,6.0,37.0,6.0,2.0,1.0,0.0,75.0,36.0,6.0,150.0,0.000,0.000,0.000000,0.000000,0.000000,0.022441,0.000000,0.000000,0.000,0.0000,0.000000,0.000000,3,3,1,1,11,0,1,0.051669,0.071772,0.087517,0.047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27297,54,6,0.375000,0.3720,0.44600,0.271,0.228,0.204,5.0,11.0,11.0,2.0,1.0,1.0,0.0,101.0,10.0,8.0,272.0,0.067,0.024,0.074000,0.003000,0.069111,0.086544,0.234333,0.397667,0.228,0.3750,0.033946,0.041885,0,0,0,0,9,0,0,0.349459,0.314156,0.335651,0.342
27294,54,6,0.423000,0.3810,0.52191,0.275,0.238,0.203,2.0,8.0,8.0,2.0,1.0,1.0,0.0,104.0,7.0,8.0,200.0,0.072,0.035,0.140910,0.042000,0.081412,0.099386,0.238667,0.441970,0.238,0.4230,0.036005,0.072345,0,0,0,0,9,0,0,0.357530,0.323834,0.352336,0.351
27293,54,2,0.250000,0.3520,0.48300,0.244,0.132,0.208,4.0,21.0,80.0,12.0,3.0,1.0,0.0,32.0,32.0,8.0,512.0,0.112,0.036,0.233000,0.102000,0.059480,0.112328,0.194667,0.361667,0.208,0.3520,0.057178,0.116800,0,0,0,0,9,0,0,0.312204,0.229894,0.346993,0.279
27292,54,2,0.270954,0.3614,0.46080,0.214,0.100,0.184,3.0,17.0,107.0,16.0,4.0,2.0,0.0,5.0,5.0,8.0,416.0,0.114,0.030,0.189846,0.090446,0.050705,0.102882,0.166000,0.364385,0.184,0.3614,0.059093,0.094958,0,0,0,0,9,0,0,0.284645,0.203899,0.324740,0.252


# Sup - Min : prediction feature

In [69]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, average_precision_score, confusion_matrix, roc_auc_score, f1_score


def run_metrics(y_test, y_pred):
    return {  
        "MeanAE" : mean_absolute_error(y_test, y_pred),
        "MedAE" : median_absolute_error(y_test, y_pred),
        "R2" : r2_score(y_test, y_pred),
        "MSE" : mean_squared_error(y_test, y_pred),
        "RMSE" : np.sqrt(mean_squared_error(y_test, y_pred)),
    }
    
def run_metrics_clf(y_true, y_pred):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    return {
        "accuracy_score": accuracy_score(y_true, y_pred),
        "precision_score": precision,
        "recall_score": recall,
        "f1_score": f1_score(y_true, y_pred),
        f"f{0.5}_score": ((1 + 0.5 ** 2) * precision * recall) / (0.5 ** 2 * precision + recall),
        f"f{2}_score": ((1 + 2 ** 2) * precision * recall) / (2 ** 2 * precision + recall),
    }
    
class AnomalicLoadingFeaturesTransformer(BaseEstimator, TransformerMixin):
    
    COLS_MIN_LOADING = ["p0q1", "p0q2", "p0q3", "station", "info_missing_p0", "start_p0", "station_variation_p0", "p0_variation_min", "p0_variation_max", "std_p0"]
    COLS_MAX_LOADING = ["p0q1", "p0q2", "p0q3", "p1q0", "p2q0", "p3q0", "mean_p0", "mean_q0", "station", "p0_variation_min", "station_variation_p0"]
    
    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, res_df: pd.DataFrame, y=None) -> pd.DataFrame:
        self.retrive_datasets_for_training(res_df)
        models_informations = {}
        for model_name in ["inferior_anomalic_loading", "superior_anomalic_loading"]:
            cols_to_keep = self.COLS_MIN_LOADING if model_name == "inferior_anomalic_loading" else self.COLS_MAX_LOADING
            logreg = LogisticRegression()
            logreg = logreg.fit(self.X_train.filter(cols_to_keep), self.X_train[model_name])
            predict_name = model_name + "_hat"
            y_valid_hat = logreg.predict(self.X_valid.filter(cols_to_keep))
            y_test_hat = logreg.predict(self.X_test.filter(cols_to_keep))
            models_informations[predict_name] = {
                "model" : logreg,
                "performance_valid" : run_metrics_clf(self.X_valid[model_name], y_valid_hat),
                "performance_test" : run_metrics_clf(self.X_test[model_name], y_test_hat),
            }
            self.X_valid[model_name] = y_valid_hat
            self.X_test[model_name] = y_test_hat
        return {**res_df , **{"anomalic_loading" : models_informations}}
    
    def retrive_datasets_for_training(self, res_df):
        self.y_train = res_df["train_sets"]["p0q0"]
        self.X_train = res_df["train_sets"].drop(columns=["p0q0"])
        self.y_valid = res_df["valid_sets"]["p0q0"]
        self.X_valid = res_df["valid_sets"].drop(columns=["p0q0"])
        self.y_test = res_df["test_sets"]["p0q0"]
        self.X_test = res_df["test_sets"].drop(columns=["p0q0"])


In [70]:
final_result = AnomalicLoadingFeaturesTransformer().fit_transform(final_res_df)

In [85]:
final_result["valid_sets"].filter(["p0q1", "p0q2", "p0q3", "station", "info_missing_p0", "start_p0", "station_variation_p0", "p0_variation_min", "p0_variation_max", "std_p0"])

Unnamed: 0,p0q1,p0q2,p0q3,station,info_missing_p0,start_p0,station_variation_p0,p0_variation_min,p0_variation_max,std_p0
29366,0.000,0.000,0.000,23,3,1,0.000000,0.000,0.000,0.000000
29371,0.000,0.000,0.000,23,3,1,0.000000,0.000,0.000,0.000000
29367,0.000,0.000,0.000,23,3,1,0.000000,0.000,0.000,0.000000
29368,0.000,0.000,0.000,23,3,1,0.000000,0.000,0.000,0.000000
29369,0.000,0.000,0.000,23,3,1,0.000000,0.000,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...
21056,0.089,0.131,0.070,7,0,0,0.044115,0.019,0.061,0.031214
21070,0.131,0.186,0.225,9,0,0,0.041862,0.039,0.094,0.047226
21069,0.080,0.127,0.063,7,0,0,0.041063,0.017,0.064,0.033151
21068,0.082,0.130,0.067,7,0,0,0.043192,0.015,0.063,0.032909


In [87]:
final_result["anomalic_loading"].keys()

dict_keys(['inferior_anomalic_loading_hat', 'superior_anomalic_loading_hat'])

In [96]:


final_result["anomalic_loading"]["inferior_anomalic_loading_hat"]["model"].predict_proba(final_result["valid_sets"].filter(AnomalicLoadingFeaturesTransformer().COLS_MIN_LOADING))


array([[5.99029214e-01, 4.00970786e-01],
       [5.59481863e-01, 4.40518137e-01],
       [5.74258309e-01, 4.25741691e-01],
       ...,
       [9.62463413e-01, 3.75365867e-02],
       [9.65152861e-01, 3.48471390e-02],
       [9.99223317e-01, 7.76683207e-04]])

In [71]:
final_result.keys()

dict_keys(['train_sets', 'valid_sets', 'test_sets', 'simple_regression', 'anomalic_loading'])

In [72]:
final_result["valid_sets"].inferior_anomalic_loading.value_counts(normalize=True)

0    0.917423
1    0.082577
Name: inferior_anomalic_loading, dtype: float64

In [73]:
final_result["train_sets"].inferior_anomalic_loading.value_counts(normalize=True)

0    0.915207
1    0.084793
Name: inferior_anomalic_loading, dtype: float64

In [74]:
final_result["test_sets"].inferior_anomalic_loading.value_counts(normalize=True)

0    0.975779
1    0.024221
Name: inferior_anomalic_loading, dtype: float64

In [75]:
final_result["anomalic_loading"]

{'inferior_anomalic_loading_hat': {'model': LogisticRegression(),
  'performance_valid': {'accuracy_score': 0.9474708171206225,
   'precision_score': 1.0,
   'recall_score': 0.36387434554973824,
   'f1_score': 0.5335892514395394,
   'f0.5_score': 0.7409381663113007,
   'f2_score': 0.4169166166766647},
  'performance_test': {'accuracy_score': 0.9848615916955017,
   'precision_score': 0.6329113924050633,
   'recall_score': 0.8928571428571429,
   'f1_score': 0.7407407407407407,
   'f0.5_score': 0.6720430107526881,
   'f2_score': 0.8250825082508252}},
 'superior_anomalic_loading_hat': {'model': LogisticRegression(),
  'performance_valid': {'accuracy_score': 0.9617380025940337,
   'precision_score': 0.8441247002398081,
   'recall_score': 0.7586206896551724,
   'f1_score': 0.7990919409761634,
   'f0.5_score': 0.8255159474671668,
   'f2_score': 0.774307083150022},
  'performance_test': {'accuracy_score': 0.9630190311418685,
   'precision_score': 0.9176755447941889,
   'recall_score': 0.734496

# Sup - Min : prediction feature

In [93]:
def build_extremum_to_predict(df: pd.DataFrame)->pd.DataFrame:
        mini = np.mean(df["p0q0"]) - np.std(df["p0q0"]) * 1.25
        maxi = np.mean(df["p0q0"]) + np.std(df["p0q0"]) * 1.5
        df["sup_charge"] = (df["p0q0"] > maxi).astype(int) # 9 %
        df["min_charge"] = (df["p0q0"] < mini).astype(int) # 7.6 %
        return df

In [95]:
analysis_df = build_extremum_to_predict(train_df)#[["min_charge", "sup_charge"]].value_counts(normalize=True)

# Simple Regression

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

def run_metrics(y_test, y_pred):
    return {  
        "MeanAE" : mean_absolute_error(y_test, y_pred),
        "MedAE" : median_absolute_error(y_test, y_pred),
        "R2" : r2_score(y_test, y_pred),
        "MSE" : mean_squared_error(y_test, y_pred),
        "RMSE" : np.sqrt(mean_squared_error(y_test, y_pred)),
    }
    
class SimpleRegressionFeaturesTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        for idx in range(1, 4):
            lreg = LinearRegression()
            lreg = lreg.fit(X_train[[f"p{idx}q0", f"p0q{idx}"]], y_train)
            X_train[f"p0q0_hat_from_var_{idx}"] = lreg.predict(X_train[[f"p{idx}q0", f"p0q{idx}"]].fillna(0))
            X_valid[f"p0q0_hat_from_var_{idx}"] = lreg.predict(X_valid[[f"p{idx}q0", f"p0q{idx}"]].fillna(0))
            print(f"Modèle {idx} :", run_metrics(y_valid, X_valid[f"p0q0_hat_from_var_{idx}"]))
        return self 

# SweetViz

In [96]:
import sweetviz as sv

my_report = sv.analyze(analysis_df, target_feat = 'p0q0')
my_report.show_html(filepath=f'./sweetviz_analysis_p0q0.html')

my_report = sv.analyze(analysis_df, target_feat = 'sup_charge')
my_report.show_html(filepath=f'./sweetviz_analysis_sup_charge.html')

my_report = sv.analyze(analysis_df, target_feat = 'min_charge')
my_report.show_html(filepath=f'./sweetviz_analysis_min_charge.html')

  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.

Report ./sweetviz_analysis_p0q0.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["

Report ./sweetviz_analysis_sup_charge.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["

Report ./sweetviz_analysis_min_charge.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
