In [1]:
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
adult_data = pd.read_csv(url, names=columns)
adult_data["marital-status"] = adult_data["marital-status"].replace(
    {" Married-AF-spouse": " Married-civ-spouse"}
)
adult_data["native-country"] = adult_data["native-country"].mask(
    adult_data["native-country"].map(
        adult_data["native-country"].value_counts(normalize=True)
    )
    < 0.02,
    "Other",
)
adult_data["occupation"] = adult_data["occupation"].mask(
    adult_data["occupation"].map(adult_data["occupation"].value_counts(normalize=True))
    < 0.04,
    "Other",
)
adult_data.head()


Unnamed: 0,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Other,<=50K


In [4]:
import dataframe_image as dfi

# Vous pouvez augmenter la résolution (dpi) pour une image plus grande
dfi.export(adult_data.head(), 'adul_data.png', dpi=400,fontsize=40)


In [2]:

#Target
target='income'
#Numerical features
num_features=[col for col in columns if adult_data[col].dtypes in ['int64'] and col not in target]
#Value to replace outliers
outlier_value = -999.001

In [3]:
from scorescanner.preprocessing import (
    multioptbinning,
    outlierdetector,
)
from sklearn.pipeline import Pipeline 

(CVXPY) Feb 19 10:25:18 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.8.3296). Expected < 9.8.0. Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Feb 19 10:25:18 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.8.3296). Expected < 9.8.0. Please open a feature request on cvxpy to enable support for this version.')


In [4]:
# Defining the pipeline steps
pipeline_steps = [
    (
        "outlier_detection",
        outlierdetector(
            columns=num_features,
            method="IQR",
            replacement_method="constant",
            replacement_value=outlier_value,
        ),
    ),
    (
        "optimal_binning",
        multioptbinning(
            variables=num_features,
            target=target,
            target_dtype="binary",
            outlier_value=outlier_value,
        ),
    ),
]

# Creating the pipeline
data_preprocessing_pipeline = Pipeline(steps=pipeline_steps)

# Fitting the pipeline on the data
data_preprocessing_pipeline.fit(adult_data)

# Transforming the data 
adult_data_binned = data_preprocessing_pipeline.transform(adult_data)

#Overview of binned DataFrame
adult_data_binned.head()


Fitting OptimalBinning Models: 100%|██████████| 5/5 [00:00<00:00, 14.28it/s]


Unnamed: 0,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
39,State-gov,"(-inf, 209923.00)",Bachelors,"[12.50, 13.50)",Never-married,Adm-clerical,Not-in-family,White,Male,Special,0.0,"[39.50, 43.50)",United-States,<=50K
50,Self-emp-not-inc,"(-inf, 209923.00)",Bachelors,"[12.50, 13.50)",Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,Special,United-States,<=50K
38,Private,"[209923.00, 242536.50)",HS-grad,"[8.50, 9.50)",Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,"[39.50, 43.50)",United-States,<=50K
53,Private,"[209923.00, 242536.50)",11th,"(-inf, 8.50)",Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,"[39.50, 43.50)",United-States,<=50K
28,Private,"[318978.50, inf)",Bachelors,"[12.50, 13.50)",Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,"[39.50, 43.50)",Other,<=50K


In [5]:

from scorescanner.utils.statistical_metrics import (
    univariate_feature_importance,
    univariate_category_importance,
    calculate_cramers_v_matrix,
    cluster_corr_matrix
)

# Target variable and features list
target = 'income'
features = [col for col in columns if col not in target]

# Calculate univariate feature importance
univariate_importance = univariate_feature_importance(
    df=adult_data_binned, features=features, target_var=target, method="cramerv"
)

# Display the univariate feature importance
univariate_importance



Unnamed: 0,Feature,Univariate_Importance
6,relationship,0.453585
4,marital-status,0.447403
2,education,0.368838
3,education-num,0.363999
5,occupation,0.344285
9,capital-gain,0.266032
8,sex,0.215904
11,hours-per-week,0.182584
0,workclass,0.179208
10,capital-loss,0.138522


In [31]:
import dataframe_image as dfi

# Vous pouvez augmenter la résolution (dpi) pour une image plus grande
dfi.export(
univariate_category_importance(
    df=adult_data_binned, categorical_vars=features, target_var=target
)[0:30].head(), 'category_importance.png', dpi=400,fontsize=40)

In [30]:

univariate_category_importance(
    df=adult_data_binned, categorical_vars=features, target_var=target
)[0:30].head()


Unnamed: 0,Category,Jensen-Shannon Distance,Variable
10,Doctorate,0.36184,education
14,Prof-school,0.356937,education
3,Never-worked,0.302861,workclass
13,Preschool,0.302861,education
8,Without-pay,0.302861,workclass


In [7]:
from scorescanner.utils.plotting import (
    generate_bar_plot,
    plot_woe,
    plot_js,
    plot_corr_matrix
)

In [28]:
import plotly.io as pio

# Générer la figure avec votre fonction personnalisée (assurez-vous que fig est bien une figure Plotly)
fig = generate_bar_plot(
    df=adult_data_binned,
    feature="relationship",
    target_var=target,
    cat_ref=None,
)


# Convertir la figure en image PNG et la sauvegarder directement sur le disque
image_bytes = fig.to_image(format="png", scale=0, width=1200, height=400)

# Écrire l'image dans un fichier
with open("bar_plot.png", "wb") as f:
    f.write(image_bytes)



The reference category:  <=50K
The reference category:  <=50K


In [32]:
import plotly.io as pio

# Supposons que `fig` est votre figure Plotly retournée par la fonction `plot_woe`
fig = plot_woe(
    df=adult_data_binned,
    feature="relationship",
    target_var=target,
    cat_ref=None
)

# Convertir la figure en image PNG et la sauvegarder directement sur le disque
image_bytes = fig.to_image(format="png", scale=0, width=1200, height=400)
# Écrire l'image dans un fichier
with open("woe.png", "wb") as f:
    f.write(image_bytes)


The reference category:  <=50K
The reference category:  <=50K


In [33]:
import plotly.io as pio

# Supposons que `fig` est votre figure Plotly retournée par la fonction `plot_woe`
fig = plot_js(
    df=adult_data_binned,
    feature="relationship",
    target_var= target
    )

# Convertir la figure en image PNG et la sauvegarder directement sur le disque
image_bytes = fig.to_image(format="png", scale=0, width=1200, height=400)
# Écrire l'image dans un fichier
with open("js.png", "wb") as f:
    f.write(image_bytes)

In [34]:
from scorescanner.preprocessing import logisticregressionpreparer

In [35]:

corr_matrix = calculate_cramers_v_matrix(df=adult_data_binned, sampling=False)
corr_matrix_clustered = cluster_corr_matrix(corr_matrix=corr_matrix, threshold=1.7) 
plot_corr_matrix(corr_matrix_clustered)


In [93]:
# Converting features to string
adult_data_binned[features] = adult_data_binned[features].astype(str)
# Dictionary for reference categories
column_dict = {
    "education-num": "(-inf, 8.50)",
    "capital-gain": "0.0",
    "education": " HS-grad",
    
  
    
}
# Initializing the DataPreparerForLogisticRegression
data_preparer = logisticregressionpreparer(
    columns=[col for col in features], column_dict=column_dict
)
# Applying the data preparation steps
prepared_df = data_preparer.fit_transform(adult_data_binned)
#Overview of prepared DataFrame
prepared_df.head()

Unnamed: 0,native-country_Other (vs United-States),"hours-per-week_(-inf, 39.50) (vs [39.50, 43.50))","hours-per-week_Special (vs [39.50, 43.50))","hours-per-week_[43.50, 49.50) (vs [39.50, 43.50))","hours-per-week_[49.50, inf) (vs [39.50, 43.50))",capital-loss_Special (vs 0.0),capital-gain_Special (vs 0.0),sex_ Female (vs Male),race_ Amer-Indian-Eskimo (vs White),race_ Asian-Pac-Islander (vs White),...,"fnlwgt_[318978.50, inf) (vs (-inf, 209923.00))",workclass_ ? (vs State-gov),workclass_ Federal-gov (vs State-gov),workclass_ Local-gov (vs State-gov),workclass_ Never-worked (vs State-gov),workclass_ Private (vs State-gov),workclass_ Self-emp-inc (vs State-gov),workclass_ Self-emp-not-inc (vs State-gov),workclass_ Without-pay (vs State-gov),income
39,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<=50K
50,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,<=50K
38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,<=50K
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,<=50K
28,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,<=50K


In [94]:
import dataframe_image as dfi

# Vous pouvez augmenter la résolution (dpi) pour une image plus grande
dfi.export(
prepared_df.head(), 'prepared_df.png', dpi=400,fontsize=40,max_cols=7)

In [95]:
from scorescanner.feature_selection import variableselector 

In [124]:
# Initializing variableselector class
selector_pearson = variableselector(
    target="income", corr_threshold=0.2, metric="pearson", use_vif=False
)

# Fitting variableselector to data
selector_pearson.fit(prepared_df)

# Selected variables
print("Selected Variables:", selector_pearson.selected_variables)

# Filtering data to selected variables
selected_features_df = selector_pearson.transform(prepared_df)

selected_features_df.head()

Selected Variables: ['marital-status_ Married-civ-spouse (vs  Never-married)', 'education-num_[13.50, inf) (vs (-inf, 8.50))', 'capital-gain_Special (vs 0.0)', 'relationship_ Own-child (vs  Not-in-family)', 'sex_ Female (vs  Male)', 'occupation_ Exec-managerial (vs  Adm-clerical)', 'education-num_[12.50, 13.50) (vs (-inf, 8.50))', 'occupation_ Other-service (vs  Adm-clerical)', 'hours-per-week_[49.50, inf) (vs [39.50, 43.50))', 'workclass_ Self-emp-inc (vs  State-gov)', 'capital-loss_Special (vs 0.0)', 'education-num_[8.50, 9.50) (vs (-inf, 8.50))', 'race_ Black (vs  White)', 'occupation_ Handlers-cleaners (vs  Adm-clerical)', 'education-num_Special (vs (-inf, 8.50))', 'education_ 11th (vs  HS-grad)', 'relationship_ Other-relative (vs  Not-in-family)', 'hours-per-week_[43.50, 49.50) (vs [39.50, 43.50))', 'occupation_ ? (vs  Adm-clerical)', 'workclass_ Private (vs  State-gov)', 'marital-status_ Separated (vs  Never-married)', 'education_ 10th (vs  HS-grad)', 'occupation_ Machine-op-insp

Unnamed: 0,marital-status_ Married-civ-spouse (vs Never-married),"education-num_[13.50, inf) (vs (-inf, 8.50))",capital-gain_Special (vs 0.0),relationship_ Own-child (vs Not-in-family),sex_ Female (vs Male),occupation_ Exec-managerial (vs Adm-clerical),"education-num_[12.50, 13.50) (vs (-inf, 8.50))",occupation_ Other-service (vs Adm-clerical),"hours-per-week_[49.50, inf) (vs [39.50, 43.50))",workclass_ Self-emp-inc (vs State-gov),...,"fnlwgt_[209923.00, 242536.50) (vs (-inf, 209923.00))",occupation_ Craft-repair (vs Adm-clerical),"fnlwgt_Special (vs (-inf, 209923.00))",occupation_Other (vs Adm-clerical),"fnlwgt_[318978.50, inf) (vs (-inf, 209923.00))",workclass_ Without-pay (vs State-gov),race_ Asian-Pac-Islander (vs White),education_ Assoc-voc (vs HS-grad),workclass_ Never-worked (vs State-gov),education_ Assoc-acdm (vs HS-grad)
39,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [125]:
import json
with open("eliminated_variables_info.json", 'r') as file:
    data = json.load(file)
print(json.dumps(data, indent=3))

{
   "marital-status_ Married-civ-spouse (vs  Never-married)": [
      "relationship_ Husband (vs  Not-in-family)"
   ],
   "education-num_[13.50, inf) (vs (-inf, 8.50))": [
      "occupation_ Prof-specialty (vs  Adm-clerical)",
      "education_ Masters (vs  HS-grad)",
      "education_ Prof-school (vs  HS-grad)",
      "education_ Doctorate (vs  HS-grad)"
   ],
   "capital-gain_Special (vs 0.0)": [],
   "relationship_ Own-child (vs  Not-in-family)": [],
   "sex_ Female (vs  Male)": [
      "relationship_ Unmarried (vs  Not-in-family)",
      "marital-status_ Divorced (vs  Never-married)",
      "relationship_ Wife (vs  Not-in-family)"
   ],
   "occupation_ Exec-managerial (vs  Adm-clerical)": [],
   "education-num_[12.50, 13.50) (vs (-inf, 8.50))": [
      "education_ Bachelors (vs  HS-grad)"
   ],
   "occupation_ Other-service (vs  Adm-clerical)": [],
   "hours-per-week_[49.50, inf) (vs [39.50, 43.50))": [],
   "workclass_ Self-emp-inc (vs  State-gov)": [],
   "capital-loss_Special 

In [126]:
import json
from PIL import Image, ImageDraw, ImageFont

# Charger le JSON depuis un fichier
with open("eliminated_variables_info.json", 'r') as file:
    data = json.load(file)

# Convertir le JSON en une chaîne de caractères formatée
json_text = json.dumps(data, indent=2)

# Créer une image de base avec un fond blanc
img = Image.new('RGB', (800, 600), color = (255, 255, 255))

# Initialiser l'objet ImageDraw pour dessiner sur l'image
d = ImageDraw.Draw(img)

# Définir la police et la taille du texte (ajuster le chemin vers une police .ttf si nécessaire)
try:
    font = ImageFont.truetype("arial.ttf", 15)
except IOError:
    font = ImageFont.load_default()

# Dessiner le texte sur l'image (ajuster la position et la largeur du texte selon les besoins)
d.text((10,10), json_text, fill=(0,0,0), font=font)

# Sauvegarder l'image
img.save('json_file.png')


In [127]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    prepared_df[selector_pearson.selected_variables],
    prepared_df["income"],
    test_size=0.1,
    random_state=42,
    stratify=prepared_df["income"],
)

In [128]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver="newton-cholesky", random_state=42)
logreg.fit(X_train, y_train)

In [133]:
from scorescanner.utils.statistical_metrics import logistic_regression_summary

logistic_regression_report = logistic_regression_summary(
    model=logreg,
    X=X_train,
    columns=X_train.columns.tolist(),
    y=y_train,
    intercept=True,
    multi_class=False,
)

logistic_regression_report[0:10]




X does not have valid feature names, but LogisticRegression was fitted with feature names


`product` is deprecated as of NumPy 1.25.0, and will be removed in NumPy 2.0. Please use `prod` instead.



Unnamed: 0,variable,coef,std error,z,P>|z|,[0.025,0.975]
0,intercept,0.04,0.17,-18.74,0.0,0.03,0.06
1,marital-status_ Married-civ-spouse (vs Never-...,9.19,0.04,50.4,0.0,8.43,10.02
2,"education-num_[13.50, inf) (vs (-inf, 8.50))",6.35,0.04,49.9,0.0,5.9,6.83
3,capital-gain_Special (vs 0.0),5.43,0.04,42.69,0.0,5.02,5.87
4,relationship_ Own-child (vs Not-in-family),0.22,0.08,-18.61,0.0,0.18,0.25
15,"education-num_Special (vs (-inf, 8.50))",0.27,0.08,-16.87,0.0,0.23,0.31
11,capital-loss_Special (vs 0.0),3.13,0.03,34.67,0.0,2.93,3.34
7,"education-num_[12.50, 13.50) (vs (-inf, 8.50))",3.09,0.03,41.42,0.0,2.93,3.26
27,education_ 9th (vs HS-grad),0.34,0.18,-6.02,0.0,0.24,0.48
8,occupation_ Other-service (vs Adm-clerical),0.34,0.06,-19.21,0.0,0.3,0.38


In [134]:
import dataframe_image as dfi

# Vous pouvez augmenter la résolution (dpi) pour une image plus grande
dfi.export(

    logistic_regression_report[0:10], 'model_report.png', dpi=400,fontsize=20)