In [1]:
# Step 0. Import libraries, custom modules and logging
# Basics ---------------------------------------------------------------
import logging
import joblib
# Data -----------------------------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
# Machine learning -----------------------------------------------------
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (ConfusionMatrixDisplay,
                             DetCurveDisplay,
                             RocCurveDisplay,
                             accuracy_score,
                             f1_score,
                             mean_absolute_error,
                             mean_squared_error,
                             r2_score)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Logging setup: grab the root logger and set its threshold to INFO
logger = logging.getLogger()
logger.setLevel("INFO")

NameError: name 'logging' is not defined

In [None]:
# Step 1. Load data and get a first view -------------------------------
# 1.1 Read the heights CSV into a DataFrame and print its column summary
df_raw = pd.read_csv(filepath_or_buffer='heights.csv')
df_raw.info()

In [None]:
# 1.2 Show five randomly drawn rows (the fixed seed makes the draw repeatable)
display(df_raw.sample(n=5, random_state=66))

In [5]:
# Step 2. Prepare the dataset for analysis
# 2.1 Make transformations using a pipeline
# Column labels are normalised first: spaces become underscores, any remaining
# non-word characters are removed, everything is lower-cased and labels are
# truncated to 40 characters.
normalized_labels = (
    df_raw.columns
    .str.replace(' ', '_')
    .str.replace(r'\W', '', regex=True)
    .str.lower()
    .str.slice(0, 40)
)
df_interim = (
    df_raw
    .copy()
    .set_axis(normalized_labels, axis=1)
    .rename(columns={'sex': 'target'})    # the class column becomes the target
    .astype({'target': "category"})
    .iloc[:, 1:3]                         # keep only the columns at positions 1 and 2
)

In [None]:
# 2.2 Create the final dataset with the target in front, show result
feature_columns = [name for name in df_interim.columns if name != 'target']
df = df_interim.copy().reindex(columns=['target'] + feature_columns)
df.info()

In [None]:
# Step 3. Perform EDA
# 3.1 Split the dataset: 20% of the rows are held out for testing, with the
# split stratified on the target column and a fixed seed
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["target"],
    random_state=2024,
)
df_train.info()


In [None]:
# Summary statistics of the training set, transposed for readability:
# first the default describe() output, then the categorical columns
for describe_kwargs in ({}, {"include": "category"}):
    display(df_train.describe(**describe_kwargs).T)

In [None]:
# Class balance of the target in the training set: absolute counts side by
# side with relative frequencies, followed by the number of training rows.
table = pd.concat(
    [df_train["target"].value_counts(),
     df_train["target"].value_counts(normalize=True)],  # column is lower-case 'target'
    axis=1,
)
display(table)
# Single quotes inside the f-string keep this valid on Python versions < 3.12
print(f"Sum: {len(df_train['target'])}")

In [None]:
# Draw a histogram of the training-set columns that DataFrame.hist can plot
df_train.hist()

plt.show()

In [None]:
# 3.4 Perform univariate analysis on numerical variables using kde
# A single Axes is requested on purpose: with a 1x2 grid `plt.subplots`
# returns an array of Axes, which cannot be passed as `ax=` and has no
# `axvline` / `grid` methods.
fig, ax = plt.subplots(figsize=(8, 4))
sns.kdeplot(data=df_train, x='height', ax=ax)
# Dashed reference lines: black = mean, green = median
ax.axvline(x=df_train['height'].mean(), color='k', linestyle='--', label='mean')
ax.axvline(x=df_train['height'].median(), color='g', linestyle='--', label='median')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Horizontal bar chart of the class counts of the target in the training set.
# `plt.subplots` (plural) is needed here: it returns the (figure, axes) pair and
# accepts `figsize`, whereas `plt.subplot` returns a lone Axes.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(data=df_train, y="target", ax=ax)
plt.show()

In [None]:
# Pairwise plots of the training columns, coloured by target class, with
# kernel-density estimates on the diagonal
sns.pairplot(data= df_train, diag_kind="kde", hue="target")
plt.show()

In [13]:
# Step 4. Experiment with models
# 4.1 Split features and response
# Features (independent variables) are every column except 'target'; the
# response (dependent variable) is the 'target' column. Indexes are reset so
# that each frame/series starts from 0.
X_train, y_train = (
    df_train.drop(columns='target').reset_index(drop=True),
    df_train['target'].reset_index(drop=True),
)
X_test, y_test = (
    df_test.drop(columns='target').reset_index(drop=True),
    df_test['target'].reset_index(drop=True),
)

In [None]:
# Encode the class labels as integers. The encoder is fitted on the training
# labels only and then reused, unchanged, for the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [None]:
# 4.2 Simple model
# Create a naive model by guessing the outcome, then evaluate it: one class is
# drawn uniformly at random for every training row. A seeded generator is used
# so the reported accuracy is the same after every kernel restart.
naive_rng = np.random.default_rng(2024)
y_naive = naive_rng.choice(
    df_train['target'].unique().tolist(), size=len(df_train)
)
y_naive_enc = label_encoder.transform(y_naive)
print(f'Accuracy: {accuracy_score(y_train_enc, y_naive_enc):.2f}')

In [None]:
# Mean and standard deviation of height for each target class in the training set
df_train.groupby("target", observed=False)["height"].agg(["mean", "std"])

In [None]:
# Threshold rule: label a row "Male" when its height is at least 62, otherwise
# "Female"; then score the rule against the encoded training labels.
meets_threshold = df_train["height"] >= 62
y_interval = np.where(meets_threshold, "Male", "Female").tolist()
y_interval_enc = label_encoder.transform(y_interval)
for metric_label, metric_fn in (("Accuracy", accuracy_score), ("f1 score", f1_score)):
    print(f'{metric_label}: {metric_fn(y_train_enc, y_interval_enc):.3f}')

In [None]:
# Sweep candidate height cut-offs (62..75) and record the training accuracy of
# each "Male if height >= cutoff" rule, one row per cut-off in ascending order.
cutoff_records = []
for cutoff in range(62, 76):
    y_temp = np.where(df_train["height"] >= cutoff, "Male", "Female")
    y_temp_enc = label_encoder.transform(y_temp)
    # Score THIS cut-off's predictions; scoring the fixed 62-rule predictions
    # (y_interval_enc) would give the same accuracy in every row.
    metric = round(accuracy_score(y_train_enc, y_temp_enc), 2)
    cutoff_records.append({"cutoff": cutoff, "accuracy": metric})

# Build the frame once from the collected records instead of concatenating
# row by row inside the loop.
my_table = pd.DataFrame(cutoff_records, columns=["cutoff", "accuracy"])
my_table

In [None]:
# Confusion tables for the "Male if height >= 64" rule on the training set:
# raw counts first, then each row normalised to proportions
y_cutoff = ["Male" if height >= 64 else "Female" for height in df_train["height"]]

pred = pd.DataFrame({"pred": y_cutoff})
for normalize_mode in (False, "index"):
    display(pd.crosstab(y_train, pred["pred"], normalize=normalize_mode))

In [None]:
# Fit a logistic regression with balanced class weights on the training
# features and the integer-encoded training labels
clf_log_reg = LogisticRegression(class_weight="balanced").fit(X_train, y_train_enc)
clf_log_reg

In [None]:
# Predict the encoded class of every test row with the fitted logistic regression
y_pred = clf_log_reg.predict(X_test)

In [None]:
# Preview the first predictions only; echoing the whole array floods the output
y_pred[:10]

In [None]:
# Accuracy of the predictions in y_pred against the encoded test labels
print(f'Accuracy: {accuracy_score(y_test_enc, y_pred):.3f}')

In [None]:
# Confusion tables for the logistic regression on the TEST set: raw counts,
# then each row normalised to proportions.
# The predictions were made from X_test, so they are compared with y_test
# (both have a fresh 0..n-1 index, so rows line up). The integer predictions
# are decoded back to class names so both axes use the same labels.
pred = pd.DataFrame({"pred": label_encoder.inverse_transform(y_pred)})
display(pd.crosstab(y_test, pred["pred"]))
display(pd.crosstab(y_test, pred["pred"], normalize="index"))

In [15]:
# Fit an ordinary least-squares regression on X_train / y_train and predict
# for X_test.
# NOTE(review): at this point in the notebook X_train / y_train hold the
# heights split, whose target was cast to a category in Step 2.1, so this fit
# looks like it cannot work as ordered. The following cells (identity line
# drawn from 70 to 230, the "mort = ... *lat" printout) suggest this section
# was written for a one-feature fit on the skin-cancer data — TODO confirm and
# rebuild X_train / y_train from that dataset before this cell.
# NOTE(review): `y_pred` reuses the name that holds the classifier predictions
# in the cells above.
reg_lin = LinearRegression()
reg_lin.fit(X_train, y_train)
y_pred = reg_lin.predict(X_test)

In [None]:
# Error metrics of the regression predictions on the test set
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")  # mean squared error
print(f"RMSE: {np.sqrt(mse)}")  # square root of the MSE
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")  # mean absolute error
print(f"R2: {r2_score(y_test, y_pred)}")  # coefficient of determination

In [None]:
# Real vs. predicted values on the test set (black points). The red line is the
# identity y = x drawn from 70 to 230; points on it are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=y_pred, c='k')
ax.plot([70, 230], [70, 230], c='r')
ax.axis('equal')
ax.set_xlabel('Real')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
# Residual plot: residual (real minus predicted) against the fitted value,
# with a dashed horizontal reference line at zero
resid = y_test - y_pred
fig, ax = plt.subplots()
ax.scatter(x=y_pred, y=resid)
ax.axhline(0, linestyle="--")
ax.set_xlabel('fitted values')
ax.set_ylabel('residuals')
plt.show()

In [None]:
# Model
# Print the fitted line. The coefficient uses `+.1f`: one decimal like the
# intercept, with an explicit sign so the equation reads correctly for both
# positive and negative slopes (`:1f` only set a minimum width of 1).
print(f"mort = {reg_lin.intercept_:.1f} {reg_lin.coef_[0]:+.1f}*lat")

In [None]:
# Step 1. Load data and get a first view -------------------------------
# 1.1 Read the skin-cancer CSV into a DataFrame and print its column summary
df_raw_1 = pd.read_csv(filepath_or_buffer='skincancer.csv')
df_raw_1.info()

In [21]:
# Step 2. Prepare the dataset for analysis
# 2.1 Make transformations using a pipeline
# Column labels are normalised (spaces -> underscores, other non-word
# characters removed, lower-cased, truncated to 40 characters), 'mort' becomes
# the target, and the first column is dropped.
df_interim_1 = (
    df_raw_1
    .copy()
    .set_axis(
        df_raw_1.columns.str.replace(' ','_')
        .str.replace(r'\W','',regex=True)
        .str.lower()
        .str.slice(0,40), axis=1
    )
    .rename(columns={'mort':'target'})
    # float64 instead of float16: half precision keeps only ~3 significant
    # digits and overflows above 65504, which can corrupt sums/variances of
    # the target in later statistics.
    .astype({'target': np.float64})
    .iloc[:,1:]
)

In [None]:
# 2.2 Create the final dataset with the target in front, show result
feature_columns_1 = [name for name in df_interim_1.columns if name != 'target']
df_1 = df_interim_1.copy().reindex(columns=['target'] + feature_columns_1)
df_1.info()

In [None]:
# Step 3. Perform EDA
# 3.1 Split the dataset: 20% of the rows are held out for testing (fixed seed);
# the training rows are then re-indexed and ordered by the target value
df_trai, df_tes = train_test_split(df_1, test_size=0.2, random_state=2024)
df_trai = (
    df_trai
    .reset_index(drop=True)
    .sort_values(by='target')
)
df_trai.info()

In [None]:
# Summary statistics of the TRAINING rows only, so that exploration done after
# the split does not look at the held-out test rows
display(df_trai.describe().T)

In [None]:
# Draw a histogram of the training-set columns that DataFrame.hist can plot
df_trai.hist()
plt.show()

In [None]:
# Pairwise plots of the training columns with kernel-density estimates on the diagonal
sns.pairplot(data= df_trai, diag_kind="kde")
plt.show()

In [27]:
# Step 4. Experiment with models
# 4.1 Split features and response
# Features (independent variables) are every column except 'target'; the
# response (dependent variable) is the 'target' column. Indexes are reset so
# that each frame/series starts from 0.
X_trai, y_trai = (
    df_trai.drop(columns='target').reset_index(drop=True),
    df_trai['target'].reset_index(drop=True),
)
X_tes, y_tes = (
    df_tes.drop(columns='target').reset_index(drop=True),
    df_tes['target'].reset_index(drop=True),
)

In [28]:
# Fit a linear regression on all training features, then predict the test rows
reg_mlin = LinearRegression().fit(X_trai, y_trai)
y_mpred = reg_mlin.predict(X_tes)

In [None]:
# Error metrics of the multi-feature regression on the test set
mse_multi = mean_squared_error(y_tes, y_mpred)
print(f"MSE: {mse_multi}")  # mean squared error
print(f"RMSE: {np.sqrt(mse_multi)}")  # square root of the MSE
print(f"MAE: {mean_absolute_error(y_tes, y_mpred)}")  # mean absolute error
print(f"R2: {r2_score(y_tes, y_mpred)}")

# R2 is the proportion of the variance of the actual values that the
# predictions explain

In [None]:
# Explain model
# Permutation importance on the training set: each feature is shuffled
# repeatedly and the resulting drop in score is recorded. `random_state` is
# fixed so the shuffles, and therefore the plot, are reproducible.
importance = permutation_importance(reg_mlin, X_trai, y_trai, random_state=2024)
# `importances` has one row per feature; transpose to get one column per feature
importance_df = pd.DataFrame(importance.importances.T, columns= X_trai.columns)
ax = importance_df.plot.box(vert = False)
ax.axvline(x= 0, color= 'k', linestyle= '--')  # zero = no measurable importance
ax.set_title('Permutation importance (train set)')
ax.grid(True)
plt.show()

In [None]:
# Model
# Print the fitted equation. Each term pairs a coefficient with the name of the
# feature column it was fitted on, so the printout cannot drift from the
# column order of X_trai. `+.1f` gives one decimal (like the intercept) and an
# explicit sign, so negative coefficients are not rendered as "+-x".
model_terms = " ".join(
    f"{coef:+.1f}*{feature}"
    for feature, coef in zip(X_trai.columns, reg_mlin.coef_)
)
print(f"mort = {reg_mlin.intercept_:.1f} {model_terms}")