# 0. Import libraries

In [None]:
#Essentials
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import json
from pickle import dump
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.metrics import accuracy_score

#Model
#??????

#Extras
import warnings
def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used below to mute ``warnings.warn``."""
    return None


# Route every warnings.warn(...) call to the no-op above so cell output stays clean.
warnings.warn = warn
# Additionally register an explicit ignore filter for FutureWarning.
warnings.filterwarnings(action="ignore", category=FutureWarning)
# None removes the limit on the number of columns pandas displays.
pd.set_option("display.max_columns", None)

---

---

# 1. Problem statement and data collection

## 1.1 Description of the problem

## 1.2 Data collection

In [None]:
# Source location of the dataset (URL or local path) -- TODO: fill in before running.
path = ""

# Single location for the raw copy, so the file written below is the same file
# that is read back (previously the write and read paths did not match).
raw_path = "../data/raw/data_name.csv"

# Read the semicolon-separated source and keep an untouched raw copy on disk.
df_download = pd.read_csv(path, sep=";")
df_download.to_csv(raw_path, index=False)

# Work from the saved raw copy from here on.
df = pd.read_csv(raw_path)
df.head()

## 1.3 Understanding the features

In [None]:
#Columns
df.columns

## 1.4 Data exploration

In [None]:
print(f'Our dataframe contains {len(df)} rows and it has {df.shape[1]} features.')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Count missing values per column once, then show only the columns that have any.
missing_per_column = df.isna().sum()
missing_per_column[missing_per_column > 0]

***Conclusions:***

## 1.5 Eliminating irrelevant information

---

---

# 2. Univariate Analysis

## 2.1 Dividing our dataset into categorical and numerical features

In [2]:
df.dtypes.unique()

NameError: name 'df' is not defined

In [None]:
# dtype groups used to split the dataset into numerical and categorical parts
num = ["int64", "float64"]
cat = ["O"]

# Numerical df: columns whose dtype is listed in `num`
df_num = df.select_dtypes(include=num)
# Categorical df: columns whose dtype is listed in `cat` (object)
df_cat = df.select_dtypes(include=cat)

In [None]:
df_num.head(3)

In [None]:
df_cat.head(3)

## 2.2 Categorical variable analysis

In [None]:
df_cat.head(3)

In [None]:
# TEMPLATE CELL: every `??` is a placeholder that must be replaced before this cell can run.
# Grid of axes for the categorical count plots (rows, columns and figure size to be chosen).
fig, ax = plt.subplots(nrows=??,ncols=??,figsize=(??,??))

# One count plot per categorical feature; bars are ordered by the value counts of the column given to `order`.
sns.countplot(ax= ax[0,0], data=df_cat, x="??", order=df_cat["??"].value_counts().index, hue="??")
#... (repeat the call above for the remaining axes)

plt.tight_layout()
plt.show()

***Conclusions:***

## 2.3 Numerical variable analysis

In [None]:
df_num.head(3)

In [None]:
# TEMPLATE CELL: every `??` is a placeholder that must be replaced before this cell can run.
# NOTE(review): height_ratios lists 10 entries (alternating 6 and 1), so the grid presumably
# needs 10 rows -- a tall histogram row followed by a short boxplot row; confirm when filling in.
fig, ax = plt.subplots(??, ??, figsize=(10,12), gridspec_kw={"height_ratios":[6,1,6,1,6,1,6,1,6,1]})

#Row_1: histogram on the tall axis, boxplot on the short axis directly below it
# NOTE(review): df_num[df_num["??"]] indexes the frame with a column -- presumably a boolean filter; confirm.
sns.histplot(ax = ax[0,0], data=df_num[df_num["??"]], x="??")
sns.boxplot(ax = ax[1,0], data=df_num, x="??")
#... (repeat for the remaining features)

plt.tight_layout()
# Applied after tight_layout to set the vertical spacing between rows explicitly.
fig.subplots_adjust(hspace=1)
plt.show()

***Conclusions:***

## 2.4 Multivariate analysis

### 2.4.1 Analysis Categorical - Categorical

In [None]:
# TEMPLATE CELL: the "??" strings are placeholders for feature names / title settings.
fig, axis = plt.subplots(nrows=2, ncols=4, figsize=(16, 14))
fig.suptitle("Análisis categórico-categórico", fontsize=16)

# First panel: counts of one categorical feature, split by the TARGET column.
first_ax = axis[0, 0]
sns.countplot(ax=first_ax, data=df_cat, x="??", hue="TARGET")
first_ax.set(xlabel=None)
# Rotate the tick labels so long category names stay readable.
first_ax.set_xticklabels(first_ax.get_xticklabels(), rotation=90, fontsize=8)
#... (repeat for the remaining panels)

first_ax.set_title("??", fontsize=14, fontweight='??')
...

***Conclusions:***

### 2.4.2 Encoding Categorical Values and saving in JSON files

In [None]:
df_enc = df.copy()

In [None]:
# Build one integer encoding per categorical feature and save each mapping as JSON.
# Files named 'enc_<column>.json' hold the {category: code} dictionary of that feature.
for column in df_cat.columns:
    # Codes are assigned in order of first appearance in the column.
    unique_values = list(df_cat[column].unique())
    encoding = {value: code for code, value in enumerate(unique_values)}
    # Kept for backward compatibility: the mapping is also exposed as a global named '<column>_enc'.
    globals()[f"{column}_enc"] = encoding

    # The context manager closes (and flushes) the file, so it is complete when read back later.
    with open(f'../data/interim/enc_{column}.json', 'w') as enc_file:
        json.dump(encoding, enc_file)

In [None]:
# Replace the values of each categorical feature with the integer codes stored in its JSON file.
for column in df_cat.columns:
    # Load the saved mapping with a context manager so the file handle is closed right away.
    with open(f'../data/interim/enc_{column}.json') as enc_file:
        encoding = json.load(enc_file)
    df_enc[column] = df_enc[column].map(encoding)

In [None]:
df_enc.head()

### 2.4.3 Analysis Numerical - Numerical

In [None]:
# TEMPLATE CELL: every `??` is a placeholder that must be replaced before this cell can run.
fig, axis = plt.subplots(??, ??, figsize = (14, 10))

# Top axis: regression plot of one feature against the TARGET column.
sns.regplot(ax = axis[0, 0], data = df_enc, x = "??", y = "TARGET")
# Axis below: annotated correlation matrix of TARGET and that same feature (colour bar hidden).
sns.heatmap(df_enc[["TARGET", "??"]].corr(), annot = True, fmt = ".2f", ax = axis[1, 0], cbar = False)
#... (repeat the pair of plots for the remaining features)

plt.tight_layout()
plt.show()

### 2.4.4 Numerical - Categorical analysis (Correlational Analysis)

In [None]:
# Pairwise plot grid over the columns of the encoded dataframe.
sns.pairplot(data=df_enc)
# Saved before plt.show(); presumably this targets the figure pairplot just created
# (relies on pyplot's current-figure state) -- keep this statement order.
plt.savefig("Num_cat_corr.png")
plt.show()

In [None]:
# Compute the correlation matrix once and reuse it for both the values and the mask.
corr = df_enc.corr()
# Boolean mask hiding the upper triangle and the diagonal. Building it from ones (not from
# the correlation values) keeps cells hidden even when a correlation is exactly 0.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(14,10))
sns.heatmap(data=corr.round(2), annot=True, square=True, cmap="RdBu", mask=upper_mask)
plt.savefig("heat_map_corr.png")
plt.show()

***Conclusions:***

---

---

# 3. Feature Engineering

## 3.1 New feature inference

## 3.2 Outlier analysis

In [None]:

df_enc.describe().round(2)

In [None]:
# Names of the continuous (float64) features to inspect for outliers.
continuos_f = list(df_enc.select_dtypes('float64'))

# One subplot per continuous feature instead of a fixed 1x2 grid, so the cell works for
# any number of float columns. squeeze=False keeps `ax` two-dimensional even for one plot.
n_plots = max(len(continuos_f), 1)
fig, ax = plt.subplots(1, n_plots, figsize=(5 * n_plots, 4), squeeze=False)

for col, each in enumerate(continuos_f):
    sns.boxplot(ax=ax[0, col], data=df_enc, x=each)

plt.tight_layout()
plt.show()

In [None]:
df_enc_no = df_enc.copy()

In [None]:
def remove_outliers(x, feature_name, allow_neg=True):
    """Cap a numeric Series at its 1.5*IQR fences and save the fences as JSON.

    Values above ``q3 + 1.5*IQR`` are replaced by that upper limit and values
    below the lower limit are replaced by the lower limit; everything else
    (including missing values) is left untouched.

    Args:
        x: pandas Series with the values of one feature.
        feature_name: Name used in the output file
            ``../data/interim/outliers_lims_<feature_name>.json``.
        allow_neg: When False, the lower limit is floored at 0.

    Returns:
        A new Series with the capped values; ``x`` itself is not modified.
    """
    q1, q3 = x.quantile([0.25, 0.75])
    iqr = q3 - q1
    upper_lim = q3 + (iqr * 1.5)
    lower_lim = q1 - (iqr * 1.5)
    if not allow_neg:
        lower_lim = max(0, lower_lim)

    # Vectorised capping. Both conditions are evaluated on the original values and the
    # upper cap is applied last, so it takes precedence exactly like an
    # "if above upper / elif below lower" chain would.
    capped = x.mask(x < lower_lim, lower_lim).mask(x > upper_lim, upper_lim)

    filename = f'../data/interim/outliers_lims_{feature_name}.json'
    # Context manager guarantees the limits file is flushed and closed.
    with open(filename, 'w') as lims_file:
        json.dump({'upper_lim': upper_lim, 'lower_lim': lower_lim}, lims_file)

    return capped

In [None]:
# Partition the continuous features by whether they contain negative values, so the
# lower outlier limit is only floored at 0 for features that are never negative.
f_outliers_with_neg = []
f_outliers_no_neg = []

for each in continuos_f:
    if df_enc[each].min() < 0:
        f_outliers_with_neg.append(each)
    else:
        # The `else` matters: without it every feature would also be processed with
        # allow_neg=False below, clipping legitimate negative values to 0.
        f_outliers_no_neg.append(each)

# Features with negative values: plain IQR limits.
for feature in f_outliers_with_neg:
    df_enc_no[feature] = remove_outliers(df_enc_no[feature], feature)

# Non-negative features: lower limit floored at 0.
for feature in f_outliers_no_neg:
    df_enc_no[feature] = remove_outliers(df_enc_no[feature], feature, allow_neg=False)

## 3.3 Check missing values

## 3.4 Train/test split of both DataFrames

In [None]:
def split(target, df, test_size=0.2, random_state=123):
    """Separate the target column from the features and split both into train/test parts.

    Args:
        target: Name of the target column in ``df``.
        df: DataFrame holding the features and the target.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=target)
    labels = df[target]

    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    # Order of the parts: X_train, X_test, y_train, y_test.
    return tuple(parts)

In [None]:
# Split both variants with the default split parameters. The targets returned by the
# second call are discarded; y_train / y_test from the first call are used for both.
X_train_with_outliers, X_test_with_outliers, y_train, y_test = split('Outcome', df_enc)
X_train_without_outliers, X_test_without_outliers, _, _ = split('Outcome', df_enc_no)

# Every split is written to ../data/processed/<name>.csv without the index column.
splits_to_save = {
    'X_train_with_outliers': X_train_with_outliers,
    'X_test_with_outliers': X_test_with_outliers,
    'X_train_without_outliers': X_train_without_outliers,
    'X_test_without_outliers': X_test_without_outliers,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'../data/processed/{split_name}.csv', index=False)

In [None]:
X_test_with_outliers.head()

## 3.5 Normalization

In [None]:
def norm(X_train, X_test, reference: str):
    """Standardize train and test features with a StandardScaler fitted on the train set.

    The fitted scaler is pickled to ``../data/processed/normalized_<reference>.sav``.

    Args:
        X_train: Training features (DataFrame); the scaler is fitted on this only.
        X_test: Test features (DataFrame); transformed with the train-fitted scaler.
        reference: Suffix that identifies the dataset variant in the saved file name.

    Returns:
        Tuple ``(X_train_norm, X_test_norm)`` of DataFrames that keep the index and
        column labels of the corresponding inputs.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)

    # transform() output is re-wrapped so the original row index and column names are kept.
    X_train_norm = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test_norm = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'../data/processed/normalized_{reference}.sav', 'wb') as scaler_file:
        dump(scaler, scaler_file)

    return X_train_norm, X_test_norm

In [None]:
# Standardize both dataset variants; each call fits on its own train split and pickles its scaler
# under the given reference name.
X_train_with_outliers_norm, X_test_with_outliers_norm = norm(X_train_with_outliers, X_test_with_outliers, 'with_outliers')
X_train_without_outliers_norm, X_test_without_outliers_norm = norm(X_train_without_outliers, X_test_without_outliers, 'without_outliers')

## 3.6 Min-Max Scaling

In [None]:
def minmax(X_train, X_test, reference: str):
    """Scale train and test features with a MinMaxScaler fitted on the train set.

    The fitted scaler is pickled to ``../data/processed/minmax_<reference>.sav``.

    Args:
        X_train: Training features (DataFrame); the scaler is fitted on this only.
        X_test: Test features (DataFrame); transformed with the train-fitted scaler.
        reference: Suffix that identifies the dataset variant in the saved file name.

    Returns:
        Tuple ``(X_train_minmax, X_test_minmax)`` of DataFrames that keep the index and
        column labels of the corresponding inputs.
    """
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    # transform() output is re-wrapped so the original row index and column names are kept.
    X_train_minmax = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test_minmax = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'../data/processed/minmax_{reference}.sav', 'wb') as scaler_file:
        dump(scaler, scaler_file)

    return X_train_minmax, X_test_minmax

In [None]:
# Min-max scale both dataset variants; each call fits on its own train split and pickles its scaler
# under the given reference name.
X_train_with_outliers_minmax, X_test_with_outliers_minmax = minmax(X_train_with_outliers, X_test_with_outliers, 'with_outliers')
X_train_without_outliers_minmax, X_test_without_outliers_minmax = minmax(X_train_without_outliers, X_test_without_outliers, 'without_outliers')

## 3.7 Feature selection

### 3.7.1 Kselection

In [None]:
def kselection(X_train, X_test, y_train, k, reference: str):
    """Select the k best features with an ANOVA F-test (``f_classif``) fitted on the train set.

    The fitted selector is pickled to ``../data/processed/selection_model_<reference>.sav``.

    Args:
        X_train: Training features (DataFrame); the selector is fitted on this and ``y_train``.
        X_test: Test features (DataFrame); reduced to the same selected columns.
        y_train: Training target used to score the features.
        k: Value passed to ``SelectKBest`` as ``k``.
        reference: Suffix that identifies the dataset variant in the saved file name.

    Returns:
        Tuple ``(X_train_sel, X_test_sel)`` of DataFrames containing only the selected
        columns. Each keeps the row index of its input, consistent with ``norm`` and ``minmax``.
    """
    selection_model = SelectKBest(f_classif, k=k)
    selection_model.fit(X_train, y_train)
    # Boolean mask over the input columns: True for every selected feature.
    cols = selection_model.get_support()

    # Passing the original index keeps rows aligned with y_train / y_test and with the
    # scaled frames; without it the result would silently get a fresh 0..n-1 index.
    X_train_sel = pd.DataFrame(
        selection_model.transform(X_train),
        index=X_train.index,
        columns=X_train.columns.values[cols],
    )
    X_test_sel = pd.DataFrame(
        selection_model.transform(X_test),
        index=X_test.index,
        columns=X_test.columns.values[cols],
    )

    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'../data/processed/selection_model_{reference}.sav', 'wb') as selector_file:
        dump(selection_model, selector_file)

    return X_train_sel, X_test_sel

In [None]:
# Run feature selection on both dataset variants, scoring against the same y_train.
# NOTE(review): k='all' presumably keeps every feature, making this a pass-through until a
# numeric k is chosen -- confirm against the SelectKBest documentation.
X_train_with_outliers_sel, X_test_with_outliers_sel = kselection(X_train_with_outliers, X_test_with_outliers, y_train, 'all', 'with_outliers')
X_train_without_outliers_sel, X_test_without_outliers_sel = kselection(X_train_without_outliers, X_test_without_outliers, y_train, 'all', 'without_outliers')

### 3.7.2 Applying the columns filter (kselection) to the normalized and minmax scaled data frames

In [None]:
# Restrict every scaled frame to the columns kept by the feature selection of the SAME
# dataset variant (with / without outliers). The two selections can differ once k != 'all',
# so the without-outliers frames must use the without-outliers column set.

# Normalized (StandardScaler) frames
X_train_with_outliers_norm_sel = X_train_with_outliers_norm[X_train_with_outliers_sel.columns]
X_train_without_outliers_norm_sel = X_train_without_outliers_norm[X_train_without_outliers_sel.columns]

X_test_with_outliers_norm_sel = X_test_with_outliers_norm[X_test_with_outliers_sel.columns]
X_test_without_outliers_norm_sel = X_test_without_outliers_norm[X_test_without_outliers_sel.columns]

# Min-max scaled frames
X_train_with_outliers_minmax_sel = X_train_with_outliers_minmax[X_train_with_outliers_sel.columns]
X_train_without_outliers_minmax_sel = X_train_without_outliers_minmax[X_train_without_outliers_sel.columns]

X_test_with_outliers_minmax_sel = X_test_with_outliers_minmax[X_test_with_outliers_sel.columns]
X_test_without_outliers_minmax_sel = X_test_without_outliers_minmax[X_test_without_outliers_sel.columns]

### 3.7.3 Saving the CSV files

In [None]:
# Train frames keyed by the name used for their CSV file and in the model comparison.
dfs_train = {
    'X_train_with_outliers_sel': X_train_with_outliers_sel,
    'X_train_without_outliers_sel': X_train_without_outliers_sel,
    'X_train_with_outliers_norm_sel': X_train_with_outliers_norm_sel,
    'X_train_without_outliers_norm_sel': X_train_without_outliers_norm_sel,
    'X_train_with_outliers_minmax_sel': X_train_with_outliers_minmax_sel,
    'X_train_without_outliers_minmax_sel': X_train_without_outliers_minmax_sel,
}

# Test frames, listed in the same order as their train counterparts above.
dfs_test = {
    'X_test_with_outliers_sel': X_test_with_outliers_sel,
    'X_test_without_outliers_sel': X_test_without_outliers_sel,
    'X_test_with_outliers_norm_sel': X_test_with_outliers_norm_sel,
    'X_test_without_outliers_norm_sel': X_test_without_outliers_norm_sel,
    'X_test_with_outliers_minmax_sel': X_test_with_outliers_minmax_sel,
    'X_test_without_outliers_minmax_sel': X_test_without_outliers_minmax_sel,
}

# Write every frame to ../data/processed/<name>.csv. The loop variable is deliberately
# not called `df`, so the raw dataset bound to `df` is not overwritten.
for name, frame in {**dfs_train, **dfs_test}.items():
    frame.to_csv(f"../data/processed/{name}.csv", index=False)

---

---

# 4. Machine Learning

In [None]:
# Positional lists of the train / test frames, in the insertion order of the dictionaries,
# so train[i] and test[i] refer to the same dataset variant. Built directly from the
# values so the global `df` (the raw dataset) is not overwritten by a loop variable.
train = list(dfs_train.values())
test = list(dfs_test.values())

## 4.1 {Model}

In [None]:
# Fit one model per dataset variant and record its accuracy on the matching test frame.
results = []
df_names = list(dfs_train.keys())

for index, train_df in enumerate(train):
    # TODO: replace the placeholder string with an estimator instance.
    model = "????????"
    model.fit(train_df, y_train)
    y_test_pred = model.predict(test[index])

    score = round(accuracy_score(y_test, y_test_pred), 4)
    results.append({'index': index, 'df_train': df_names[index], 'Accuracy Score': score})

# Best accuracy first; the top entry identifies the dataset variant to keep.
results.sort(key=lambda entry: entry['Accuracy Score'], reverse=True)
best_ind = results[0]['index']
best_df_train = results[0]['df_train']
results[0]

In [None]:
print(f'Our best dataframe for our model is {best_df_train}, with an Accuracy Score of {results[0]["Accuracy Score"]}')

## 4.2 Optimization

### 4.2.1 Grid search

Find the best hyperparameters

In [None]:
# TODO: replace the placeholders with the real parameter grid and estimator instance.
hyperparams = {"?????": "?????"}
model = "??????"

# Train frame of the best-scoring dataset variant found in the model comparison.
best_train_df = dfs_train.get(best_df_train)

model.fit(best_train_df, y_train)
# 3-fold cross-validated search over `hyperparams`, scored by accuracy.
grid = GridSearchCV(estimator=model, param_grid=hyperparams, scoring="accuracy", cv=3)
grid.fit(best_train_df, y_train)
print(f'The best hyperparameters are: {grid.best_params_}')

Try the best hyperparameters

In [None]:
# TODO: replace the placeholder string with the estimator built from the best hyperparameters.
model_grid = "MODEL(HYPERPARAMETERS)"
model_grid.fit(dfs_train.get(best_df_train), y_train)
# best_ind is the position of the best dataset variant; the test frame at the same position is used.
y_pred = model_grid.predict(dfs_test.get(list(dfs_test)[best_ind]))
model_grid_accuracy = round(accuracy_score(y_test, y_pred),4)

# Difference to the best default-model score. It is formatted with a fixed number of
# decimals (multiplying a rounded float by 100 can print artifacts such as
# 77.92000000000002) and with an explicit sign, because tuning can also lower accuracy.
baseline_accuracy = results[0]["Accuracy Score"]
accuracy_delta = model_grid_accuracy - baseline_accuracy
print(
    f'The model accuracy with the hyperparameters is: {model_grid_accuracy * 100:.2f}%, '
    f'a change of {accuracy_delta * 100:+.2f} percentage points vs the default model'
)