In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
import category_encoders as ce

In [2]:
def delete_regis_vo(df):
    """
    Funcion que elimina los registros que tengan la vo nula
    """
    df = df.dropna(subset=['RainTomorrow'])
    return df

In [3]:
def binary_encoder(df):
    """
    Funcion que codifica la columna RainToday en una codificacion binaria
    """
    encoder = ce.BinaryEncoder(cols=['RainToday'])
    df_decode = encoder.fit_transform(df)
    for col in encoder.get_feature_names_out():
        df.loc[:, col] = df_decode.loc[:, col]
    df.drop(columns= 'RainToday', inplace=True)
    return df

In [4]:
def outliers_replace(df):
    """
    Este metodo busca los outliers de las columnas numericas con el metodo del rango intercurtilico, en caso de que algun valor
    sea detectado como un outlier se reemplaza este valor por el percentil 25 o 75 segun sea el caso.
    """
    for columna in [var for var in df.columns if df[var].dtype!='O']:
        IQR = df[columna].quantile(0.75) - df[columna].quantile(0.25)
        rango_inferior = df[columna].quantile(0.25) - (IQR * 1.5)
        rango_superior = df[columna].quantile(0.75) + (IQR * 1.5)
        df.loc[df[columna] > rango_superior, columna] = rango_superior
        df.loc[df[columna] < rango_inferior, columna] = rango_inferior
    return df

In [5]:
def one_hot_encoder(df):
    """
    Funcion que permite crear las varibles dummies en codificacion onehot
    """
    ohe = OneHotEncoder()
    cat = [columna for columna in df.columns if columna not in ['RainToday', 'RainTomorrow'] and df[columna].dtype == 'O']
    num = [var for var in df.columns if df[var].dtype!='O']
    binary = ['RainToday', 'RainTomorrow']
    features = ohe.fit_transform(df[cat]).toarray()
    features_label = ohe.categories_
    labels = []
    for i in range(len(features_label)):
        label_list = np.array(features_label[i]).tolist()
        label_name = ["dummie_" + str(cat[i]) + "_" + str(elemento) for elemento in label_list]
        labels.extend(label_name)
    dummies = pd.DataFrame(features, columns = labels)
    df = pd.concat([df[num],df[binary],dummies], axis=1)
    df_sin_nans = df.copy().filter(regex='_nan$', axis=1).columns
    df.drop(columns=df_sin_nans, inplace=True)
    df.drop_duplicates(inplace=True)
    return df

In [6]:
def vo_encoder(df):
    """
    Funcion que codifica la variable objetivo
    """
    df['RainTomorrow'] = df['RainTomorrow'].replace({'Yes': 1, 'No': 0})
    columna_extraida = df.pop('RainTomorrow')
    df['RainTomorrow'] = columna_extraida
    return df

In [7]:
def imputer_vars(df):
    num = [var for var in df.columns if df[var].dtype!='O']
    df[num] = df[num].fillna(df[num].median())
    return df

In [8]:
data = pd.read_csv(r"D:\Prueba_ML\weatherAUS.csv")
data['Date'] = pd.to_datetime(data['Date'])
data = data.assign(month=data['Date'].dt.month, year=data['Date'].dt.year, day=data['Date'].dt.day).drop('Date', axis=1)

In [9]:
custom_transformer = FunctionTransformer(delete_regis_vo)
custom_transformer2 = FunctionTransformer(outliers_replace)
custom_transformer3 = FunctionTransformer(one_hot_encoder)
custom_transformer4 = FunctionTransformer(binary_encoder)
custom_transformer5 = FunctionTransformer(vo_encoder)
custom_transformer6 = FunctionTransformer(imputer_vars)
#imputer = SimpleImputer(strategy='median')

preprocesing = Pipeline(steps=[
    ("delete", custom_transformer),
    ("outliers", custom_transformer2),
    ("oneHot", custom_transformer3),
    ("binary", custom_transformer4),
    ("vo", custom_transformer5),
    ("imputer", custom_transformer6)
    #("imputer", imputer)
])
#pipeline = make_pipeline(preprocessor)
preprocesing.fit(data)
data_out = preprocesing.transform(data)
print(data_out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


        MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
0          13.4     22.9       0.6          4.8       8.5           44.0   
1           7.4     25.1       0.0          4.8       8.5           44.0   
2          12.9     25.7       0.0          4.8       8.5           46.0   
3           9.2     28.0       0.0          4.8       8.5           24.0   
4          17.5     32.3       1.0          4.8       8.5           41.0   
...         ...      ...       ...          ...       ...            ...   
145454      3.5     21.8       0.0          4.8       8.5           31.0   
145455      2.8     23.4       0.0          4.8       8.5           31.0   
145456      3.6     25.3       0.0          4.8       8.5           22.0   
145457      5.4     26.9       0.0          4.8       8.5           37.0   
145458      7.8     27.0       0.0          4.8       8.5           28.0   

        WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  \
0               20

In [10]:
class Controls():
    def __init__(self, df):
        self.df = df
        print(self.__validate_duplicate())
        print(self.__null_values())
        print(self.__num_columns())
        print(self.__otuliers())
        print(self.__cardinalidad())

    def __validate_duplicate(self):
        registros_duplicados = self.df.duplicated().sum()
        return "la cantidad de registros duplicados es {}".format(registros_duplicados)
    def __null_values(self):
        cadena = ""
        porc_nulidad = 0
        for columna in self.df.columns:
            suma_nulos_columna = self.df[columna].isnull().sum()
            porcentaje_nulos_columna = (suma_nulos_columna / len(self.df)) * 100
            porc_nulidad = porcentaje_nulos_columna + porc_nulidad
            cadena = "Porcentaje de valores nulos en la columna {} es {} \n".format(columna, porcentaje_nulos_columna) + cadena
        if porc_nulidad == 0:
            text = "la nulidad es 0"
        else:
            text = cadena + "\ncontrol fallido"
        return text
    def __num_columns(self):
        column = self.df.shape[1]
        return "El numero de columnas del dataframe es {}".format(column)
    def __otuliers(self):
        text = ""
        for column in self.df.columns:
            if(self.df[column].max() == 1 or self.df[column].min() == 0):
                pass
            else:
                IQR = self.df[column].quantile(0.75) - self.df[column].quantile(0.25)
                rango_inferior = self.df[column].quantile(0.25) - (IQR * 1.5)
                rango_superior = self.df[column].quantile(0.75) + (IQR * 1.5)
                if self.df[column].max() > rango_superior or self.df[column].min() < rango_inferior:
                    text = "El limite superior para la variable {} es {} el inferior es {} se presentan outliers, valor mayor {}, valor menor {} \n".format(column, rango_superior, rango_inferior, self.df[column].max(), self.df[column].max() ) + text
        if text == "":
            return "No hay outliers"
        else:
            return text
    def __cardinalidad(self):
        print("Cardinalidad por vaiable")
        text = ""
        for columna in self.df.columns: #cardinalidad de variables
            text = 'columna {}: {} \n'.format(columna, self.df[columna].nunique()) + text
        return text

In [11]:
Controls(data_out)

la cantidad de registros duplicados es 0
la nulidad es 0
El numero de columnas del dataframe es 119
El limite superior para la variable Temp3pm es 40.3 el inferior es 2.700000000000003 se presentan outliers, valor mayor 41.099999999999994, valor menor 41.099999999999994 
El limite superior para la variable Temp9am es 34.89999999999999 el inferior es -1.099999999999996 se presentan outliers, valor mayor 35.550000000000004, valor menor 35.550000000000004 
El limite superior para la variable Pressure3pm es 1031.6 el inferior es 998.8000000000002 se presentan outliers, valor mayor 1034.4, valor menor 1034.4 
El limite superior para la variable Pressure9am es 1033.8500000000001 el inferior es 1001.45 se presentan outliers, valor mayor 1036.65, valor menor 1036.65 
El limite superior para la variable WindGustSpeed es 68.5 el inferior es 8.5 se presentan outliers, valor mayor 73.5, valor menor 73.5 
El limite superior para la variable MaxTemp es 43.25 el inferior es 2.849999999999998 se prese

<__main__.Controls at 0x2c6bacf3a90>

Model, la metrica que usaremos sera la precision de la predicion

In [12]:
#Separacion de variable objetivo
x = data_out.drop(['RainTomorrow'], axis=1)
y = data_out['RainTomorrow']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import time
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
import pickle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [17]:
#escalado de datos
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [18]:
models = [
    ('RandomForest', RandomForestClassifier()),
    ('logreg', LogisticRegression())
]
for model_name, model in models:
    print(f"Entrenando {model_name}")
    start_time = time.time()
    model.fit(x_train, y_train)
    end_time = time.time()
    training_time = end_time - start_time
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Precisión en el conjunto de prueba para {model_name}: {accuracy:.4f}, tiempo de ejecucion: {training_time}\n")

Entrenando RandomForest
Precisión en el conjunto de prueba para RandomForest: 0.8592, tiempo de ejecucion: 57.61011528968811

Entrenando logreg
Precisión en el conjunto de prueba para logreg: 0.8529, tiempo de ejecucion: 4.378999948501587



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Nos quedaremos con la regresion, ambos algoritmos tienen una presicion similar. Ahora implemetaremos una malla de hiperparametros

In [89]:
param_grid = {
    'C': [1, 10, 100, 500],  # Parámetro de regularización
    'random_state': [0],
    'penalty': ['l2'],
    'max_iter': [500],
    'solver': ['liblinear','lbfgs'],     # Método de solución
}
logreg = LogisticRegression()
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=3)
grid_search.fit(x_train, y_train)
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor puntaje de precisión:", grid_search.best_score_)
report = classification_report(y_train, y_pred_train)

Mejores hiperparámetros: {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'liblinear'}
Mejor puntaje de precisión: 0.8479762035860564


In [90]:
results = grid_search.cv_results_
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print("Parámetros:", params)
    print("Puntaje medio:", mean_score)
    print()

Parámetros: {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'liblinear'}
Puntaje medio: 0.8479762035860564

Parámetros: {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Puntaje medio: 0.8477860542638175

Parámetros: {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'liblinear'}
Puntaje medio: 0.8479762022414055

Parámetros: {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Puntaje medio: 0.8479675590496476

Parámetros: {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'liblinear'}
Puntaje medio: 0.847881128028503

Parámetros: {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Puntaje medio: 0.8478811284767199

Parámetros: {'C': 500, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solver': 'liblinear'}
Puntaje medio: 0.847881128028503

Parámetros: {'C': 500, 'max_iter': 500, 'penalty': 'l2', 'random_state': 0, 'solve

Al comparar los resultados de la malla de hiperparametros, se evidencia que el modelo se desempeña mejor con los hiperparametros por default solo se aumentara el numero de iteraciones

In [92]:
logreg = LogisticRegression(max_iter = 500, C = 1, solver = 'liblinear', random_state = 0)

In [93]:
logreg.fit(x_train, y_train)

LogisticRegression(C=1, max_iter=500, random_state=0, solver='liblinear')

In [94]:
y_pred_test = logreg.predict(x_test)
y_pred_test

array([0., 0., 0., ..., 0., 0., 0.])

In [95]:
y_pred_train = logreg.predict(x_train)
y_pred_train

array([0., 0., 0., ..., 0., 0., 0.])

In [106]:
precision_test = accuracy_score(y_test, y_pred_test)
recall_test = accuracy_score(y_test[y_test == 1], y_pred_test[y_test == 1])
precision_train = accuracy_score(y_train, y_pred_train)
recall_train = accuracy_score(y_train[y_train == 1], y_pred_train[y_train == 1])

# Calcular AUC
auc_test = roc_auc_score(y_test, y_pred_test)
auc_train = roc_auc_score(y_train, y_pred_train)

# Crear el DataFrame
df = pd.DataFrame({
    'Metric': ['Precision model_test', 'Recall_test', 'Precision model_train', 'Recall_train', 'Auc_test', 'Auc_train'],
    'Value': [precision_test, recall_test, precision_train, recall_train, auc_test, auc_train]
})

df.to_excel("metrics_model.xlsx", index = False)

El modelo no se ve con overfiting

In [72]:
from sklearn.metrics import classification_report
report = classification_report(y_train, y_pred_train)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     90128
         1.0       0.73      0.50      0.59     25571

    accuracy                           0.85    115699
   macro avg       0.80      0.72      0.75    115699
weighted avg       0.84      0.85      0.84    115699



In [82]:
cm = confusion_matrix(y_test, y_pred_test)
print(cm)

[[21457  1162]
 [ 3094  3212]]


 #Conclusiones
 . El modelo presenta una buena capacidad de predecir los 0 pero no tanto para los 1.
 . El modelo tiene una roc aceptablemente buena.
 . La buenas metricas de precision son gracias a la buena capacidad de predecir los ceros
 . Se puede tratar de mover el umbral de decision si lo que se quiere es predecir mas 1 que 0.

In [100]:
import pickle
with open('RainModel.pkl', 'wb') as f:
    pickle.dump(logreg, f)