# PREPROCESAMIENTO
En este notebook desarrollamos dos bloques de preprocesamiento aplicados a un mismo DataFrame.
Los resultados de ambos se vuelcan luego en la etapa de evaluación de modelos.

In [11]:
import requests
from io import StringIO
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Download the dataset from Google Drive.
file_id = "16ypxCIBr9wSGVEaXqWdZUfz9w4xzccwo"
download_link = f"https://drive.google.com/uc?id={file_id}"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(download_link, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the CSV parser.
response.raise_for_status()
# Decode the body as UTF-8 explicitly: the `encoding` argument of read_csv has
# no effect on text that is already decoded, so the choice must be made here.
response.encoding = 'utf-8'
csv_data = StringIO(response.text)
# Parse the downloaded text into a DataFrame.
df = pd.read_csv(csv_data, encoding='utf-8')

In [13]:
# Inspect column names, non-null counts and dtypes of the raw DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [14]:
# Make two independent copies of the raw data, one per preprocessing variant.
df1 = df.copy()
df2 = df.copy()

## 01-Preprocesamiento
En este bloque desarrollamos la PRIMERA opción de preprocesamiento, que abarca las siguientes tareas:
- Transformación de tipos de dato.
- Eliminación de la variable "State".
- Eliminación de outliers.
- Normalización.



In [15]:
# 01 - Cast the boolean target column to int.
df1['Churn'] = df1['Churn'].astype(int)


# 02 - Encode the Yes/No plan columns as 1/0.
yes_no_to_int = {'Yes': 1, 'No': 0}
for plan_col in ('International plan', 'Voice mail plan'):
    df1[plan_col] = df1[plan_col].map(yes_no_to_int)


# 03 - Drop the categorical "State" column.
df1 = df1.drop(columns='State')


# 04-Eliminacion de outliers.
def eliminar_filas_con_outliers(dataframe, columns):
    """Return a copy of ``dataframe`` without the rows flagged as IQR outliers.

    The columns are processed one after another. For each one, a row is kept
    only if its value lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``
    (both ends inclusive). The quartiles of each column are computed on the
    rows that survived the previous columns, so the result depends on the
    order of ``columns``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Names of the columns to screen, in processing order.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels.
    """
    filtered = dataframe.copy()
    for name in columns:
        values = filtered[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        # Series.between is inclusive on both ends by default.
        within_fences = values.between(
            first_quartile - 1.5 * spread,
            third_quartile + 1.5 * spread,
        )
        filtered = filtered[within_fences]
    return filtered
# Columns screened for outliers; the filter handles them one by one, in this order.
columnas_outliers = [
    'Account length', 'Area code', 'Number vmail messages',
    'Total day minutes', 'Total day calls', 'Total day charge',
    'Total eve minutes', 'Total eve calls', 'Total eve charge',
    'Total night minutes', 'Total night calls', 'Total night charge',
    'Total intl minutes', 'Total intl calls', 'Total intl charge',
    'Customer service calls',
]
df1 = eliminar_filas_con_outliers(df1, columnas_outliers)


# 05-Normalizacion.
def normalizar_variables(dataframe, columns):
    """Return a copy of ``dataframe`` with ``columns`` rescaled by MinMaxScaler.

    A new ``MinMaxScaler`` with default settings is fitted on the selected
    columns and its output overwrites them in the copy. Every other column
    is left as it is.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    columns : list of str
        Names of the columns to rescale.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the selected columns rescaled.
    """
    scaled = dataframe.copy()
    scaled[columns] = MinMaxScaler().fit_transform(scaled[columns])
    return scaled
# Columns to rescale with min-max normalization.
columnas_a_escalar = [
    'Account length', 'Area code', 'Number vmail messages',
    'Total day minutes', 'Total day calls', 'Total day charge',
    'Total eve minutes', 'Total eve calls', 'Total eve charge',
    'Total night minutes', 'Total night calls', 'Total night charge',
    'Total intl minutes', 'Total intl calls', 'Total intl charge',
    'Customer service calls',
]
df1 = normalizar_variables(df1, columnas_a_escalar)


In [16]:
# Check the normalization through summary statistics (a table, not a plot):
# the rescaled columns should show min 0 and max 1.
df1.describe()

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
count,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0
mean,0.485858,0.290748,0.098975,0.27597,0.171474,0.49968,0.508048,0.49966,0.50012,0.490129,0.50011,0.50413,0.499426,0.504053,0.499494,0.365631,0.499895,0.435577,0.109675
std,0.19162,0.418435,0.298694,0.447101,0.290138,0.184155,0.182841,0.184149,0.181524,0.184322,0.181588,0.182659,0.180854,0.182736,0.184738,0.227904,0.184713,0.325584,0.312553
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.352941,0.0,0.0,0.0,0.0,0.372322,0.380952,0.372314,0.375,0.367925,0.374892,0.377509,0.375,0.377273,0.371429,0.222222,0.373016,0.333333,0.0
50%,0.485294,0.068627,0.0,0.0,0.0,0.499824,0.504762,0.499793,0.500733,0.490566,0.500863,0.501859,0.509615,0.501653,0.5,0.333333,0.5,0.333333,0.0
75%,0.617647,1.0,0.0,1.0,0.404255,0.626273,0.628571,0.62624,0.624633,0.613208,0.624676,0.635316,0.625,0.635537,0.628571,0.444444,0.62963,0.666667,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Save the first preprocessed dataset to CSV, without the index column.
df1.to_csv("Prepro01.csv", index=False)

## 02-Preprocesamiento
En este bloque desarrollamos la SEGUNDA opción de preprocesamiento, que abarca las siguientes tareas:
- Transformación de tipos de dato.
- Creación de variables dummy correspondientes a "State".
- Eliminación de outliers.
- Normalización.


In [18]:
# 01 - Cast the boolean target column to int.
df2['Churn'] = df2['Churn'].astype(int)


# 02 - Encode the Yes/No plan columns as 1/0.
df2['International plan'] = df2['International plan'].map({'Yes': 1, 'No': 0})
df2['Voice mail plan'] = df2['Voice mail plan'].map({'Yes': 1, 'No': 0})


# 03 - One-hot encode "State" (first level dropped as the reference category).
# The column is named explicitly so no other column can be encoded by accident,
# and dtype=int keeps the dummies as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce bool columns, which describe() leaves
# out and to_csv writes as True/False).
df2 = pd.get_dummies(df2, columns=['State'], drop_first=True, dtype=int)


# 04 - Outlier removal.
# eliminar_filas_con_outliers is defined once, in the 01-Preprocesamiento
# block, and is reused here rather than redefined with an identical body.
df2 = eliminar_filas_con_outliers(df2, [
    'Account length', 'Area code', 'Number vmail messages',
    'Total day minutes', 'Total day calls', 'Total day charge',
    'Total eve minutes', 'Total eve calls', 'Total eve charge',
    'Total night minutes', 'Total night calls', 'Total night charge',
    'Total intl minutes', 'Total intl calls', 'Total intl charge',
    'Customer service calls',
])


# 05 - Normalization.
# normalizar_variables is defined once, in the 01-Preprocesamiento block, and
# is reused here rather than redefined with an identical body.
df2 = normalizar_variables(df2, [
    'Account length', 'Area code', 'Number vmail messages',
    'Total day minutes', 'Total day calls', 'Total day charge',
    'Total eve minutes', 'Total eve calls', 'Total eve charge',
    'Total night minutes', 'Total night calls', 'Total night charge',
    'Total intl minutes', 'Total intl calls', 'Total intl charge',
    'Customer service calls',
])

In [19]:
# Summary statistics of the second preprocessed dataset.
df2.describe()

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
count,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,...,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0,2243.0
mean,0.485858,0.290748,0.098975,0.27597,0.171474,0.49968,0.508048,0.49966,0.50012,0.490129,...,0.019171,0.016942,0.020508,0.023183,0.024521,0.020508,0.018725,0.024075,0.033437,0.022737
std,0.19162,0.418435,0.298694,0.447101,0.290138,0.184155,0.182841,0.184149,0.181524,0.184322,...,0.137155,0.129081,0.141763,0.150519,0.154694,0.141763,0.135582,0.153316,0.179816,0.149098
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.352941,0.0,0.0,0.0,0.0,0.372322,0.380952,0.372314,0.375,0.367925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.485294,0.068627,0.0,0.0,0.0,0.499824,0.504762,0.499793,0.500733,0.490566,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.617647,1.0,0.0,1.0,0.404255,0.626273,0.628571,0.62624,0.624633,0.613208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Save the second preprocessed dataset to CSV, without the index column.
df2.to_csv("Prepro02.csv", index=False)