In [32]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
df=pd.read_csv(r'https://raw.githubusercontent.com/Jorge2018/RepositorioArchivos/main/insurance(1).csv')

In [33]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


**Exploracion de datos**

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


**Separacion de df (num y cat) para luego resumir las cols y el tipo de dato**

In [35]:
######################## Crea df numerico y categorico
df_num=df.select_dtypes(include=['float64', 'int64'])
df_cat=df.select_dtypes(exclude=['float64', 'int64'])
columns = list(df_num.columns)
for idx, column in enumerate(columns):
  print(f'la columna {column}, es numerica')
columns = list(df_cat.columns)
for idx, column in enumerate(columns):
  print(f'la columna {column}, es categorica')

la columna age, es numerica
la columna bmi, es numerica
la columna children, es numerica
la columna charges, es numerica
la columna sex, es categorica
la columna smoker, es categorica
la columna region, es categorica


**Manejo de nan con SimpleImputer en los datos num, reemplaza con la mediana, escalamiento de valores numericos (StandarScaler)**

In [36]:
######################## manejo de valores nulos y escalamiento de valores num
imputer = SimpleImputer(strategy="median")
imputer.fit(df_num)
print(imputer.statistics_) # valores de mediana por cada columna numérica
df_tr = pd.DataFrame(imputer.transform(df_num), columns=df_num.columns, index=df_num.index)
#df_tr.info() # Valores nulos reemplazados

############################## escalamiento de valores num
scaler = StandardScaler() # Creación de objeto StandardScaler
scaler.fit(df_tr) # Cálculo de parámetros de escalamiento
price_scaled = scaler.transform(df_tr) # Transformación de los datos
print('Media de los datos', scaler.mean_)
print('Varianza de los datos', scaler.var_)

[3.900000e+01 3.040000e+01 1.000000e+00 9.382033e+03]
Media de los datos [3.92070254e+01 3.06633969e+01 1.09491779e+00 1.32704223e+04]
Varianza de los datos [1.97253852e+02 3.71600900e+01 1.45212664e+00 1.46542766e+08]


**Tratamiento de datos nominales con OneHotEncoder**

In [26]:
###################################### codificacion one-hot de las características nominales
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
results = one_hot_encoder.fit_transform(df_cat)
df_encoded = pd.DataFrame.sparse.from_spmatrix(results)
df_encoded.columns = one_hot_encoder.get_feature_names(df_cat.columns)




**Actualizacion (creacion) del df con los valores ya convertidos**

In [27]:
#Concatenar todas las características de vuelta a un DataFrame.

df = pd.concat([df_tr, df_encoded], axis=1)

**Define caracteristicas X y**

In [28]:
# Definir las características (X) y el objetivo (y)
X = df.drop(["charges"], axis=1)
y = df["charges"]

**Realiza train test split y muestra los datos de entrenamiento y test**

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.info()) # Set de características de datos de training
print('______________________________________________________________')
print('')
print('')
print(X_test.info()) # Set de características de datos de test

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 560 to 1126
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype             
---  ------            --------------  -----             
 0   age               1070 non-null   float64           
 1   bmi               1070 non-null   float64           
 2   children          1070 non-null   float64           
 3   sex_female        1070 non-null   Sparse[float64, 0]
 4   sex_male          1070 non-null   Sparse[float64, 0]
 5   smoker_no         1070 non-null   Sparse[float64, 0]
 6   smoker_yes        1070 non-null   Sparse[float64, 0]
 7   region_northeast  1070 non-null   Sparse[float64, 0]
 8   region_northwest  1070 non-null   Sparse[float64, 0]
 9   region_southeast  1070 non-null   Sparse[float64, 0]
 10  region_southwest  1070 non-null   Sparse[float64, 0]
dtypes: Sparse[float64, 0](8), float64(3)
memory usage: 71.1 KB
None
______________________________________________________________


<cl

**Instancia y ajusta el modelo de regresion lineal a los datos de entrenamiento**

In [30]:
# Make a linear regression instance
reg = LinearRegression()
reg.fit(X_train,y_train)

  "pandas.DataFrame with sparse columns found."


LinearRegression()

**Codigo para obtener el R cuadrado (R^2), despues de realizado el ajuste sobre los datos de entrenamiento**

In [31]:
train_score = reg.score(X_train, y_train)
print(train_score)

0.7417255854683333


  "pandas.DataFrame with sparse columns found."
