# El tratamiento de las variables categóricas

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# Load the e-commerce expense data; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="../datasets/ecom-expense/Ecom Expense.csv")

FileNotFoundError: [Errno 2] File b'../datasets/ecom-expense/Ecom Expense.csv' does not exist: b'../datasets/ecom-expense/Ecom Expense.csv'

In [None]:
# Columns: transaction, age, items purchased, monthly income, transaction time,
# record, gender, economic tier, total spend.
df.head(5)

### Queremos predecir el gasto de un mes en particular en función del sexo (hombre o mujer) y del sector económico (City Tier)

In [None]:
# One indicator column per category; each generated column name starts with
# the given prefix.
dummy_gender = pd.get_dummies(data=df["Gender"], prefix="Gender")
dummy_city_tier = pd.get_dummies(data=df["City Tier"], prefix="City")

In [None]:
# First rows of the gender indicator columns.
dummy_gender.head(5)

In [None]:
# First rows of the city-tier indicator columns.
dummy_city_tier.head(5)

In [None]:
# Keep the current column labels as a plain list; later cells use it to
# select columns before joining the indicator frames.
column_names = df.columns.tolist()
column_names

In [None]:
# Attach the gender indicators (joined on the index) and refresh the column
# list so it includes the new columns.
df_new = df.loc[:, column_names].join(other=dummy_gender)
column_names = df_new.columns.tolist()
df_new.head(5)

In [None]:
# Attach the city-tier indicators the same way and refresh the column list.
df_new = df_new.loc[:, column_names].join(other=dummy_city_tier)
column_names = df_new.columns.tolist()
df_new.head(5)

In [None]:
# Predictors for the first model: numeric columns plus every indicator column
# produced for gender and city tier.
feature_cols = [
    "Monthly Income",
    "Transaction Time",
    "Gender_Female",
    "Gender_Male",
    "City_Tier 1",
    "City_Tier 2",
    "City_Tier 3",
    "Record",
]

In [None]:
# Design matrix and response for the regression.
X = df_new.loc[:, feature_cols]
Y = df_new.loc[:, "Total Spend"]

In [None]:
# Fit an ordinary linear regression of Y on X; the last expression shows the
# fitted estimator as the cell output.
lm = LinearRegression()
lm.fit(X=X, y=Y)

In [None]:
# Intercept first, then the coefficient array, one per line.
print(lm.intercept_, lm.coef_, sep="\n")

In [None]:
# Pair each predictor name with its fitted coefficient.
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

In [None]:
# Score of the fitted model on the same data it was trained on.
lm.score(X=X, y=Y)

El modelo puede ser escrito como:
    Total_Spend = -79.41713030137089 + 'Monthly Income' * 0.14753898049205746 + 'Transaction Time' * 0.1549461254958966 + 'Gender_Female' * -131.02501325554562 + 'Gender_Male' * 131.0250132555456 + 'City_Tier 1' * 76.76432601049542 + 'City_Tier 2' * 55.138974309232275 + 'City_Tier 3' * -131.90330031972783 + 'Record' * 772.2334457445639

In [None]:
# Store the model's fitted values next to the observed spend.
df_new["prediction"] = lm.predict(X=df_new.loc[:, feature_cols])
df_new.head(5)

In [None]:
# Residual standard error: sqrt(SSD / (n - p - 1)), with n rows and p predictors.
# The degrees-of-freedom term must be parenthesised; without the parentheses
# SSD is divided by n alone and p + 1 is then subtracted from the quotient.
SSD = sum((df_new["prediction"] - df_new["Total Spend"]) ** 2)
residual_dof = len(df_new) - len(feature_cols) - 1
RSE = np.sqrt(SSD / residual_dof)
# Express the RSE relative to the mean observed spend.
df_mean = np.mean(df_new["Total Spend"])
error = RSE / df_mean
print("Error del modelo: " + str(round(error*100,2)) + "%")

## Eliminar variables dummy

In [None]:
# Keep only the first indicator column for gender and the first two for city
# tier, dropping the last indicator of each variable.
dummy_gender = pd.get_dummies(data=df["Gender"], prefix="Gender").iloc[:, 0:1]
dummy_city_tier = pd.get_dummies(data=df["City Tier"], prefix="City").iloc[:, :2]
dummy_city_tier.head(5)

In [None]:
# Rebuild df_new from the original frame plus the reduced indicator sets.
df_new = df.join(dummy_gender).join(dummy_city_tier)
df_new.head(5)

In [None]:
# Predictors for the reduced model: one indicator fewer per categorical variable.
feature_cols = [
    "Monthly Income",
    "Transaction Time",
    "Gender_Female",
    "City_Tier 1",
    "City_Tier 2",
    "Record",
]
X = df_new.loc[:, feature_cols]
Y = df_new.loc[:, "Total Spend"]
lm = LinearRegression().fit(X=X, y=Y)

In [None]:
# Fitted values of the reduced model, stored alongside the observed spend.
df_new["Predict"] = lm.predict(X=df_new.loc[:, feature_cols])

In [None]:
# Residual standard error: sqrt(SSD / (n - p - 1)), with n rows and p predictors.
# The degrees-of-freedom term must be parenthesised; without the parentheses
# SSD is divided by n alone and p + 1 is then subtracted from the quotient.
SSD = sum((df_new["Predict"] - df_new["Total Spend"]) ** 2)
residual_dof = len(df_new) - len(feature_cols) - 1
RSE = np.sqrt(SSD / residual_dof)
# Express the RSE relative to the mean observed spend.
spend_mean = np.mean(df_new["Total Spend"])
error = RSE / spend_mean
print("Error del modelo: " + str(round(error*100,2)) + "%")

In [None]:
# Report the training-set score as a percentage rounded to two decimals.
print("".join(["El modelo explica en un ", str(round(lm.score(X, Y) * 100, 2)), "% a la variable real"]))

## Transformación de variables para conseguir una relación no lineal

In [None]:
import matplotlib.pyplot as plt
# Load the auto-mpg data; the path is relative to the notebook's working directory.
df_cars = pd.read_csv(filepath_or_buffer="../datasets/auto/auto-mpg.csv")
df_cars.head(5)

In [None]:
%matplotlib inline
df_cars["mpg"] = df_cars["mpg"].dropna()
df_cars["horsepower"] = df_cars["horsepower"].dropna()

plt.plot(df_cars["horsepower"], df_cars["mpg"], "ro")
plt.xlabel("Caballos de potencia")
plt.ylabel("Consumo (millas por galeon)")

## Modelo de regresión lineal
* mpg = a + b * horsepower

In [None]:
# Mean-impute missing values; X is kept as a one-column DataFrame because the
# estimator is fitted on a 2-D input.
X = df_cars["horsepower"].fillna(df_cars["horsepower"].mean()).to_frame()
Y = df_cars["mpg"].fillna(df_cars["mpg"].mean())

In [None]:
# Linear model: mpg = a + b * horsepower.
lm_car = LinearRegression().fit(X=X, y=Y)

In [None]:
%matplotlib inline
plt.plot(X,Y,"ro")
plt.plot(X, lm_car.predict(X), "blue")

In [None]:
# Report the training-set score as a percentage rounded to two decimals.
print("".join(["El modelo explica en un ", str(round(lm_car.score(X, Y) * 100, 2)), "% a la variable real"]))

In [None]:
# Residual standard error: sqrt(SSD / (n - p - 1)), where p is the number of
# predictor columns in X. The denominator must be parenthesised; otherwise
# SSD is divided by n and 1 is subtracted from the quotient.
SSD = sum((Y - lm_car.predict(X)) ** 2)
residual_dof = len(X) - X.shape[1] - 1
RSE = np.sqrt(SSD / residual_dof)
# Express the RSE relative to the mean of the response.
error = RSE / np.mean(Y)
print("Error del modelo: " + str(round(error*100,2)) + "%")

## Modelo de regresión cuadrático
* mpg = a + b * horsepower^2

In [None]:
X2 = X**2
lm_car_2 = LinearRegression().fit(X2,Y)
%matplotlib inline
plt.plot(X2,Y,"ro")
plt.plot(X2, lm_car_2.predict(X2), "blue")

In [None]:
print("El modelo explica en un " + str(round(lm_car_2.score(X2,Y)*100,2)) + "% a la variable real")
# Residual standard error: sqrt(SSD / (n - p - 1)), where p is the number of
# predictor columns in X2. The denominator must be parenthesised; otherwise
# SSD is divided by n and 1 is subtracted from the quotient.
SSD = sum((Y - lm_car_2.predict(X2)) ** 2)
residual_dof = len(X2) - X2.shape[1] - 1
RSE = np.sqrt(SSD / residual_dof)
# Express the RSE relative to the mean of the response.
error = RSE / np.mean(Y)
print("Error del modelo: " + str(round(error*100,2)) + "%")

## Modelo de regresión lineal y cuadrático
* mpg = a + b * horsepower + c * horsepower^2

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

In [None]:
# Degree-2 polynomial feature generator.
poly = PolynomialFeatures(2)

In [None]:
# Learn the feature layout from X, then expand X into its polynomial terms.
X_poly = poly.fit(X).transform(X)

In [None]:
# Linear regression on the polynomial terms.
lm_car_poly = linear_model.LinearRegression().fit(X=X_poly, y=Y)

In [None]:
%matplotlib inline
plt.plot(X_poly,Y,"ro")
plt.plot(X_poly, lm_car_poly.predict(X_poly), "blue")

In [None]:
print("El modelo explica en un " + str(round(lm_car_poly.score(X_poly,Y)*100,2)) + "% a la variable real")
# Residual standard error: sqrt(SSD / residual degrees of freedom). The
# denominator must be parenthesised; otherwise SSD is divided by n and 1 is
# subtracted from the quotient.
SSD = sum((Y - lm_car_poly.predict(X_poly)) ** 2)
# NOTE(review): assumes X_poly includes the constant column that
# PolynomialFeatures adds by default, so its column count already equals
# (number of predictors + 1) -- confirm if include_bias is ever changed.
residual_dof = len(X_poly) - X_poly.shape[1]
RSE = np.sqrt(SSD / residual_dof)
# Express the RSE relative to the mean of the response.
error = RSE / np.mean(Y)
print("Error del modelo: " + str(round(error*100,2)) + "%")

In [None]:
def regression_validation(lm, X, Y, d):
    """Print goodness-of-fit figures for a fitted polynomial regression.

    Prints the model's score as a percentage and the residual standard error
    (RSE) relative to the mean of ``Y``.

    Parameters
    ----------
    lm : fitted estimator exposing ``score(X, Y)`` and ``predict(X)``.
    X : feature matrix the model is evaluated on; ``len(X)`` is taken as the
        number of observations.
    Y : observed response values, same length as ``X``.
    d : polynomial degree, used both in the printed header and as the number
        of predictors when computing the residual degrees of freedom.

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    ValueError
        If there are not more than ``d + 1`` observations, in which case the
        RSE is undefined.
    """
    print("Modelo polinomial de grado " + str(d))
    print("El modelo explica en un " + str(round(lm.score(X,Y)*100,2)) + "% a la variable real")
    SSD = sum((Y-lm.predict(X))**2)
    # RSE = sqrt(SSD / (n - p - 1)). The whole denominator must be
    # parenthesised; `SSD/len(X)-1` divides by n and then subtracts 1.
    residual_dof = len(X) - d - 1
    if residual_dof <= 0:
        raise ValueError("Se necesitan más de d + 1 observaciones para calcular el RSE")
    RSE = np.sqrt(SSD / residual_dof)
    error = RSE/np.mean(Y)
    print("Error del modelo: " + str(round(error*100,2)) + "%")
    print("---------------------------------------------")

In [None]:
# Fit and report one polynomial model per degree from 2 through 6.
for d in (2, 3, 4, 5, 6):
    poly = PolynomialFeatures(d)
    X_poly = poly.fit(X).transform(X)
    lm_car_poly = linear_model.LinearRegression().fit(X=X_poly, y=Y)
    regression_validation(lm_car_poly, X_poly, Y, d)

## El problema de los outliers

In [None]:
# Scatter of displacement against mpg, drawn as red dots.
plt.plot(df_cars.loc[:, "displacement"], df_cars.loc[:, "mpg"], "ro")

In [None]:
# Mean-impute missing values; X is kept as a one-column DataFrame because the
# estimator is fitted on a 2-D input.
X = df_cars["displacement"].fillna(df_cars["displacement"].mean()).to_frame()
Y = df_cars["mpg"].fillna(df_cars["mpg"].mean())

# Fit mpg on displacement; the last expression shows the fitted estimator.
lm = LinearRegression()
lm.fit(X=X, y=Y)

In [None]:
print("Factor de R cuadrado: " + str(lm.score(X,Y)))
%matplotlib inline
plt.plot(X,Y,"ro")
plt.plot(X, lm.predict(X), "blue")

In [None]:
# Rows that combine displacement above 250 with mpg above 35.
df_filter = df_cars[df_cars["displacement"].gt(250) & df_cars["mpg"].gt(35)]

df_filter