# Modelo de regresion lineal con sklearn y tratamiento de variables categoricas
Realizare mi modelo de regresion lineal usando sklearn para que sea rapido y efectivo

In [48]:
from sklearn.feature_selection import RFE #Resourced Fisher Elimination
from sklearn.svm import SVR #super vector machine
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../../datasets/ecom-expense/Ecom Expense.csv")
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


las varaibles Gender y City Tier son **categoricas**, vamos a crear para cada una varaibles Dummy

In [3]:
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender")
dummy_city_tier = pd.get_dummies(df["City Tier"], prefix="City")
#esto dara una columna para cada valor y las llenara con 0´s y 1´s si aplica o no

In [4]:
dummy_gender.head()

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [5]:
dummy_city_tier.head()

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


Ahora debemos hacer un **Join con el dataset principal**

In [6]:
column_names = df.columns.values.tolist() #tomo los nombres de las columnas originales  y los guardo en una lista (array)
column_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [7]:
df_new = df[column_names].join(dummy_gender)#uno las columnas actuales con los datos dummy de genero
colum_names = df_new.columns.values.tolist()#guardo los nuevos nombres de columnas
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0


Ahora agregamos el valor de City

In [8]:
df_new = df_new[colum_names].join(dummy_city_tier)
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


Lo siguiente es ver como integrar estos valores al modelo y como acceder a sus respectivos coeficientes. **Vamos a obtener solo lo que necesitamos**

In [9]:
feature_cols_1 = ['Age ', ' Items ','Monthly Income','Transaction Time', 
                'Gender_Male', 'City_Tier 1', 'Record'] #seran variables predictorias

#
# feature_cols_1 = ['Age ', ' Items ','Monthly Income','Transaction Time', 'Gender_Female', 
#                'Gender_Male', 'City_Tier 1', 
#                'City_Tier 2', 'City_Tier 3', 'Record'] #seran variables predictorias

Se incluyen todas las variables para que la libreria decida cuales son la mejor opcion.

In [10]:
X = df_new[feature_cols_1]#varaibles predictorias en x
Y = df_new["Total Spend"]#variable a predecir

indico el estimador y modelo a utilizar

In [11]:
estimator = SVR(kernel="linear")
selector = RFE(estimator,5, step=1) #le indico que quiero 8 varaibles
selector = selector.fit(X,Y)

In [12]:
selector.support_ #que variables se quedaron? 

array([ True,  True, False, False,  True,  True,  True])

In [13]:
selector.ranking_ 

array([1, 1, 2, 3, 1, 1, 1])

Ahora realizemos la regresion lineal

In [14]:
from sklearn.linear_model import LinearRegression

In [29]:
X_pred= X[['Age ', ' Items ', 'Gender_Male', 'City_Tier 1', 'Record']]#varaibles que se incluyeron en el modelo

In [30]:
lm = LinearRegression()
lm.fit(X_pred,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
lm.intercept_ #la alpha

1584.8038308339592

In [32]:
lm.coef_

array([  7.32203829,  35.88374049, 281.76545906,  66.09652978,
       778.26657405])

In [33]:
lm.score(X_pred,Y) #valor de R^2 ajustado

0.7422980676778793

In [37]:
lm.get_params

<bound method BaseEstimator.get_params of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)>

In [45]:
df_new["prediction"] = lm.predict(X_pred)

In [46]:
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,6208.596244
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4382.402396
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,4161.959383
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,7859.589439
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3652.426757


In [49]:
SSD = np.sum((df_new["prediction"] - df_new["Total Spend"])**2)
SSD

4769348004.145473

In [52]:
RSE = np.sqrt(SSD/(len(df_new)-5-1))
RSE

1422.7934699478565

In [53]:
sales_mean=np.mean(df_new["Total Spend"])
sales_mean

6163.176415976714

In [54]:
error = RSE/sales_mean
error*100 #100 para hacerlo en porcentaje

23.085392562503472