# Variables Categoricas

In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

In [150]:
data = pd.read_csv('datasets/ecom-expense/Ecom Expense.csv')

In [151]:
data.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [152]:
dummy_gender = pd.get_dummies(data['Gender'], prefix='Gender')
dummy_city_tier = pd.get_dummies(data['City Tier'], prefix='City')

In [153]:
dummy_gender.head()

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [154]:
columns_names = data.columns.values.tolist()

In [155]:
columns_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [156]:
data_new = data[columns_names].join(dummy_gender)
columns_names = data_new.columns.values.tolist()

In [157]:
data_new = data_new[columns_names].join(dummy_city_tier)
data_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [158]:
feature_cols = ['Monthly Income', 'Gender_Male',
                'Gender_Female', 'Transaction Time', 'Record',
                'City_Tier 1', 'City_Tier 2', 'City_Tier 3']

In [159]:
X = data_new[feature_cols]
Y = data_new['Total Spend']

In [160]:
lm = LinearRegression()
lm.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [161]:
lm.intercept_

-79.41713030135725

In [162]:
lm.coef_

array([ 1.47538980e-01,  1.31025013e+02, -1.31025013e+02,  1.54946125e-01,
        7.72233446e+02,  7.67643260e+01,  5.51389743e+01, -1.31903300e+02])

In [163]:
list(zip(feature_cols, lm.coef_))

[('Monthly Income', 0.14753898049205727),
 ('Gender_Male', 131.02501325554667),
 ('Gender_Female', -131.0250132555466),
 ('Transaction Time', 0.15494612549586612),
 ('Record', 772.233445744565),
 ('City_Tier 1', 76.7643260104954),
 ('City_Tier 2', 55.1389743092325),
 ('City_Tier 3', -131.90330031972783)]

In [164]:
lm.score(X,Y)

0.9179923586131016

In [165]:
data_new['Prediccion'] = lm.predict(data_new[feature_cols])

In [166]:
data.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [167]:
SSD = sum((data_new['Prediccion'] - data_new['Total Spend'])**2)
SSD

1517733985.3408127

In [168]:
RSE = np.sqrt(SSD/(len(data_new) - len(feature_cols) - 1))
RSE

803.1318809818156

In [169]:
sales_mean = np.mean(data_new['Total Spend'])
sales_mean

6163.176415976714

In [170]:
error = RSE/sales_mean
error*100

13.031135680294145

## Enmarcado de variables dummies

In [171]:
dummy_gender = pd.get_dummies(data['Gender'], prefix='Gender').iloc[:,1:]
dummy_gender.head()

Unnamed: 0,Gender_Male
0,0
1,0
2,1
3,0
4,0


In [172]:
dummy_city_tier = pd.get_dummies(data['City Tier'], prefix='City').iloc[:,1:]
dummy_city_tier.head()

Unnamed: 0,City_Tier 2,City_Tier 3
0,0,0
1,1,0
2,1,0
3,0,0
4,1,0


In [173]:
columns_names = data.columns.values.tolist()
data_new = data[columns_names].join(dummy_gender)
columns_names += dummy_gender.columns.values.tolist()
data_new = data_new[columns_names].join(dummy_city_tier)
data_new

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Male,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,0,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,1,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,0,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,0,1,0
5,TXN006,49,6,6282,48.974268,2,Male,Tier 2,2375.036467,1,1,0
6,TXN007,21,14,7086,961.203768,8,Male,Tier 1,7494.474559,1,0,0
7,TXN008,58,9,8881,962.253740,10,Male,Tier 3,10782.944920,1,0,1
8,TXN009,20,6,5635,858.328131,5,Male,Tier 1,3854.277411,1,0,0
9,TXN010,48,12,20861,43.036737,4,Female,Tier 2,5346.140262,0,1,0


In [174]:
feature_cols = ['Record', 'Monthly Income', 'Gender_Male',
               'City_Tier 2', 'City_Tier 3', 'Transaction Time']
X = data_new[feature_cols]
Y = data_new['Total Spend']

In [175]:
lm = LinearRegression()
lm.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [176]:
list(zip(feature_cols, lm.coef_))

[('Record', 772.233445744565),
 ('Monthly Income', 0.14753898049204087),
 ('Gender_Male', 262.0500265110922),
 ('City_Tier 2', -21.625351701262616),
 ('City_Tier 3', -208.66762633022282),
 ('Transaction Time', 0.1549461254909616)]

In [177]:
lm.score(X,Y)

0.9179923586131016

## Transformacion de variables para conseguir una relacion no lineal

In [178]:
data_auto = pd.read_csv('datasets/auto/auto-mpg.csv')
data_auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [179]:
columns_names = data_auto.columns.values.tolist()
X_name = columns_names[3]
Y_name = columns_names[0]
X = data_auto[Y_name].fillna(data_auto[Y_name].mean())
Y = data_auto[X_name].fillna(data_auto[X_name].mean())

In [180]:
%matplotlib inline
plt.plot(data_auto[X], data_auto['mpg'], 'ro')
plt.xlabel('HP del vehiculo', color='#FFFFFF')
plt.ylabel('mpg del vehiculo',  color='#FFFFFF')
plt.title('HP vs MPG',  color='#FFFFFF')

KeyError: '[18.         15.         18.         16.         17.         15.\n 14.         14.         14.         15.         23.51457286 23.51457286\n 23.51457286 23.51457286 23.51457286 15.         14.         23.51457286\n 15.         14.         24.         22.         18.         21.\n 27.         26.         25.         24.         25.         26.\n 21.         10.         10.         11.          9.         27.\n 28.         25.         25.         23.51457286 19.         16.\n 17.         19.         18.         14.         14.         14.\n 14.         12.         13.         13.         18.         22.\n 19.         18.         23.         28.         30.         30.\n 31.         35.         27.         26.         24.         25.\n 23.         20.         21.         13.         14.         15.\n 14.         17.         11.         13.         12.         13.\n 19.         15.         13.         13.         14.         18.\n 22.         21.         26.         22.         28.         23.\n 28.         27.         13.         14.         13.         14.\n 15.         12.         13.         13.         14.         13.\n 12.         13.         18.         16.         18.         18.\n 23.         26.         11.         12.         13.         12.\n 18.         20.         21.         22.         18.         19.\n 21.         26.         15.         16.         29.         24.\n 20.         19.         15.         24.         20.         11.\n 20.         21.         19.         15.         31.         26.\n 32.         25.         16.         16.         18.         16.\n 13.         14.         14.         14.         29.         26.\n 26.         31.         32.         28.         24.         26.\n 24.         26.         31.         19.         18.         15.\n 15.         16.         15.         16.         14.         17.\n 16.         15.         18.         21.         20.         13.\n 29.         23.         20.         23.         24.         25.\n 24.         18.         29.         19.         23.         23.\n 22.         25.         33.         28.         25.         25.\n 26.         27.         17.5        16.         15.5        14.5\n 22.         22.         24.         22.5        29.         24.5\n 29.         33.         20.         18.         18.5        17.5\n 29.5        32.         28.         26.5        20.         13.\n 19.         19.         16.5        16.5        13.         13.\n 13.         31.5        30.         36.         25.5        33.5\n 17.5        17.         15.5        15.         17.5        20.5\n 19.         18.5        16.         15.5        15.5        16.\n 29.         24.5        26.         25.5        30.5        33.5\n 30.         30.5        22.         21.5        21.5        43.1\n 36.1        32.8        39.4        36.1        19.9        19.4\n 20.2        19.2        20.5        20.2        25.1        20.5\n 19.4        20.6        20.8        18.6        18.1        19.2\n 17.7        18.1        17.5        30.         27.5        27.2\n 30.9        21.1        23.2        23.8        23.9        20.3\n 17.         21.6        16.2        31.5        29.5        21.5\n 19.8        22.3        20.2        20.6        17.         17.6\n 16.5        18.2        16.9        15.5        19.2        18.5\n 31.9        34.1        35.7        27.4        25.4        23.\n 27.2        23.9        34.2        34.5        31.8        37.3\n 28.4        28.8        26.8        33.5        41.5        38.1\n 32.1        37.2        28.         26.4        24.3        19.1\n 34.3        29.8        31.3        37.         32.2        46.6\n 27.9        40.8        44.3        43.4        36.4        30.\n 44.6        40.9        33.8        29.8        32.7        23.7\n 35.         23.6        32.4        27.2        26.6        25.8\n 23.5        30.         39.1        39.         35.1        32.3\n 37.         37.7        34.1        34.7        34.4        29.9\n 33.         34.5        33.7        32.4        32.9        31.6\n 28.1        23.51457286 30.7        25.4        24.2        22.4\n 26.6        20.2        17.6        28.         27.         34.\n 31.         29.         27.         24.         23.         36.\n 37.         31.         38.         36.         36.         36.\n 34.         38.         32.         38.         25.         38.\n 26.         22.         32.         36.         27.         27.\n 44.         32.         28.         31.        ] not in index'

## Modelo de regresion Lineal

In [None]:
X_data = X[:, np.newaxis]
lm = LinearRegression()
lm.fit(X_data,Y)

In [None]:
%matplotlib inline
plt.plot(X, Y, 'ro')
plt.plot(X, lm.predict(X_data), color='blue')

In [None]:
lm.score(X_data,Y)

In [None]:
SSD = sum((Y - lm.predict(X_data))**2)
SSD

In [None]:
RSE = np.sqrt(SSD / (len(X_data) - 1))
y_mean =  np.mean(Y)
error = RSE / y_mean
error*100, RSE, y_mean

## Modelo de regresion cuadratico
* mpg = a + b * HP^2

In [None]:
X_data = X**2
X_data = X_data[:, np.newaxis]

In [None]:
lm = LinearRegression()
lm.fit(X_data,Y)

In [None]:
%matplotlib inline
plt.plot(X, Y, 'ro')
plt.plot(X, lm.predict(X_data), 'bo')

In [None]:
lm.score(X_data,Y)

In [None]:
SSD = sum((Y - lm.predict(X_data))**2)
RSE = np.sqrt(SSD / (len(X_data) - 1))
y_mean =  np.mean(Y)
error = RSE / y_mean
error*100, RSE, y_mean, SSD

## Polinomio de grado 2 completo

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

In [None]:
poly = PolynomialFeatures(degree=2)

In [None]:
X_data = poly.fit_transform(X[:, np.newaxis])

In [None]:
lm = linear_model.LinearRegression()
lm.fit(X_data,Y)

In [None]:
lm.score(X_data,Y)

## Problema de los Outliers

In [None]:
plt.plot(data_auto['displacement'],data_auto['mpg'], 'ro')

In [None]:
X = data_auto['displacement'].fillna(data_auto['displacement'].mean())
Y = data_auto['mpg'].fillna(data_auto['mpg'].mean())
X = X[:, np.newaxis]
lm = LinearRegression()
lm.fit(X, Y)

In [None]:
lm.score(X,Y)

In [None]:
%matplotlib inline
plt.plot(data_auto['displacement'],data_auto['mpg'], 'ro')
plt.plot(X, lm.predict(X), color='blue')

In [None]:
data_auto[(data_auto['displacement']>300) & (data_auto['mpg']>23)]

In [None]:
data_auto_clean = data_auto.drop([11, 12, 13, 14, 17, 372])

In [None]:
X = data_auto_clean['displacement'].fillna(data_auto_clean['displacement'].mean())
Y = data_auto_clean['mpg'].fillna(data_auto_clean['mpg'].mean())
X = X[:, np.newaxis]
lm = LinearRegression()
lm.fit(X, Y)

In [None]:
lm.score(X, Y)

In [None]:
%matplotlib inline
plt.plot(data_auto_clean['displacement'],data_auto_clean['mpg'], 'ro')
plt.plot(X, lm.predict(X), color='blue')