# 3.1 - Ingeniería de características


**Regresión lineal. Ecuación de la recta**

$$y = m \cdot x + b$$

In [None]:
import pandas as pd
import numpy as np
import pylab as plt

In [None]:
# Sample the straight line y = m*x + b on the integers -2..7 and draw it.
m = 2   # slope
b = 4   # intercept: value of y at x = 0

x = list(range(-2, 8))
y = [m * xi + b for xi in x]

plt.plot(x, y)
# Black segments starting at the origin, marking the y and x axes.
plt.plot([0, 0], [0, 10], color='black')
plt.plot([0, 10], [0, 0], color='black')
# Red dot at (0, b), where the line crosses the y axis.
plt.plot(0, b, 'ro')
plt.title('Recta');

**Generalización de la ecuación de la recta**

$$y=\beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3+\ldots+\beta_nx_n$$

El objetivo de la regresión lineal es obtener los $\beta$:
+ Algebraicamente (ecuación normal): $\beta = (X^{T}X)^{-1}X^{T}Y$
+ Mínimos cuadrados: minimizando la suma de los cuadrados de los residuos

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the auto-mpg dataset from the relative data directory.
auto = pd.read_csv('../data/auto-mpg.csv')

# Preview the first rows.
auto.head()

In [None]:
# Simple linear regression of mpg on acceleration with statsmodels.
y = auto['mpg']                             # response
x = sm.add_constant(auto['acceleration'])   # predictor plus a constant column for the intercept

modelo = sm.OLS(endog=y, exog=x).fit()

In [None]:
# In-sample predictions for the same rows the model was fitted on.
y_pred = modelo.predict(x)

# Show the fitted model's summary as the cell output.
modelo.summary()

In [None]:
# plot: fitted regression line over the raw data

# Read the coefficients from the fitted model instead of copying them by hand
# from the summary table, so the line and its label stay correct if the data
# or the model change.
intercept = modelo.params['const']
slope = modelo.params['acceleration']

# Dedicated names for the plotting grid, so the design matrix `x` and the
# response `y` used to fit the model are not overwritten by this cell.
acc_grid = np.linspace(auto.acceleration.min(), auto.acceleration.max(), 50)
mpg_line = slope * acc_grid + intercept

plt.figure(figsize=(10, 5))

# fitted line
plt.plot(acc_grid, mpg_line,
         label='mpg = {:.2f}*acc + {:.2f}'.format(slope, intercept))

# observed data
plt.scatter(auto.acceleration, auto.mpg, marker='x', c='g', label='mpg')

plt.title('Regresion lineal')
plt.xlabel('Aceleracion')
plt.ylabel('MPG')
plt.legend()
plt.show();

In [None]:
# several predictors at once

# Edit this list to try other columns, e.g. 'acceleration', 'cylinders',
# 'horse_power' or 'displacement'.
columnas = ['weight', 'model_year']

# Fill missing values in each selected column with that column's mean,
# then add the constant column for the intercept.
predictores = auto[columnas]
predictores = predictores.fillna(predictores.mean())
x = sm.add_constant(predictores)

y = auto.mpg

modelo = sm.OLS(y, x).fit()

pred = modelo.predict(x)

modelo.summary()

In [None]:
# Distinct model_year values present in the design matrix.
x.model_year.unique()

**Los coeficientes son los pesos de las características**

In [None]:
from sklearn.linear_model import LinearRegression as LinReg

# Work on a copy of the data without any row that has a missing value.
auto2 = auto.dropna()

# Keep only the numeric predictors. select_dtypes is the public pandas API
# for this; the private DataFrame._get_numeric_data() helper may change or
# disappear between pandas releases.
X = auto2.drop(columns='mpg').select_dtypes(include='number')

y = auto2.mpg

In [None]:
# Fit a scikit-learn linear regression on the raw (unscaled) predictors.
modelo = LinReg().fit(X, y)

modelo

In [None]:
modelo.intercept_   # beta0, the intercept

In [None]:
# Map each feature name to its fitted coefficient.
dict(zip(X.columns, modelo.coef_))

In [None]:
# Preview the predictors before any scaling.
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every column (zero mean, unit variance) and refit the linear model.
escalador = StandardScaler().fit(X)
X_norm = escalador.transform(X)

linreg = LinReg().fit(X_norm, y)

In [None]:
# Coefficients of the model fitted on the standardized features, keyed by column name.
dict(zip(X.columns, linreg.coef_))

In [None]:
# plot: mpg against model year (scatter only; no fitted line is drawn)
# The unused `x = np.linspace(...)` grid was removed: nothing plotted it and
# it overwrote the design matrix `x` built for the statsmodels fit.

plt.figure(figsize=(10, 5))

# data
plt.scatter(auto.model_year, auto.mpg, marker='x', c='g', label='mpg')

plt.title('Regresion lineal')
plt.xlabel('model_year')
plt.ylabel('MPG')
plt.legend()
plt.show();

## Regresión logística


$$y=\frac{1}{1 + e^{-(\beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3+\ldots+\beta_nx_n)}}$$

El objetivo de la regresión logística es obtener los $\beta$.

In [None]:
# Load the iris dataset from the relative data directory.
iris=pd.read_csv('../data/iris.csv')

# Preview the first rows.
iris.head()

In [None]:
# Distinct values of the variety column.
iris.variety.unique()

In [None]:
# Integer label assigned to each iris variety.
dictio = {'Setosa': 0, 'Versicolor': 1, 'Virginica': 2}

# Encode only while the column still holds the text labels, so re-running the
# cell is a no-op instead of a KeyError on the already-encoded integers.
# astype(int) fails loudly if a label is missing from `dictio` (map alone
# would leave a silent NaN).
if not pd.api.types.is_numeric_dtype(iris['variety']):
    iris['variety'] = iris['variety'].map(dictio).astype(int)

iris.head()

In [None]:
# Split the frame into the target (variety) and the predictors (everything else).
y = iris['variety']

X = iris.drop(columns=['variety'])

In [None]:
from sklearn.linear_model import LogisticRegression as LogReg

In [None]:
# Fit a logistic regression with default settings on the iris predictors.
modelo = LogReg().fit(X, y)

modelo

In [None]:
# Intercept term(s) of the fitted logistic model.
modelo.intercept_

In [None]:
# Map each feature to its coefficients; coef_ is transposed so every value
# holds one entry per row of coef_.
dict(zip(X.columns, modelo.coef_.T))

In [None]:
# Per-class coefficients keyed by feature name. Row i of coef_ is paired with
# the i-th name below, matching the 0/1/2 encoding applied to `variety`.
nombres = ('Setosa', 'Versicolor', 'Virginica')

{nombre: dict(zip(X.columns, modelo.coef_[i])) for i, nombre in enumerate(nombres)}

## Decision Tree - Random Forest

In [None]:
from sklearn.tree import DecisionTreeRegressor as DTR

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR

In [None]:
# dtr

# Predictors: every column except the target and the car name.
X = auto2.drop(['mpg', 'car_name'], axis=1)

y = auto2['mpg']

In [None]:
# Fixed random_state so the fitted tree, and the importances read from it,
# are the same on every run.
dtr = DTR(random_state=42).fit(X, y)

In [None]:
# Total of all feature importances.
sum(dtr.feature_importances_)

In [None]:
# Feature importances scaled by 100, keyed by column name.
dict(zip(X.columns, dtr.feature_importances_*100))

In [None]:
# Refit the tree on standardized features and show its importances.
X_norm = StandardScaler().fit_transform(X)

# Fixed random_state keeps the fitted tree reproducible across runs.
dtr = DTR(random_state=42).fit(X_norm, y)

dict(zip(X.columns, dtr.feature_importances_*100))

In [None]:
# plot: mpg against displacement, using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 5))

# data
ax.scatter(auto.displacement, auto.mpg, marker='x', c='g', label='mpg')

ax.set_title('Regresion lineal')
ax.set_xlabel('displacement')
ax.set_ylabel('MPG')
ax.legend()
plt.show();

In [None]:
# Score of the linear model on the standardized features.
# NOTE(review): linreg was fitted on an earlier X_norm; this assumes the
# current X_norm has the same columns in the same order. Confirm.
linreg.score(X_norm, y)

In [None]:
# Score of the tree on the same data it was fitted on.
dtr.score(X_norm, y)

In [None]:
# rfr

# 2000 trees; a fixed random_state makes the forest, and the importances
# read from it, reproducible across runs.
rfr = RFR(n_estimators=2000, random_state=42).fit(X_norm, y)

In [None]:
dict(zip(X.columns, rfr.feature_importances_*100))    # averaged over the 2000 trees

In [None]:
# Total of the forest's feature importances.
sum(rfr.feature_importances_)

In [None]:
# Number of fitted trees in the forest.
len(rfr.estimators_)

In [None]:
# Importances of the first tree in the forest.
rfr.estimators_[0].feature_importances_

In [None]:
dict(zip(X.columns, rfr.estimators_[0].feature_importances_*100))    # first tree only, not the forest average

In [None]:
from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import mean_squared_error as mse

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = tts(X_norm, y, train_size=0.8, random_state=42)

In [None]:
# dtr

# Fixed random_state so the tree and its test error are reproducible.
dtr = DTR(random_state=42)

dtr.fit(X_train, y_train)

y_pred = dtr.predict(X_test)

# RMSE computed as the square root of the MSE. The `squared=False` shortcut
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises TypeError; this form works on every version.
float(np.sqrt(mse(y_test, y_pred)))  # rmse


In [None]:
# Tree score on the held-out rows.
dtr.score(X_test, y_test)   #r2

In [None]:
# rfr

# Fixed random_state so the forest and its test error are reproducible.
rfr = RFR(n_estimators=2000, random_state=42)

rfr.fit(X_train, y_train)

y_pred = rfr.predict(X_test)

# RMSE computed as the square root of the MSE. The `squared=False` shortcut
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises TypeError; this form works on every version.
float(np.sqrt(mse(y_test, y_pred)))  # rmse


In [None]:
# Forest score on the rows it was fitted on.
rfr.score(X_train, y_train)   #r2 train

In [None]:
# Forest score on the held-out rows.
rfr.score(X_test, y_test)   #r2 test

In [None]:
# Mean of the held-out targets; presumably shown to put the RMSE in scale.
y_test.mean()

In [None]:
# Add a pure-noise column; presumably a baseline against which to compare the
# importances of the real features. A seeded generator keeps the noise (and
# anything derived from it) reproducible, and assign() builds a new frame so
# the cell gives the same result however many times it is run.
rng = np.random.default_rng(42)
X = X.assign(random=rng.normal(size=len(X)))

X.head()

In [None]:
# Standardize the predictors (now including the noise column) and refit the tree.
X_norm = StandardScaler().fit_transform(X)

# Fixed random_state keeps the fitted tree and its importances reproducible.
dtr = DTR(random_state=42).fit(X_norm, y)

dict(zip(X.columns, dtr.feature_importances_*100))

In [None]:
# Re-evaluate the forest on a reduced feature set: the noise column and two
# of the original predictors are dropped.
X_nueva = X.drop(columns=['cylinders', 'acceleration', 'random'])

X_norm = StandardScaler().fit_transform(X_nueva)

# Fixed random_state so the split and the resulting metrics are reproducible.
X_train, X_test, y_train, y_test = tts(X_norm, y, train_size=0.8, random_state=42)

# rfr

rfr = RFR(n_estimators=2000, random_state=42)

rfr.fit(X_train, y_train)

y_pred = rfr.predict(X_test)

# RMSE computed as the square root of the MSE. The `squared=False` shortcut
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises TypeError; this form works on every version.
float(np.sqrt(mse(y_test, y_pred)))  # rmse


In [None]:
# Importances of the reduced feature set, scaled by 100.
dict(zip(X_nueva.columns, rfr.feature_importances_*100))    

In [None]:
# Forest score on the rows it was fitted on.
rfr.score(X_train, y_train)   #r2 train

In [None]:
# Forest score on the held-out rows.
rfr.score(X_test, y_test)   #r2 test

## Relleno de NaN

In [None]:
# Frame summary before filling the missing values.
auto.info()

In [None]:
train = auto[auto.horse_power.notna()]  # keep the rows where horse_power is NOT null

In [None]:
# Frame summary of the rows with a known horse_power.
train.info()

In [None]:
# Target is horse_power; predictors are the remaining columns except car_name.
y = train['horse_power']

X = train.drop(['horse_power', 'car_name'], axis=1)

In [None]:
# Forest used to predict the missing horse_power values. A fixed random_state
# makes the imputed values the same on every run.
rfr = RFR(random_state=42)

rfr.fit(X, y)

In [None]:
test = auto[auto.horse_power.isna()]  # keep the rows where horse_power is null

test

In [None]:
# Same column selection as the training predictors, for the rows to fill.
X_test = test.drop(columns=['horse_power', 'car_name'])

X_test

In [None]:
# Predicted horse_power for the rows where it is missing.
y_pred = rfr.predict(X_test)

y_pred

In [None]:
# Write the predictions into exactly the rows they were computed for.
# Indexing by `test.index` (instead of re-deriving the NaN mask from `auto`)
# keeps the target rows aligned with y_pred, and lets the cell be re-run after
# the fill, when the NaN mask would select no rows and no longer match
# y_pred's length.
auto.loc[test.index, 'horse_power'] = y_pred

In [None]:
# Rows still missing horse_power after the fill.
auto[auto.horse_power.isna()]

In [None]:
# Frame summary after filling the missing values.
auto.info()