# CLASE 3: Regresión Lineal y Logística

In [None]:
import utils, pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

pd.set_option('display.max_columns', 50)

# Linear Regression

In [None]:
# Synthetic 1-D regression problem: 100 evenly spaced inputs in [0, 1) and a
# target equal to the input plus Gaussian noise with standard deviation 0.05.
x = np.arange(100) / 100
y = x + np.random.normal(loc=0.0, scale=0.05, size=x.shape)

# The trailing semicolon keeps the notebook from echoing the return value.
plt.scatter(x, y, alpha=0.5);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the last 40% of the points for validation. With shuffle=False the
# original order is kept, so the validation inputs all lie past the largest
# training input.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, shuffle=False, test_size=0.4
)

# One scatter call per split so each gets its own colour.
plt.scatter(x_train, y_train, alpha=0.5)
plt.scatter(x_val, y_val, alpha=0.5);

In [None]:
# scikit-learn estimators take a 2-D feature matrix, so turn each 1-D input
# vector into a single-column array of shape (n_samples, 1).
x_train = x_train.reshape(-1, 1)
x_val = x_val.reshape(-1, 1)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training portion only; `m` is the global that
# score() reads.
m = LinearRegression()
# Kept as the last expression of the cell so the notebook echoes the estimator.
m.fit(x_train, y_train)

In [None]:
def score():
    """Print the current model's score on the train and validation splits.

    Takes no arguments: it reads the notebook globals ``m``, ``x_train``,
    ``y_train``, ``x_val`` and ``y_val`` at call time, so it always reports
    on whatever model and split were bound most recently. Each value is
    ``m.score(...)`` formatted to four significant digits.
    """
    print('Scores:')
    train_score = m.score(x_train, y_train)
    print(f'Train      = {train_score:.4}')
    val_score = m.score(x_val, y_val)
    print(f'Validation = {val_score:.4}')

In [None]:
score()

In [None]:
# Model predictions for both splits, used below to draw the fitted line.
pred_train, pred_val = m.predict(x_train), m.predict(x_val)

In [None]:
# Plot the model's predictions (not the noisy targets) for both splits, which
# shows the fitted line and how it extends over the validation inputs.
plt.scatter(x_train, pred_train, alpha=0.5)
plt.scatter(x_val, pred_val, alpha=0.5);

## Revisitando House Sales Prediction Dataset

House Sales Prediction Data Set: https://www.kaggle.com/harlfoxem/housesalesprediction/home

Abstract: This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

- **id**: a notation for a house
- **date**: Date house was sold
- **price**: Price is prediction target
- **bedrooms**: Number of Bedrooms/House
- **bathrooms**: Number of bathrooms in the house (fractional values denote partial bathrooms)
- **sqft_living**: square footage of the home
- **sqft_lot**: square footage of the lot
- **floors**: Total floors (levels) in house
- **waterfront**: House which has a view to a waterfront
- **view**: Has been viewed (in the data this is a 0–4 rating of the property's view)
- **condition**: How good the condition is ( Overall )
- **grade**: overall grade given to the housing unit, based on King County grading system
- **sqft_above**: square footage of house apart from basement
- **sqft_basement**: square footage of the basement
- **yr_built**: Built Year
- **yr_renovated**: Year when house was renovated
- **zipcode**: zip
- **lat**: Latitude coordinate
- **long**: Longitude coordinate
- **sqft_living15**: Living room area in 2015 (implies some renovations). This might or might not have affected the lot size area
- **sqft_lot15**: Lot size area in 2015 (implies some renovations)

In [None]:
# Load the house-sales CSV, parsing the 'date' column as datetimes on read.
df_raw = pd.read_csv('data/kc_house_data.csv', parse_dates=['date'])
print(df_raw.shape)
# Preview the first rows as the cell output.
df_raw.head()

### Preprocesamiento (Clase 2)

In [None]:
# TODO: apply the Class 2 preprocessing here (the cells below read `df_raw`)

### Train-validation split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target (sale price) from the predictors.
# NOTE(review): this reads df_raw directly, whereas the census section splits
# a preprocessed `df` — confirm which frame the Class 2 preprocessing is
# expected to leave behind here.
y = df_raw['price']
x = df_raw.drop(columns='price')

# 80/20 hold-out split with a fixed seed so results are repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(f'Train shape     : {x_train.shape}')
print(f'Validation shape: {x_val.shape}')

In [None]:
from sklearn.linear_model import LinearRegression

# Rebind the global `m` to a fresh linear model trained on the house data;
# score() reads `m` and the current split variables at call time.
m = LinearRegression()
m.fit(x_train, y_train)

In [None]:
score() 

In [None]:
# Let's look at the MSE on each split, shown as (train, validation).
from sklearn.metrics import mean_squared_error

tuple(
    mean_squared_error(target, m.predict(features))
    for features, target in ((x_train, y_train), (x_val, y_val))
)

In [None]:
# Pair every feature name with its fitted coefficient, largest first.
# NOTE: these are raw coefficients (target units per feature unit), so their
# magnitudes depend on each feature's scale.
imp = pd.DataFrame({'col': x_train.columns, 'importance': m.coef_})
imp = imp.sort_values(by='importance', ascending=False)
imp

In [None]:
m.predict(x_val)[0]

In [None]:
m.intercept_

In [None]:
x_val.iloc[0]

In [None]:
# Break down the prediction for the first validation row: each feature's
# value times its coefficient is that feature's additive contribution.
x0 = x_val.iloc[0].rename('value').to_frame()
x0['col'] = x0.index  # expose the feature name as a column to merge on
x0 = x0.merge(imp, on='col')
x0['contribution'] = x0['value'] * x0['importance']

In [None]:
x0.sort_values('contribution', ascending=False).style.bar(['contribution'])

In [None]:
m.intercept_ + x0.contribution.sum()

In [None]:
# waterfallplot is a helper from the project-local `utils` module.
# NOTE(review): presumably it charts the per-feature contributions of the
# first validation row as a waterfall — confirm the arguments against utils.
from utils import waterfallplot
waterfallplot(x_val.head(1), x0.contribution, formatting='{:,.3f}', size=(13,6), sorted_value=True, threshold=0.05);

# Logistic Regression

## Revisitando Census Income Dataset

Census Income Data Set: http://mlr.cs.umass.edu/ml/datasets/Census+Income

Abstract: Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset.

* **age:** continuous.
* **workclass:** Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
* **fnlwgt:** continuous.
* **education:** Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* **education-num:** continuous.
* **marital-status:** Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* **occupation:** Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* **relationship:** Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
* **race:** White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
* **sex:** Female, Male.
* **capital-gain:** continuous.
* **capital-loss:** continuous.
* **hours-per-week:** continuous.
* **native-country:** United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

In [None]:
# Load the census train and test tables. df_raw is rebound here, so from this
# point on it no longer refers to the house-sales data.
df_raw = pd.read_csv('data/census_train.csv')
df_test = pd.read_csv('data/census_test.csv')
print(df_raw.shape, df_test.shape)
# Preview the first three rows as the cell output.
df_raw.head(3)

### Preprocesamiento (Clase 2)

In [1]:
# TODO: apply the Class 2 preprocessing here (the cells below expect the result in `df`)

### Probando el modelo (holdout validation)

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `df` is not created in the visible cells; presumably it is the
# output of the Class 2 preprocessing step — confirm.
y = df['label']
x = df.drop(columns='label')

# 80/20 hold-out split with a fixed seed so results are repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(f'Train shape     : {x_train.shape}')
print(f'Validation shape: {x_val.shape}')

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression with the liblinear solver. Rebinding the
# global `m` makes score() report on this classifier.
m = LogisticRegression(penalty='l2',solver='liblinear')
m.fit(x_train, y_train)

In [None]:
score()

### Probando el modelo (cross validation)

In [None]:
from sklearn.model_selection import KFold
# Five shuffled folds with a fixed seed, so every call to kfolds.split on data
# of the same length yields the same partitions.
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# 5-fold cross-validation of the logistic regression on the preprocessed data.
x = df.drop('label', axis=1)
y = df['label']

train_scores, val_scores = [], []

for train_idx, val_idx in kfolds.split(x):
    # The split variables and `m` stay bound as notebook globals, exactly as
    # before, so score() can still be called after the loop.
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]
    m = LogisticRegression(penalty='l2', solver='liblinear')
    m.fit(x_train, y_train)

    # Score each split once and reuse the value for both the per-fold report
    # and the running averages (previously every fold was scored twice: once
    # inside score() and once for the append). Output format matches score().
    train_score = m.score(x_train, y_train)
    val_score = m.score(x_val, y_val)
    print('Scores:')
    print(f'Train      = {train_score:.4}')
    print(f'Validation = {val_score:.4}')
    print()
    train_scores.append(train_score)
    val_scores.append(val_score)

print('=========================')
print('Average Scores:')
print(f'Train      = {np.mean(train_scores):.4}')
print(f'Validation = {np.mean(val_scores):.4}')

### Variables categóricas: One-hot encoding

In [None]:
df_raw.head()

In [None]:
# Start again from the raw census frame: split off the target, then one-hot
# encode the categorical predictors. drop_first=True omits the first level of
# each categorical column.
y = df_raw['label']
x = pd.get_dummies(df_raw.drop(columns='label'), drop_first=True)
x.shape

In [None]:
x.head()

### Variables continuas: Normalización

In [None]:
df_raw['workclass'].dtypes

In [None]:
# Names of the numeric columns, treated as the continuous features. They are
# detected on df_raw, i.e. before one-hot encoding added dummy columns to x.
cont_cols = [
    name for name in df_raw.columns
    if pd.api.types.is_numeric_dtype(df_raw[name])
]

In [None]:
cont_cols

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row of x before the K-fold loop
# splits the data, so validation rows contribute to the mean/std — acceptable
# for a demo, but a per-fold fit would avoid the leakage.
scaler = StandardScaler()
scaler.fit(x[cont_cols])
scaled_x = scaler.transform(x[cont_cols])

In [None]:
# Overwrite the continuous columns of x with their standardised values;
# column i of scaled_x corresponds to cont_cols[i].
x[cont_cols] = scaled_x

In [None]:
x.head()

In [None]:
# 5-fold cross-validation of the logistic regression on the one-hot encoded,
# standardised features currently bound to x / y.
train_scores, val_scores = [], []

for train_idx, val_idx in kfolds.split(x):
    # The split variables and `m` stay bound as notebook globals, exactly as
    # before, so score() can still be called after the loop.
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]
    m = LogisticRegression(penalty='l2', solver='liblinear')
    m.fit(x_train, y_train)

    # Score each split once and reuse the value for both the per-fold report
    # and the running averages (previously every fold was scored twice: once
    # inside score() and once for the append). Output format matches score().
    train_score = m.score(x_train, y_train)
    val_score = m.score(x_val, y_val)
    print('Scores:')
    print(f'Train      = {train_score:.4}')
    print(f'Validation = {val_score:.4}')
    print()
    train_scores.append(train_score)
    val_scores.append(val_score)

print('=========================')
print('Average Scores:')
print(f'Train      = {np.mean(train_scores):.4}')
print(f'Validation = {np.mean(val_scores):.4}')

# Principal Component Analysis

In [None]:
sns.set()

In [None]:
from sklearn.decomposition import PCA

# Fit PCA with its default settings on the full feature matrix, then plot the
# cumulative explained-variance ratio against the number of components.
pca = PCA().fit(x)
plt.plot(pca.explained_variance_ratio_.cumsum())

In [None]:
# Fresh 80/20 hold-out split of the current x / y with a fixed seed; this
# rebinds the split globals that the cross-validation loop left behind.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
print(f'Train shape     : {x_train.shape}')
print(f'Validation shape: {x_val.shape}')

In [None]:
# Project both splits onto the principal components fitted earlier.
train_pca, val_pca = pca.transform(x_train), pca.transform(x_val)

In [None]:
# Replace the original row labels of both target Series with 0..n-1, in place
# (drop=True discards the old index instead of keeping it as a column).
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

In [None]:
# Row positions of each income class in the training split. Single-argument
# np.where returns a 1-tuple of index arrays; taking element 0 gives a plain
# 1-D index array, so `train_pca[idx, 0]` is 1-D rather than an accidental
# (1, n) array produced by indexing with the tuple itself.
pos_train_idx = np.where(y_train == ' >50K')[0]
neg_train_idx = np.where(y_train == ' <=50K')[0]

In [None]:
# First vs. second principal component of the training rows, one scatter call
# per class: the ' >50K' rows first, then the ' <=50K' rows.
plt.scatter(train_pca[pos_train_idx,0], train_pca[pos_train_idx,1], alpha=0.5)
plt.scatter(train_pca[neg_train_idx,0], train_pca[neg_train_idx,1], alpha=0.5)

In [None]:
# Row positions of each true income class in the validation split. Taking
# element 0 of np.where's 1-tuple gives a plain 1-D index array, so
# `val_pca[idx, 0]` is 1-D rather than an accidental (1, n) array.
pos_val_idx = np.where(y_val == ' >50K')[0]
neg_val_idx = np.where(y_val == ' <=50K')[0]

In [None]:
# First vs. second principal component of the validation rows, split by the
# index arrays currently bound to pos_val_idx / neg_val_idx.
plt.scatter(val_pca[pos_val_idx,0], val_pca[pos_val_idx,1], alpha=0.5)
plt.scatter(val_pca[neg_val_idx,0], val_pca[neg_val_idx,1], alpha=0.5)

In [None]:
# Train the classifier on the hold-out training split and predict labels for
# the validation rows. fit() returns the estimator, so predict can be chained.
m = LogisticRegression(penalty='l2', solver='liblinear')
y_pred = m.fit(x_train, y_train).predict(x_val)

In [None]:
# Row positions of each *predicted* income class in the validation split
# (this rebinds pos_val_idx / neg_val_idx). Taking element 0 of np.where's
# 1-tuple gives a plain 1-D index array, so `val_pca[idx, 0]` is 1-D rather
# than an accidental (1, n) array.
pos_val_idx = np.where(y_pred == ' >50K')[0]
neg_val_idx = np.where(y_pred == ' <=50K')[0]

In [None]:
# Same validation projection as the earlier plot, now split by the index
# arrays currently bound to pos_val_idx / neg_val_idx (the cell before this
# one derives them from the model's predictions).
plt.scatter(val_pca[pos_val_idx,0], val_pca[pos_val_idx,1], alpha=0.5)
plt.scatter(val_pca[neg_val_idx,0], val_pca[neg_val_idx,1], alpha=0.5)