In [1]:
import pandas as pd
import numpy as np

In [2]:
# "df" is short for DataFrame.
# Columns that carry no useful information for this analysis.
irrelevant_columns = ['id', 'date', 'zipcode']
# Read the raw CSV and discard those columns in a single chained expression.
df = pd.read_csv("kc_house_data.csv").drop(irrelevant_columns, axis=1)
# head() shows the first 5 rows of the dataset by default.
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


In [3]:
# The dataset holds 21613 cases (houses), one per row.
# 18 columns remain after the drop: the target `price` plus 17 attributes.
df.shape

(21613, 18)

In [4]:
# Most attributes are stored as integers (int64);
# float64 is used where a fractional value has to be represented.
# Every column reports 21613 non-null entries, i.e. there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 18 columns):
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(13)
memory usage: 3.0 MB


In [5]:
# Summary statistics per column.
# On average the houses sell for about USD 540,000 (mean of `price`).
df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,47.560053,-122.213896,1986.552492,12768.455652
std,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,0.138564,0.140828,685.391304,27304.179631
min,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,47.1559,-122.519,399.0,651.0
25%,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,47.471,-122.328,1490.0,5100.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,47.5718,-122.23,1840.0,7620.0
75%,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,47.678,-122.125,2360.0,10083.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,47.7776,-121.315,6210.0,871200.0


In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Standardise every column to zero mean and unit variance:
#   z = (x - mu) / sigma
scaler = StandardScaler()
standardised_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(standardised_values, columns=df.columns)
# The target is not kept in standardised form: it is replaced by the natural
# log of the raw price. The original author's rationale is that this
# linearisation makes a linear regression on the target sensible.
df_scaled['price'] = np.log(df['price'])

In [42]:
import sklearn.linear_model as lm

# Predictors: every row and every column except the first one (price).
features = df_scaled.iloc[:, 1:]
# N = number of cases (21613), p = number of predictors (intercept excluded).
N, p = features.shape
# Append a constant column of ones as the last column; its coefficient plays
# the role of the intercept in the linear model.
X = features.assign(intercept=np.ones(N))
# Regression target: the price column of df_scaled.
y = df_scaled['price']

# Static (unshuffled) split: the first 70% of the rows are used for training.
limit = int(N * 0.7)
# mascara is a float vector with 1.0 for the first `limit` rows and 0.0 after.
mascara = (np.arange(N) < limit).astype(float)
# Boolean view of the mask: True where the row belongs to the training set.
istrain = mascara.astype(bool)
isheldout = ~istrain

# Training portion (70%) and test portion (remaining 30%).
Xtrain, ytrain = X[istrain], y[istrain]
Xtest, ytest = X[isheldout], y[isheldout]

# fit_intercept=False because X already carries an explicit column of ones,
# so the intercept is estimated as that column's coefficient.
linreg = lm.LinearRegression(fit_intercept=False)
# Fit the linear regression on the training data only.
linreg.fit(Xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [67]:
# Coefficient table: fitted weight and z-score for every column of the design
# matrix (the p predictors plus the explicit intercept column).
#
#   z_j = beta_j / (sigma_hat * sqrt(v_j))
#
# where v_j is the j-th diagonal element of (Xtrain^T Xtrain)^-1 and sigma_hat
# is the residual standard deviation estimated on the training set.
weights = linreg.coef_
atribute_names = list(X.columns.values)

# Residuals of the fit on the training data.
yhat = linreg.predict(Xtrain)
yerror = ytrain - yhat
yerror2 = yerror**2
Syerror2 = np.sum(yerror2)

# Degrees of freedom must use the number of rows the residuals come from
# (the training rows), not the size of the whole dataset. p counts the
# predictors without the intercept, hence the extra -1.
n_train = Xtrain.shape[0]
sigma = np.sqrt(Syerror2 / (n_train - p - 1))

# (X^T X)^-1 has to be built from the same design matrix the model was fitted
# on, i.e. Xtrain rather than the full X.
# NOTE(review): in the displayed rows sqft_living == sqft_above + sqft_basement;
# if that holds for every row X^T X is singular and this inverse is
# numerically unreliable -- confirm and consider dropping one of the columns.
v = np.linalg.inv(np.dot(np.transpose(Xtrain), Xtrain))
vj = v.diagonal()

# Vectorised z-scores, one per coefficient (replaces the per-index loop).
zscores = weights / (sigma * np.sqrt(vj))

# Build the dictionary first, then the table: the table depends on it.
d = {'Pesos': pd.Series(weights, index=atribute_names),
     'Z-Score': pd.Series(zscores, index=atribute_names)}
table1 = pd.DataFrame(d)
table1

  del sys.path[0]
