In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score


In [33]:
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [34]:
#filtrar solo columnas numericas
columnas_numericas=df_train.select_dtypes(include=['float64','int64'])
#Correlacion entre variables
correlacion=columnas_numericas.corr()
#Filtro la correlacion para SalePrice
correlacion['SalePrice'].sort_values(ascending=False).head(10)

SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
Name: SalePrice, dtype: float64

In [35]:
X_train=df_train[['OverallQual','GrLivArea','GarageCars','YearBuilt','TotalBsmtSF']]
X_test=df_test[['OverallQual','GrLivArea','GarageCars','YearBuilt','TotalBsmtSF']]
y_train=df_train[['SalePrice']]
#Si no hay dato es porque es nulo
X_train=X_train.fillna(0)
X_test=X_test.fillna(0)

In [39]:
#LinearRegression para predecir valor continuo
modelo=LinearRegression()
modelo.fit(X_train,y_train)
#Prediccion
predicciones=modelo.predict(X_test)
#print(predicciones.shape)
#Predicciones es 2D, lo paso a 1D para hacer dataframe de submission
predicciones=predicciones.flatten()
#print(predicciones)
#Genero submission.csv
submission=pd.DataFrame({'Id': df_test['Id'],
                         'SalePrice':predicciones})
submission.head()
submission.to_csv('HousePrice_LinearRegression_MaxiFrank7.csv', index=False)

In [38]:
#Usando el randomforest(Bosque Aleatorio). Son arboles de decision 
#n_estimators=100 -> Crea 100 Ã¡rboles
modeloRF = RandomForestRegressor(n_estimators=100, random_state=42)
modeloRF.fit(X_train, y_train)
prediccionesRF=modeloRF.predict(X_test)
print(prediccionesRF.shape)
submission=pd.DataFrame({'Id': df_test['Id'],
                         'SalePrice':prediccionesRF})
submission.head()
submission.to_csv('HousePrice_RandomForest_MaxiFrank7.csv', index=False)

  return fit_method(estimator, *args, **kwargs)


(1459,)


In [None]:
#Para tener en cuenta columnas numericas como kitchen o neighborhood uso pd.get_dummies
#pd.get_dummies es un one hot encoding. Hay 25 barrios, pone 1 en el barrio actual y 0 en los demas
features=['OverallQual','GrLivArea','GarageCars','YearBuilt','TotalBsmtSF','Neighborhood', 'KitchenQual']
y = df_train['SalePrice']
#unimos los datos para que el get_dummies genere las mismas columnas para todos
df_total = pd.concat([df_train[features], df_test[features]])
#get_dummies sobre todo y relleno nulls
df_total_dummies = pd.get_dummies(df_total)
df_total_dummies = df_total_dummies.fillna(0)
#separo el train del test, de 0 a 1460 es train de 1461+ es test
X_train_dummies=df_total_dummies.iloc[:1460]
X_test_dummies=df_total_dummies.iloc[1460:]
#entreno el modelo
modelorf2= RandomForestRegressor(n_estimators=100, random_state=42)
modelorf2.fit(X_train_dummies,y_train)
#predicciones
prediccionesrf2=modelorf2.predict(X_test_dummies)
#Genero submission de nuevo
submission=pd.DataFrame({'Id': df_test['Id'],
                         'SalePrice':prediccionesrf2})
submission.head()
submission.to_csv('HousePrice_RandomForestDummies_MaxiFrank7.csv', index=False)


  return fit_method(estimator, *args, **kwargs)
