In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn import model_selection, preprocessing

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_train = pd.read_csv("./dataset/train_label_balanced.csv")
df_train.shape

(3300, 82)

In [3]:
df_test = pd.read_csv("./dataset/test.csv")
df_test.shape

(1459, 80)

In [4]:
df_SalePrice = df_train['SalePrice']

In [5]:
df_train.drop(['SalePrice_label', 'Id', 'SalePrice'], axis=1, inplace=True)
df_train.shape

(3300, 79)

In [6]:
df_test.drop(['Id'], axis=1, inplace=True)
df_test.shape

(1459, 79)

In [7]:
df = pd.concat([df_train, df_test])

In [None]:
df.shape

In [None]:
df.info()

### Numeric

In [8]:
df_num = df.select_dtypes(include='number')
df_num.shape

(4759, 36)

In [9]:
df_num.fillna(df_num.mean(), inplace=True)
df_num.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


(4759, 36)

In [None]:
df_num.info()

In [None]:
df_num.describe()

In [10]:
dic_c_unique_len = {}
for c in df_num.columns:
    dic_c_unique_len[c] = len(df_num[c].unique())
    #print(c, len(df_num[c].unique()))
 

df_c_unique_len = pd.DataFrame.from_dict(dic_c_unique_len, orient="index").reset_index()
df_c_unique_len.columns=['variable', 'count']
df_c_unique_len[df_c_unique_len['count'] < 20].sort_values('count')

Unnamed: 0,variable,count
19,HalfBath,3
17,BsmtHalfBath,4
21,KitchenAbvGr,4
16,BsmtFullBath,5
18,FullBath,5
23,Fireplaces,5
35,YrSold,5
25,GarageCars,7
20,BedroomAbvGr,8
4,OverallCond,9


In [None]:
df_num.shape

In [11]:
df_num_dummies_list = []


for index, row in df_c_unique_len[df_c_unique_len['count'] < 20].iterrows():
    df_num_dummies_list.append( pd.get_dummies(df_num[row['variable']], prefix=row['variable']))
    df_num.drop([row['variable']], axis=1, inplace=True)
    #print(row['variable'], row['count'])
    
len(df_num_dummies_list)

df_num_dummies = df_num_dummies_list[0]

for i in range(1, len(df_num_dummies_list)-1):
    df_num_dummies = df_num_dummies.join(df_num_dummies_list[i])

df_num_dummies.head()    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


MemoryError: Unable to allocate 39.9 MiB for an array with shape (14, 2989873) and data type uint8

In [None]:
df_num.head()

In [None]:
df_num.shape

##### df_num doit être standardisé

### Object

In [None]:
df_cat = df.select_dtypes(include='object')
df_cat.shape 

In [None]:
df_cat.info()

##### Remplacer les nan par la valeur la plus présente

In [None]:
df_cat.fillna(df_cat.mode().iloc[0], inplace=True)

In [None]:
df_cat.info()

In [None]:
dic_c_cat_unique_len = {}
for c in df_cat.columns:
    dic_c_cat_unique_len[c] = len(df_cat[c].unique())
    #print(c, len(df_num[c].unique()))
 

df_c_cat_unique_len = pd.DataFrame.from_dict(dic_c_cat_unique_len, orient="index").reset_index()
df_c_cat_unique_len.columns=['variable', 'count']
df_c_cat_unique_len[df_c_cat_unique_len['count'] < 20].sort_values('count')

In [None]:
df_cat_dummies_list = []


for index, row in df_c_cat_unique_len.iterrows():
    df_cat_dummies_list.append( pd.get_dummies(df_cat[row['variable']], prefix=row['variable']))
    df_cat.drop([row['variable']], axis=1, inplace=True)
    #print(row['variable'], row['count'])
    
len(df_cat_dummies_list)



In [None]:


df_cat_dummies = pd.DataFrame() #df_cat_dummies_list[0]

for i in range(0, len(df_cat_dummies_list)-1):
    df_cat_dummies = df_cat_dummies.join(df_cat_dummies_list[i])
    print(i, len(df_cat_dummies_list[i].columns))
    if i%11 == 0:
        df_cat_dummies.to_csv(f"./dataset/tmp/df_cat_dummies_{i}.csv", index=False)
        df_cat_dummies = pd.DataFrame()
        print('init df')
    

df_cat_dummies.head() 

### Standardisation des variables numériques (hors hot spot)

In [None]:
scaler = preprocessing.StandardScaler().fit(df_num)
df_num[df_num.columns] = pd.DataFrame(scaler.transform(df_num), index=df_num.index)

In [None]:
df_num.head()

In [None]:
print(df_num.shape, df_num_dummies.shape, df_cat_dummies.shape)

In [None]:
data = df_num.join(df_num_dummies).join(df_cat_dummies) #df.drop(['SalePrice'], axis=1)
target = df_SalePrice

### Réduction de dimension (PCA)

In [None]:
pca = PCA(n_components = 0.9)
pca.fit(data)
print("Nombre de composantes retenues :", pca.n_components_)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=101)

In [None]:
pca = PCA(n_components = 0.9)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

### Elastic Net

In [None]:
model_en = ElasticNetCV(cv=8, l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 1), 
                        alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1))

model_en.fit(X_train_pca, y_train)

In [None]:
alphas = model_en.alphas_

plt.figure(figsize=(10, 10))

for i in range(model_en.mse_path_.shape[0]) :
    plt.plot(alphas, model_en.mse_path_[i,:,:].mean(axis=1),
             label='Moyenne pour l1_ratio= %.2f' %model_en.l1_ratio[i], linewidth=2)

plt.xlabel('Alpha')
plt.ylabel('Mean squared error')
plt.title('Mean squared error pour chaque $\lambda$')
plt.legend();

In [None]:
pred_train = model_en.predict(X_train_pca)
pred_test = model_en.predict(X_test_pca)
print("rmse train :", np.sqrt(mean_squared_error(y_train, pred_train)))
print('rmse test :', np.sqrt(mean_squared_error(y_test, pred_test)))

In [None]:
print("score train :",model_en.score(X_train_pca, y_train))
print("score test :", model_en.score(X_test_pca, y_test))

In [None]:
moy = scaler.mean_[-1]
ec = scaler.scale_[-1]
print("moyenne :", moy)
print("ecart-type :", ec)

pd.DataFrame({'points_obsérvés': (y_test*ec)+moy, 'points_predits': np.round((pred_test*ec)+moy)}, 
             index=X_test.index).head(7)