In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import plotly.express as px
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from statsmodels.formula.api import ols
import statsmodels.api as sm

# First iteration : Baseline

In [2]:
first_it = pd.read_pickle("data/first_iteration.pkl")
first_it.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          16512 non-null  int64  
 1   longitude           16512 non-null  float64
 2   latitude            16512 non-null  float64
 3   housing_median_age  16512 non-null  float64
 4   total_rooms         16512 non-null  float64
 5   total_bedrooms      16512 non-null  float64
 6   population          16512 non-null  float64
 7   households          16512 non-null  float64
 8   median_income       16512 non-null  float64
 9   median_house_value  16512 non-null  float64
 10  ocean_proximity     16512 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [3]:
# define X, y
y = first_it["median_house_value"]
X = first_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [5]:
log_model = LinearRegression()

log_model.fit(X_train, y_train)

# Evaluate the model with the test set
log_model.score(X_test, y_test)

0.6311871393336708

In [6]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6348249131619211

# Second iteration : Outliers

In [7]:
second_it = pd.read_pickle("data/second_iteration.pkl")
second_it.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15262 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          15262 non-null  int64  
 1   longitude           15262 non-null  float64
 2   latitude            15262 non-null  float64
 3   housing_median_age  15262 non-null  float64
 4   total_rooms         15262 non-null  float64
 5   total_bedrooms      15262 non-null  float64
 6   population          15262 non-null  float64
 7   households          15262 non-null  float64
 8   median_income       15262 non-null  float64
 9   median_house_value  15262 non-null  float64
 10  ocean_proximity     15262 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [8]:
# define X, y
y = second_it["median_house_value"]
X = second_it.drop(["ocean_proximity","median_house_value"], axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [10]:
log_model.fit(X_train, y_train)

# Evaluate the model with the test set
log_model.score(X_test, y_test)

0.6076476528019652

In [11]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6334931888672133

# Third iteration : Median imputation

In [12]:
third_it = pd.read_pickle("data/third_iteration.pkl")
third_it.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          16512 non-null  int64  
 1   longitude           16512 non-null  float64
 2   latitude            16512 non-null  float64
 3   housing_median_age  16512 non-null  float64
 4   total_rooms         16512 non-null  float64
 5   total_bedrooms      16512 non-null  float64
 6   population          16512 non-null  float64
 7   households          16512 non-null  float64
 8   median_income       16512 non-null  float64
 9   median_house_value  16512 non-null  float64
 10  ocean_proximity     16512 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [13]:
# define X, y
y = third_it["median_house_value"]
X = third_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [15]:
log_model.fit(X_train, y_train)

# Evaluate the model with the test set
log_model.score(X_test, y_test)

0.6322340999259999

In [16]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6358408199481806

# Fourth iteration : Mean imputation

In [17]:
fourth_it = pd.read_pickle("data/fourth_iteration.pkl")

In [18]:
# define X, y
y = fourth_it["median_house_value"]
X = fourth_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [19]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6358408199481806

# Fifth iteration : iterative imputation

In [20]:
fifth_it = pd.read_pickle("data/fifth_iteration.pkl")

In [21]:
# define X, y
y = fifth_it["median_house_value"]
X = fifth_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [22]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6348249131619182

# Sixth iteration : ocean_proximity encoding

In [23]:
sixth_it = pd.read_pickle("data/sixth_iteration.pkl")

In [24]:
# define X, y
y = sixth_it["median_house_value"]
X = sixth_it.drop(["median_house_value"], axis=1)

In [25]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6464622037671589

# Seventh iteration : normalization

In [26]:
seventh_it = pd.read_pickle("data/seventh_iteration.pkl")

In [27]:
# define X, y
y = seventh_it["median_house_value"]
X = seventh_it.drop(["median_house_value"], axis=1)

In [2]:
cv_results = cross_validate(log_model, X, y, cv=5,scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))
cv_results["test_r2"].mean()

NameError: name 'cross_validate' is not defined

# Eighth iteration : feature selection

In [34]:
from sklearn.feature_selection import SelectFromModel

In [35]:
lsvc = log_model.fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)



In [41]:
PD = pd.DataFrame(X_new)
PD

Unnamed: 0,0,1,2,3,4,5
0,-0.137635,0.534564,-0.032827,-1.258403,1.0,0.0
1,0.879836,-0.909979,-0.494784,1.610623,0.0,0.0
2,-0.312201,0.455091,-0.435204,-1.220425,1.0,0.0
3,0.620480,-0.713633,-0.723603,-1.233736,0.0,0.0
4,-0.830911,1.011403,-0.263373,0.114837,1.0,0.0
...,...,...,...,...,...,...
16507,-1.165080,1.852884,-0.595810,-0.423409,1.0,0.0
16508,-1.339646,1.161000,-0.262510,-0.660681,0.0,0.0
16509,-0.825923,1.539666,-0.197750,0.333675,1.0,0.0
16510,0.710257,-0.685584,0.959301,-0.680911,0.0,0.0


In [42]:
# Get columns to keep and create new dataframe with those only
cols = log_model.get_support(indices=True)
features_df_new = PD.iloc[:,cols]

AttributeError: 'LinearRegression' object has no attribute 'get_support'

In [None]:
# Create and fit selector
selector = SelectKBest(f_classif, k=7)
selector.fit(X, y)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
features_df_new = features_df.iloc[:,cols]

Tree selection

In [1]:
data = immo_df3
X = data.iloc[:,0:10]  #independent columns
y = data.iloc[:,-1]    #target column i.e price range
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X,y)
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.featureimportances, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

NameError: name 'immo_df3' is not defined