In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn import preprocessing 

%matplotlib inline

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,  AdaBoostRegressor
from sklearn import model_selection as ms
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error

In [None]:
# Read the labelled training table and the unlabelled test table from the
# notebook's working directory.
data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("data_train.csv", "data_test.csv")
)

In [None]:
# Target column to predict; every remaining column is used as a feature.
y = data['price_doc']
X = data.drop('price_doc', axis=1)

# Convert categorical data using LabelEncoder

In [None]:
# Collect the feature columns stored with the generic `object` dtype (strings /
# mixed values); these are the ones that get label-encoded below.
# The builtin `object` is used instead of the `np.object` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
filteredColumns = X.dtypes[X.dtypes == object]
listOfColumnNames = list(filteredColumns.index)
print("Number of columns with data type object are : ",len(listOfColumnNames))
print("\n",listOfColumnNames)

Number of columns with data type object are :  16

 ['timestamp', 'product_type', 'sub_area', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [None]:
# Label-encode every object-typed column in both the training features and the
# test table.
#
# One encoder per column is fitted on the union of the train and test values,
# so a given category maps to the same integer code in both frames. Fitting a
# separate encoder on each frame would assign codes independently, and the
# codes seen at prediction time would not mean what they meant during training.
for column in tqdm(listOfColumnNames):

    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([X[column], test_data[column]], ignore_index=True))

    X[column] = le.transform(X[column])
    test_data[column] = le.transform(test_data[column])

100%|██████████| 16/16 [00:00<00:00, 108.66it/s]
100%|██████████| 16/16 [00:00<00:00, 402.21it/s]


# Random Forest Regression

In [None]:
# Hold out 20% of the labelled rows for evaluation; the split is seeded so it
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
%%time
# Fit a depth-limited random forest and wrap it in SelectFromModel so that
# features can later be filtered by their importance in that forest.
# The forest is seeded: without a random_state the importances, and therefore
# the set of selected features, change from one run of the notebook to the next.
RF = SelectFromModel(
    RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42)
)
RF.fit(X_train, y_train)

CPU times: user 4min 53s, sys: 156 ms, total: 4min 53s
Wall time: 2min 29s


In [None]:
# Reduce both splits to the columns kept by the fitted selector and report the
# resulting shapes.
train_filtered, test_filtered = (
    RF.transform(split) for split in (X_train, X_test)
)

print(train_filtered.shape, test_filtered.shape)

(24376, 28) (6095, 28)


In [None]:
%%time
# Hyper-parameter grid sampled by the randomized search.
prams={
     'n_estimators' : [50,100,150,200,250],
     'max_depth' : [10,11,12,13,14,15,16,17,18,19,20,25,30]
}

# Randomized search over the grid above with 2-fold cross-validation on the
# selected features. Both the candidate sampling and the forest itself are
# seeded so that best_params_ / best_score_ are reproducible across runs.
random_rf=RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=prams,
    verbose=10,
    n_jobs=-1,
    cv=2,
    random_state=42,
)
random_rf.fit(train_filtered,y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
CPU times: user 1min 21s, sys: 340 ms, total: 1min 21s
Wall time: 7min 33s


In [None]:
# Show the winning parameter combination and its mean cross-validated score,
# one per line.
print(random_rf.best_params_, random_rf.best_score_, sep="\n")

{'n_estimators': 200, 'max_depth': 19}
0.6582318688593403


In [None]:
# Build the final forest from the parameters chosen by the randomized search.
# NOTE(review): the search was run on `train_filtered`, but this model is fitted
# on the full `X_train` feature set — confirm that this is intended.
RF_final = RandomForestRegressor(
    n_estimators=random_rf.best_params_['n_estimators'],
    max_depth=random_rf.best_params_['max_depth'],
    random_state=42,
    n_jobs=-1,
)
RF_final.fit(X_train, y_train)

RandomForestRegressor(max_depth=19, n_estimators=200, n_jobs=-1,
                      random_state=42)

In [None]:
# Report the mean squared error of the final model on the training split and
# on the held-out split, in that order.
for message, features, target in (
    ("MSE for Train : %.2f", X_train, y_train),
    ("MSE for Test : %.2f", X_test, y_test),
):
    ypred = RF_final.predict(features)
    mse = mean_squared_error(target, ypred)
    print(message % mse)



MSE for Train : 1231985507274.02
MSE for Test : 7930161763731.77


In [None]:
def evaluate_model(model, X, y, scoring='neg_mean_squared_error', n_splits=10, n_repeats=3):
    """Cross-validate a regression model and return the per-fold scores.

    Uses repeated (non-stratified) K-fold splitting and a regression metric.
    Stratified splitting and the 'accuracy' scorer are classification tools and
    are not meaningful for a regressor with a continuous target.

    Parameters
    ----------
    model : estimator
        Unfitted (or clonable) scikit-learn regressor to evaluate.
    X, y : array-like
        Features and target passed through to ``cross_val_score``.
    scoring : str, default 'neg_mean_squared_error'
        Scorer name understood by scikit-learn. With the default, the returned
        values are negated MSEs, so values closer to zero are better.
    n_splits : int, default 10
        Number of folds per repetition.
    n_repeats : int, default 3
        Number of times the K-fold split is repeated with different shuffles.

    Returns
    -------
    numpy.ndarray
        One score per fold, ``n_splits * n_repeats`` values in total.
    """
    # Imported here because the notebook's import cell only brings in the
    # stratified splitter.
    from sklearn.model_selection import RepeatedKFold

    # Seeded so the folds are identical on every run.
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)
    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    return scores

evaluate_model(RF_final, X_train, y_train)



KeyboardInterrupt: ignored

In [None]:
# Select the test features by name, in the same column order the model was
# fitted with. Passing a bare `.values` array relies on positional alignment
# only, so a missing or reordered column would be scored silently; selecting by
# name raises a KeyError instead.
Xtest = test_data[X_train.columns]
prediction = RF_final.predict(Xtest)

# Re-read only the `id` column from the raw file and attach the predictions.
# Loading the single column directly avoids assigning into a slice of a larger
# frame, which can trigger pandas' SettingWithCopyWarning.
output = pd.read_csv('data_test.csv', usecols=['id'])
output['price_doc'] = prediction
output.to_csv('Submission_RF.csv',index=False)