In [59]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error


In [2]:
df_modeling = pd.read_csv(filepath_or_buffer= "../data/Dataset_for_modeling2.csv") 
df_modeling

Unnamed: 0,Rating,Rating Count,Free,Price,Ad Supported,In App Purchases,Editors Choice,Category_Action,Category_Adventure,Category_Arcade,...,Category_Trivia,Category_Video Players & Editors,Category_Weather,Category_Word,Content_Rating_Adults only 18+,Content_Rating_Everyone,Content_Rating_Everyone 10+,Content_Rating_Mature 17+,Content_Rating_Teen,Content_Rating_Unrated
0,3.9,68.0,1,0.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.0,0.0,1,0.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,4.3,918.0,1,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,5.0,6.0,1,0.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4.3,830.0,1,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5385,2.5,6.0,1,0.0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5386,5.0,17.0,1,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5387,4.3,142.0,1,0.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5388,5.0,9.0,1,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Train-Test-Split

In [3]:
X = df_modeling.drop(columns= 'Rating')
y = df_modeling['Rating']

In [4]:
X.shape

(5390, 60)

In [5]:
y.shape

(5390,)

In [6]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size= 0.2, random_state = 1234)

In [73]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model',LinearRegression())
])

Params = [{
        'model__max_depth' : [None, 10, 20, 30],
        'model__min_samples_split' :[2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]}]

grid_search = GridSearchCV(estimator=pipeline, param_grid=Params, cv=5, verbose = 2)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5; tot

In [74]:
print("Best parameters are %s with a score of %0.2f"
% (grid_search.best_params_, grid_search.best_score_))

Best parameters are {'model__max_depth': 10, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10} with a score of 0.66


We run the model withouth the pca. We can see that the model max_depth reach the minimum range (how deep the tree is) 
the model__min_samples_leaf reach the top range(minumum number of sample at the base of the tree)
the model__min_sample_split reacht the top range(minimum number of samples required to split an internal node) 
We are going to rerun the model with different hyperparameters range and add pca. 


In [75]:
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)



Mean Squared Error: 0.4734985788994681


In [76]:
grid_search.score(X_test, y_test)

0.6291349785594242

In [61]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('model',DecisionTreeRegressor())
])

Params = [{
        'model__max_depth' : [2, 4, 6],
        'model__min_samples_split' :[10, 15, 20],
        'model__min_samples_leaf': [4, 6, 8]}]

grid_search = GridSearchCV(estimator=pipeline, param_grid=Params, cv=5, verbose = 2)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=10; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=10; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=10; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=10; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=10; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=15; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=15; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=15; total time=   0.0s
[CV] END model__max_depth=2, model__min_samples_leaf=4, model__min_samples_split=15; total time=   0.0s
[C

In [62]:
print("Best parameters are %s with a score of %0.2f"
% (grid_search.best_params_, grid_search.best_score_))

Best parameters are {'model__max_depth': 2, 'model__min_samples_leaf': 6, 'model__min_samples_split': 15} with a score of 0.03


For this model the max depth is only at 2 (the lowest range), the min samples leaf is at 6 (middle range) and the min samples split is at 10 (lowest range).


In [63]:
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.2206073838635676


In [64]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('model',DecisionTreeRegressor())
])

Params = [{
        'model__max_depth' : [None, 1],
        'model__min_samples_split' :[2, 4, 8],
        'model__min_samples_leaf': [1, 2, 3, 4]}]

grid_search = GridSearchCV(estimator=pipeline, param_grid=Params, cv=5, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.8s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.7s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   2.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.7s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=4; total time=   1.7s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=4; total time=   1.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=4; total time=   1.5s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=4; tot

In [65]:
print("Best parameters are %s with a score of %0.2f"
% (grid_search.best_params_, grid_search.best_score_))

Best parameters are {'model__max_depth': 1, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2} with a score of 0.02


For this model the max depth is only at 1, the min samples leaf is at 1(lower range) and the min samples split is at 8 (upper range).

In [66]:
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.2688725943809351


In [69]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('model',DecisionTreeRegressor())
])

Params = [{
        'model__max_depth' : [None, 1],
        'model__min_samples_split' :[1, 2, 3, 4, 5, 6, 7],
        'model__min_samples_leaf': [1, 2, 3, 4]}]

grid_search = GridSearchCV(estimator=pipeline, param_grid=Params, cv=5, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=1; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=1; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=1; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=1; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=1; total time=   0.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.8s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.8s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2; tot

40 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lelon\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lelon\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lelon\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\lelon\anaconda3\Lib\site-packages\sklearn\base.py", line

In [70]:
print("Best parameters are %s with a score of %0.2f"
% (grid_search.best_params_, grid_search.best_score_))

Best parameters are {'model__max_depth': 1, 'model__min_samples_leaf': 4, 'model__min_samples_split': 3} with a score of 0.02


In [71]:
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.2688725943809351


In [45]:
# Print the accuracy
metrics.r2_score(y_test, y_predicted)

plt.scatter(y_test, y_predicted)
plt.xlabel('Y Test')
plt.ylabel('Prediction')
plt.show()

plt.hist([y_test-y_predicted])
plt.title('Residuals Histogram')
plt.xlabel('residual')
plt.savefig('residuals.png', dpi=600)
plt.show()

plt.boxplot([y_test, y_predicted], vert=False)



NameError: name 'y_predicted' is not defined