In [211]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder , PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ridge_regression
from sklearn.linear_model import RANSACRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix , f1_score , accuracy_score , classification_report
from sklearn.preprocessing import OneHotEncoder

In [212]:
data = pd.read_csv('data.csv')

In [214]:
data = data.drop(['Unnamed: 0','name'], axis=1)

In [215]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2651 entries, 0 to 2650
Data columns (total 35 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             2651 non-null   object 
 1   2G                2651 non-null   bool   
 2   3G                2651 non-null   bool   
 3   4G                2651 non-null   bool   
 4   5G                2651 non-null   bool   
 5   Announced         2651 non-null   float64
 6   Status            2651 non-null   object 
 7   Weight            2651 non-null   float64
 8   Length            2651 non-null   object 
 9   Width             2651 non-null   object 
 10  Diameter          2651 non-null   object 
 11  SIM               2651 non-null   object 
 12  Display Type      2651 non-null   object 
 13  Display Size      2651 non-null   float64
 14  ppi               2651 non-null   float64
 15  body ratio        2651 non-null   float64
 16  OS                2651 non-null   object 


In [216]:
data = data.drop(['Loudspeaker','3.5mm jack','Bluetooth','Colors','WLAN','Card slot','Network','Internal','ratio'], axis=1)

In [217]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2651 entries, 0 to 2650
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             2651 non-null   object 
 1   2G                2651 non-null   bool   
 2   3G                2651 non-null   bool   
 3   4G                2651 non-null   bool   
 4   5G                2651 non-null   bool   
 5   Announced         2651 non-null   float64
 6   Status            2651 non-null   object 
 7   Weight            2651 non-null   float64
 8   Length            2651 non-null   object 
 9   Width             2651 non-null   object 
 10  Diameter          2651 non-null   object 
 11  SIM               2651 non-null   object 
 12  Display Type      2651 non-null   object 
 13  Display Size      2651 non-null   float64
 14  ppi               2651 non-null   float64
 15  body ratio        2651 non-null   float64
 16  OS                2651 non-null   object 


In [218]:
data.Status.unique()

array(['Available', 'Discontinued', 'Coming'], dtype=object)

In [219]:
data.Status.value_counts()

Status
Discontinued    1381
Available       1267
Coming             3
Name: count, dtype: int64

In [220]:
data['Status'] = data['Status'].replace('Coming', 'Available')

In [221]:
data.Status.unique()

array(['Available', 'Discontinued'], dtype=object)

In [222]:
data.Status.value_counts()

Status
Discontinued    1381
Available       1270
Name: count, dtype: int64

In [223]:
data['Length'] = data['Length'].str.extract('(\d+\.?\d*)').astype(float)

In [224]:
data['Width']=data['Width'].str.extract('(\d+\.?\d*)').astype(float)

In [225]:
def convert_to_average(diameter_str):
    # Check if diameter_str contains a dash, indicating a range
    if '-' in diameter_str:
        lower, upper = diameter_str.split('-')
        # Calculate the average of the lower and upper bounds
        average = (float(lower) + float(upper)) / 2
        return average
    else:
        # If not a range, just convert the string to float
        return float(diameter_str)

# Apply the function to the 'Diameter' column
data['Diameter'] = data['Diameter'].str.split().str[0].apply(convert_to_average)



In [226]:
data['Display Type'].value_counts()

Display Type
IPS LCD                                       1044
TFT                                            240
Super AMOLED                                   209
TFT, 256K colors                                92
AMOLED                                          58
                                              ... 
IPS LCD, 1B colors, 120Hz                        1
AMOLED, 1B colors, 120Hz, 1300 nits (peak)       1
AMOLED, 770 nits (HBM)                           1
AMOLED, 1B colors, 144Hz, 1000 nits (peak)       1
Super Clear LCD                                  1
Name: count, Length: 317, dtype: int64

In [227]:
def categorize_display_type(display_type):
    display_type = display_type.lower()
    if 'retina' in display_type:
        return 'Retina'# Lowercase to standardize the input
    elif 'oled' in display_type or 'amoled' in display_type:
        return 'OLED'
    elif 'lcd' in display_type or 'tft' in display_type or 'ips' in display_type:
        return 'LCD'
   
    else:
        return 'Other'

# Apply the categorization function to the 'Display_type' column
data['Display Type'] = data['Display Type'].apply(categorize_display_type)

# Now you can inspect the distribution of your new categories
data['Display Type'].value_counts()


Display Type
LCD       1952
OLED       661
Retina      32
Other        6
Name: count, dtype: int64

In [228]:
data['OS'] = data['OS'].str.split().str[0]
os_mapping = {
    'MeeGo': 'Other',
    'Windows': 'Other',
    'Firefox': 'Other',
    'Linux': 'Other',
    'Mobile': 'Other',
    'FP1': 'Other',
    'Tizen': 'Other',
    'Anna': 'Other',
    'Phone':'Phone',
    'HarmonyOS': 'HarmonyOS',
    'Android': 'Android',
    'iOS': 'iOS',
    'Symbian': 'Symbian',
    'Belle': 'Belle',
    'KaiOS': 'KaiOS',
    'OS': 'OS',
    'iPadOS': 'iPadOS',
    'platform': 'platform',
    'EMUI': 'EMUI',
}
data['OS'] = data['OS'].map(os_mapping)

In [229]:
sensors_of_interest = ['accelerometer', 'gyro', 'proximity', 'barometer', 'fingerprint']

for sensor in sensors_of_interest:
    data[f'sensor_{sensor}'] = 0
def update_sensor_flags(row):
    sensor_data = str(row['Sensors']).lower() 
    for sensor in sensors_of_interest:
        if sensor in sensor_data:
            row[f'sensor_{sensor}'] = 1
    return row

data = data.apply(update_sensor_flags, axis=1)
data = data.drop('Sensors', axis=1)

In [230]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2651 entries, 0 to 2650
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   brand                 2651 non-null   object 
 1   2G                    2651 non-null   bool   
 2   3G                    2651 non-null   bool   
 3   4G                    2651 non-null   bool   
 4   5G                    2651 non-null   bool   
 5   Announced             2651 non-null   float64
 6   Status                2651 non-null   object 
 7   Weight                2651 non-null   float64
 8   Length                2651 non-null   float64
 9   Width                 2651 non-null   float64
 10  Diameter              2651 non-null   float64
 11  SIM                   2651 non-null   object 
 12  Display Type          2651 non-null   object 
 13  Display Size          2651 non-null   float64
 14  ppi                   2651 non-null   float64
 15  body ratio           

In [231]:
data['Chipset'] = data['Chipset'].replace('5', np.nan)
data['Chipset'].fillna('Unknown', inplace=True)
def categorize_chipset(chipset):
    chipset = chipset.lower()
    if 'snapdragon' in chipset or 'qualcomm' in chipset:
        return 'Qualcomm'
    elif 'mediatek' in chipset or 'mt' in chipset:
        return 'MediaTek'
    elif 'apple' in chipset:
        return 'Apple'
    elif 'exynos' in chipset:
        return 'Samsung Exynos'
    elif 'intel' in chipset:
        return 'Intel'
    elif 'nvidia' in chipset:
        return 'Nvidia'
    else:
        return 'Other'
data['Chipset Manufacturer'] = data['Chipset'].apply(categorize_chipset)
data.drop('Chipset', axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Chipset'].fillna('Unknown', inplace=True)


In [232]:
data.groupby('Announced')['CPU'].count()

Announced
2010.0      2
2011.0     46
2012.0    143
2013.0    179
2014.0    294
2015.0    238
2016.0    199
2017.0    159
2018.0    190
2019.0    207
2020.0    236
2021.0    186
2022.0    176
2023.0    103
2024.0     14
Name: CPU, dtype: int64

In [233]:
data.groupby('Announced')['CPU'].apply(lambda x: x.mode()[0])

Announced
2010.0    2.0
2011.0    2.0
2012.0    2.0
2013.0    2.0
2014.0    4.0
2015.0    4.0
2016.0    4.0
2017.0    8.0
2018.0    8.0
2019.0    8.0
2020.0    8.0
2021.0    8.0
2022.0    8.0
2023.0    8.0
2024.0    8.0
Name: CPU, dtype: float64

In [234]:
most_frequent_cpu_by_year = data.groupby('Announced')['CPU'].agg(lambda x: pd.Series.mode(x)[0])
for year, cpu in most_frequent_cpu_by_year.items():
    data.loc[(data['Announced'] == year) & (data['CPU'].isnull()), 'CPU'] = cpu


In [235]:
data.head()

Unnamed: 0,brand,2G,3G,4G,5G,Announced,Status,Weight,Length,Width,...,pixel,GPU,RAM,Storage,sensor_accelerometer,sensor_gyro,sensor_proximity,sensor_barometer,sensor_fingerprint,Chipset Manufacturer
0,alcatel,True,True,True,False,2022.0,Available,172.0,146.7,71.9,...,1036800.0,PowerVR GE8300,2.0,32.0,1,0,0,0,0,MediaTek
1,alcatel,True,True,True,False,2021.0,Available,190.0,156.4,74.8,...,1123200.0,IMG8322,2.0,32.0,1,0,1,0,1,Other
2,alcatel,True,True,True,False,2021.0,Available,134.0,137.6,65.7,...,460800.0,PowerVR GE8100,1.0,8.0,1,0,1,0,0,MediaTek
3,alcatel,True,True,True,False,2021.0,Available,194.0,165.6,75.6,...,1152000.0,PowerVR GE8320,4.0,64.0,1,0,1,0,1,MediaTek
4,alcatel,True,True,True,False,2021.0,Available,190.0,165.6,75.6,...,1152000.0,PowerVR GE8320,3.0,32.0,1,0,1,0,1,MediaTek


In [236]:
bool_columns = data.select_dtypes(include=['bool']).columns
data[bool_columns] = data[bool_columns].astype(int)


In [238]:
columns_to_encode = ['brand', 'SIM', 'Display Type', 'OS', 'Chipset Manufacturer']
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data[columns_to_encode]).toarray()
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))
data = pd.concat([data.drop(columns=columns_to_encode), encoded_df], axis=1)

In [239]:
columns_to_encode = ['Announced', 'Status', 'CPU','RAM','Storage']
for column in columns_to_encode:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

data.head()

Unnamed: 0,2G,3G,4G,5G,Announced,Status,Weight,Length,Width,Diameter,...,OS_iOS,OS_iPadOS,OS_platform,Chipset Manufacturer_Apple,Chipset Manufacturer_Intel,Chipset Manufacturer_MediaTek,Chipset Manufacturer_Nvidia,Chipset Manufacturer_Other,Chipset Manufacturer_Qualcomm,Chipset Manufacturer_Samsung Exynos
0,1,1,1,0,12,0,172.0,146.7,71.9,10.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,1,1,0,11,0,190.0,156.4,74.8,9.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,1,1,0,11,0,134.0,137.6,65.7,9.8,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1,1,1,0,11,0,194.0,165.6,75.6,8.7,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1,1,1,0,11,0,190.0,165.6,75.6,8.8,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [240]:
le = LabelEncoder()
data['GPU'] = le.fit_transform(data['GPU'])

In [241]:
from sklearn.ensemble import IsolationForest
if_ = IsolationForest()
data['is_outlier']= if_.fit_predict(data)
data['is_outlier'].value_counts()

is_outlier
 1    2493
-1     158
Name: count, dtype: int64

In [242]:
data= data.loc[data['is_outlier'] == 1]

In [243]:
data.head()

Unnamed: 0,2G,3G,4G,5G,Announced,Status,Weight,Length,Width,Diameter,...,OS_iPadOS,OS_platform,Chipset Manufacturer_Apple,Chipset Manufacturer_Intel,Chipset Manufacturer_MediaTek,Chipset Manufacturer_Nvidia,Chipset Manufacturer_Other,Chipset Manufacturer_Qualcomm,Chipset Manufacturer_Samsung Exynos,is_outlier
0,1,1,1,0,12,0,172.0,146.7,71.9,10.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,1,1,1,0,11,0,190.0,156.4,74.8,9.7,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2,1,1,1,0,11,0,134.0,137.6,65.7,9.8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
3,1,1,1,0,11,0,194.0,165.6,75.6,8.7,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
4,1,1,1,0,11,0,190.0,165.6,75.6,8.8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [244]:
X = data.drop('Price', axis=1) 
y = data['Price']     

In [245]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

LinearRegression

In [246]:
LR = LinearRegression()
LR.fit(X_train , y_train)
y_pred = LR.predict(X_test)
y_pred

array([380.6118488 , 500.36115433, 420.00358879, 113.55433648,
       143.60273111,  99.39918944, 170.35075195,   2.65041353,
       214.06687433,  -2.81636977, 209.63400799, 158.6507366 ,
       209.94723263, 397.39452175, 189.15130713, 228.1201017 ,
       104.1057403 , 536.38613628, 155.17431921, 119.19648415,
       236.29528171, 296.38914172, 275.60969405, 243.85416785,
        97.35935841,  96.05832346, 106.56041768, 146.31683173,
        85.94787636,  75.91155105, 124.62890426,  98.76522019,
        96.99648358, 184.14331694, 324.54790751, 189.18506279,
       252.06964475, 534.18835002, 102.17905727, 157.32543627,
       276.52693361, 268.17874258,  91.45170522, 246.06159864,
       185.83470183, 244.92884887, 140.75324346, 283.40022914,
       150.4528278 , 137.8597405 , 200.16833526, 361.38677237,
       338.57650831, 219.62812275, 221.18976938, 276.94183681,
        11.29378098,  63.42594279, 179.25272981, 497.64862089,
       207.03174482,  49.83907042, 248.51102359,  77.87

In [247]:
LR.score(X_test, y_test)

0.5085007821047098

RANSACRegressor

In [248]:
model = RANSACRegressor()
model.fit(X_train , y_train)
y_pred = model.predict(X_test)
y_pred

array([ 364.78096541,  457.22376432,  323.84406773,   98.34495457,
        193.47690179,  135.21604747,  186.99659949, -131.61005552,
        113.10648041,    5.70256726,   38.46360893,  159.41105424,
        202.03054537,  410.2632185 ,   96.07695523,  200.08051321,
        124.48342917,  442.66936235,  183.61571188,  163.88215839,
        198.4969249 ,  354.88809735,  318.56198681,  243.73027344,
        117.21611678,   50.3974    ,  140.98528914,   67.54560444,
        116.23793918,  127.93476308,  169.84500822,  101.06813706,
         91.0260403 ,  207.91500848,  325.35912466,  194.55929255,
        258.95165106,  328.8281193 ,  140.04314627,  107.7619088 ,
        363.77055164,  297.99936166,  112.03026295,  276.50984393,
        190.38139447,  262.10693416,  195.72790249,  298.86038089,
        158.93716598,  134.49775554,  172.96960691,  413.41817919,
        296.18205774,  370.02773071,  236.92011108,  223.31905373,
        -18.99070857,   81.54084199,  238.83960117,  422.40857

In [249]:
model.score(X_test , y_test)

0.3738567841230771

PolynomialRegression

In [250]:
lin = LinearRegression()
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.fit_transform(X_test)
poly.fit(X_poly_train , y_train)
lin.fit(X_poly_train , y_train)


In [254]:
y_pred = lin.predict(X_poly_test)
y_pred

array([ 314.82645444,  957.2716496 ,  350.28275118,  152.93157865,
        111.47150469,  112.62130583,  213.40014446,   94.60877038,
         94.16065679,  182.7430632 ,  283.54823024,  163.23913566,
        166.81179507,  216.49354369,  272.35214624,   76.21345855,
        186.72177103,  875.76287613,  226.96162872,  144.10256018,
        177.29563329,  314.39593268,  385.50119886,  262.14508433,
        164.68186992,   66.31631763,  146.94138138,  166.20247472,
        157.6854917 ,  111.43077377,  202.64958654,  161.242423  ,
         63.24465409,  253.06912919,  403.55835356,  140.77625631,
        198.05227018,  875.77109901,  139.03860631,   98.34521987,
        293.53907913,  338.31084061,   82.39314454,  319.79362757,
        344.70494497,   72.4458319 ,  161.59343343,  207.45985016,
        251.51515317,  200.64529521,  171.9318359 ,  500.71693486,
        363.99131969,  159.54561547,  139.55187883,  202.53466015,
         94.60611703,   85.2668924 ,  225.75928791,  647.39439

In [255]:
lin.score(X_poly_test, y_test)

0.33991700996292296

RandomForestRegressor

In [258]:
randf = RandomForestRegressor()
randf.fit(X_train , y_train)
y_pred = randf.predict(X_test)
y_pred

array([ 356.6       ,  692.4       ,  360.3243    ,  196.7676    ,
        133.75396667,  132.8       ,  189.9       ,  110.11666667,
        133.5867    ,  114.78333333,  179.1478    ,  200.9       ,
        160.34      ,  515.7486    ,  137.93333333,  124.4       ,
        138.4596    ,  951.2092    ,  202.7       ,   90.7992    ,
        247.9       ,  246.3       ,  355.92      ,  167.6       ,
        167.3995    ,  115.3865    ,  140.58333333,  110.775     ,
        103.8528    ,   94.65      ,  155.13333333,  101.025     ,
        116.45      ,  200.09051667,  402.6452    ,  162.9       ,
        198.83216667,  798.55096667,  137.3       ,  179.2       ,
        322.03625   ,  309.        ,   72.69305   ,  249.9       ,
        202.2       ,  225.3       ,  208.1504    ,  190.65      ,
        197.726925  ,  171.79740833,  219.4       ,  302.11033333,
        320.6       ,  214.46274286,  133.6       ,  308.92256667,
        102.9       ,   95.0468    ,  141.18333333,  463.2435 

In [260]:
randf.score(X_test , y_test)

0.6807236135239307

In [261]:
#Number of trees
n_estimator = [int(x) for x in np.linspace(start = 10 ,stop = 80, num = 10)]
#Number of features to consider at every split
max_features = ['log2' , 'sqrt']
#Maximum samples required
max_depth =  [int(x) for x in np.linspace(start = 10 ,stop = 100, num = 5)]
#Minimum samples required to split the node
min_samples_split = [int(x) for x in np.linspace(start = 5 ,stop = 60, num = 5)]
#Minimum number of samples required at each leaf
min_samples_leaf = [int(x) for x in np.linspace(start = 1 ,stop = 10, num = 2)]
#Method of selecting samples for training each tree
bootstrap = [True , False]

In [262]:
#creating the param grid
param_grid = {
    'n_estimators': n_estimator,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap':bootstrap
}


In [181]:
rf_Model2 = RandomForestRegressor()

In [263]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator= rf_Model2 , param_grid= param_grid , cv=3 , verbose=2 , n_jobs=4)

In [264]:
rf_Grid.fit(X_poly_train,y_train)

Fitting 3 folds for each of 2000 candidates, totalling 6000 fits


In [265]:
rf_Grid.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 64}

In [271]:
randf3 = RandomForestRegressor(bootstrap=True, max_depth=100 , max_features='sqrt',min_samples_leaf=1,min_samples_split= 5,n_estimators=64)
randf3.fit(X_train , y_train)
y_pred = randf3.predict(X_test)
y_pred

array([ 368.54251364,  663.01998375,  387.34098971,  192.81837054,
        131.36073189,  134.50229415,  212.96328534,   92.4311756 ,
        159.31061502,  119.41012401,  198.85570517,  183.80964782,
        168.23978795,  524.31871695,  157.67404514,  125.60894097,
        122.39918942,  899.94378323,  193.4828869 ,   99.42869544,
        244.93092758,  243.46074108,  304.35763889,  174.33991139,
        166.80449672,  105.54480804,  144.36464534,  110.19258433,
        106.721735  ,   93.16468254,  175.51822917,  103.30722966,
        132.71387987,  220.26841977,  352.37394593,  160.31795635,
        200.98757329,  849.88913264,  139.20864335,  161.11959641,
        358.85628094,  306.34596726,   73.47353981,  253.84380952,
        229.37686012,  264.23785962,  190.05764002,  172.12146577,
        216.74243256,  166.31531696,  234.77610367,  298.13551708,
        295.51829117,  243.49355159,  144.86991567,  327.30127976,
        109.06863839,   92.90336421,  153.05549355,  458.47211

In [272]:
randf3.score(X_test , y_test)

0.718692646781053

In [273]:
lasso = Lasso()
lasso.fit(X_poly_train,y_train)
y_pred = lasso.predict(X_poly_test)
y_pred

  model = cd_fast.enet_coordinate_descent(


array([ 407.78699312,  765.23506429,  268.08725852,  196.07727895,
        187.75242382,  157.70874439,  132.26848739,   68.36957488,
        134.44617827,   84.32494014,  117.74780557,  169.2684389 ,
        170.14864229,  571.14510995,  120.57352189,  119.65960247,
        103.15316125,  808.04990501,  165.35457018,  140.40521906,
        158.58678033,  257.37713244,  270.67929217,  227.99354474,
        194.2148091 ,   99.78954933,  126.46375966,  175.21653044,
         59.75152091,   99.0390936 ,  216.21448374,   66.13266698,
        136.0416934 ,  174.55863759,  307.06016751,  132.94573005,
        212.57745281,  806.54473986,  149.71700372,  110.73402362,
        326.3424572 ,  294.41907497,   73.39039512,  256.21496617,
        220.2415578 ,  140.46132486,  184.04179089,  206.59299603,
        150.7812084 ,  143.10953155,  161.1723008 ,  359.7139428 ,
        444.9069368 ,  243.42081231,  154.69898344,  205.16788794,
         78.57086435,  106.34076017,  180.29302711,  702.98320

In [274]:
lasso.score(X_poly_test , y_test)

0.637955733231679

In [275]:
gbr = GradientBoostingRegressor()

In [278]:
gbr.fit(X_train , y_train)

In [279]:
gbr.score(X_test , y_test)

0.6650827531277617

In [280]:
#Number of trees
n_estimator = [int(x) for x in np.linspace(start = 10 ,stop = 80, num = 10)]
#Number of features to consider at every split
a = [0.1 , 0.01 , 0.001 , 0.0001 , 0.00001]
#Maximum samples required
max_depth =  [int(x) for x in np.linspace(start = 10 ,stop = 200, num = 10)]


In [281]:
param_grid = {
    'n_estimators': n_estimator,
    'learning_rate': a,
    'max_depth': max_depth,
    
}

In [200]:
gdr2 = GradientBoostingRegressor()

In [287]:
from sklearn.model_selection import GridSearchCV
gdr_Grid = GridSearchCV(estimator= gdr2 , param_grid= param_grid , cv=5 , verbose=2 , n_jobs=4)

In [288]:
gdr_Grid.fit(X_train,y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


KeyboardInterrupt: 

In [285]:
gdr_Grid.best_params_

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 48}

In [286]:
gdr_Grid.best_score_

0.5969471240181089