In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error

<h2> Feature Selection on OnlineNewsPopularity </h2>

<h3>Exercise</h3>


1. Load the "OnlineNewsPopularity.csv" dataset 
2. Drop the Column which isn't required


In [13]:
# read the csv file from the link provided
# drop the column that is not required from the dataset(url)
df = pd.read_csv("OnlineNewsPopularity.csv", delimiter=', ', engine="python")
df = df.drop(['url'], axis=1)

<h3>Exercise</h3>


1. Scale the data using a appropriate scaler and re-asign the column names after scaling.
2. The function below should return scaled result in the form of DataFrame


In [14]:
# hint: Use MinMaxScaler for scaling
scaler = MinMaxScaler()

In [15]:
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

5. Perform train_test_split

In [16]:
X = df.loc[:, :'abs_title_sentiment_polarity']
y = df['shares']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
y_train = y_train.astype('int64')

6. Write a function which returns the list of k-Best features where k being the number of features required

In [17]:
#use chi2
def get_k_best_features(X, y, k):
    k_best = SelectKBest(score_func=chi2, k=k)
    k_best.fit(X, y)
    return list(X.columns[k_best.get_support()])

selected_features = get_k_best_features(df_scaled, y, k=5)

Print the results

In [18]:
# return the top N features
print(selected_features)

['data_channel_is_entertainment', 'data_channel_is_world', 'weekday_is_saturday', 'is_weekend', 'shares']


<h2> Model selection on Algerian_forest_fires_dataset_UPDATE-1 dataset  </h2>
<h3>Exercise (Hint use Ridge and Lasso to compare the models.)</h3>

<p>Your task is to findout which of the above models is best suited for the given dataset and give reasons in this scenario. </p>
<p>Also, you need to give scenarios which each of these Models work better over the other.</p>


1. Load the dataset

In [19]:
# load the dataset and split into training and test sets
data = pd.read_csv("Algerian_forest_fires_dataset_UPDATE-1.csv", delimiter=',', engine="python")
data = pd.DataFrame(data)

# convert 'Classes' column to numeric (0 for 'not fire' and 1 for 'fire')
data['Classes'] = data['Classes'].str.strip()
data['Classes'] = data['Classes'].replace('not fire', 0)
data['Classes'] = data['Classes'].replace('fire', 1)

print(data.columns)

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes'],
      dtype='object')


In [20]:
df = data.loc[124:]
df = pd.concat([data.loc[:120], data.loc[125:]])
df['Classes'] = df['Classes'].astype(str).str.strip()

In [21]:
le = LabelEncoder() # label encoder
le.fit(df['Classes']) # fit the label encoder
df['Classes'] = le.transform(df['Classes']) # transform the labels

X = df.loc[:,:'Rain'] # Rain may be a good feature to predict fires
y = df['Classes'] # Classes is the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)

* Drop the unnecessary columns and use train_test_split

In [22]:
# Your task is to findout which of the above models is best suited for the given dataset and give reasons in this scenario.
# you need to give scenarios which each of these Models work better over the other
kbest = SelectKBest(score_func=chi2, k=5)
kbest.fit(X_train, y_train)

X_train_kbest = kbest.transform(X_train)
X_test_kbest = kbest.transform(X_test)

rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=5)
rfe.fit(X_train, y_train)

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print("MSE for Lasso: {:.5f}".format(mse_lasso))
print("MSE for Ridge: {:.5f}".format(mse_ridge))

MSE for Lasso: 0.15973
MSE for Ridge: 0.14597


In this scenario, we have built two different feature selection models: a filter method based on correlation coefficient and a wrapper method based on recursive feature elimination with cross-validation. Now, we want to compare Lasso and Ridge regression models to see which one is better suited for this dataset.

Lasso regression is a linear regression model that adds an L1 penalty term to the loss function to encourage sparsity in the coefficients. It can be useful when we have many irrelevant or redundant features that we want to eliminate from the model. Lasso can perform well in situations where we expect only a few important features to have a strong effect on the target variable.

Ridge regression is another linear regression model that adds an L2 penalty term to the loss function. Unlike Lasso, Ridge regression does not necessarily encourage sparsity in the coefficients. It can be useful when we have many important features with small coefficients that we do not want to eliminate completely.

Therefore, in scenarios where we have many irrelevant or redundant features, Lasso regression may perform better than Ridge regression. On the other hand, in scenarios where we have many important features with small coefficients, Ridge regression may be more appropriate. Ultimately, the choice between Lasso and Ridge regression will depend on the specific characteristics of the dataset and the goals of the modeling project.