In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler

In [3]:
online_news_raw_data = os.path.join(".", "Database Files", "OnlineNewsPopularity.csv")

In [4]:
online_news_df = pd.read_csv(online_news_raw_data)
online_news_df.head(10)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505
5,http://mashable.com/2013/01/07/beewi-smart-toys/,731.0,10.0,370.0,0.559889,1.0,0.698198,2.0,2.0,0.0,...,0.136364,0.6,-0.195,-0.4,-0.1,0.642857,0.214286,0.142857,0.214286,855
6,http://mashable.com/2013/01/07/bodymedia-armba...,731.0,8.0,960.0,0.418163,1.0,0.549834,21.0,20.0,20.0,...,0.1,1.0,-0.224479,-0.5,-0.05,0.0,0.0,0.5,0.0,556
7,http://mashable.com/2013/01/07/canon-poweshot-n/,731.0,12.0,989.0,0.433574,1.0,0.572108,20.0,20.0,20.0,...,0.1,1.0,-0.242778,-0.5,-0.05,1.0,0.5,0.5,0.5,891
8,http://mashable.com/2013/01/07/car-of-the-futu...,731.0,11.0,97.0,0.670103,1.0,0.836735,2.0,0.0,0.0,...,0.4,0.8,-0.125,-0.125,-0.125,0.125,0.0,0.375,0.0,3600
9,http://mashable.com/2013/01/07/chuck-hagel-web...,731.0,10.0,231.0,0.636364,1.0,0.797101,4.0,1.0,1.0,...,0.1,0.5,-0.238095,-0.5,-0.1,0.0,0.0,0.5,0.0,710


In [5]:
# Clean data by only keeping columns we want to use
condensed_online_news_df = online_news_df.iloc[:,[3, 50, 53, 60]]
condensed_online_news_df.head(10)

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity,shares
0,219.0,0.378636,-0.35,593
1,255.0,0.286915,-0.11875,711
2,211.0,0.495833,-0.466667,1500
3,531.0,0.385965,-0.369697,1200
4,1072.0,0.411127,-0.220192,505
5,370.0,0.35061,-0.195,855
6,960.0,0.402039,-0.224479,556
7,989.0,0.42772,-0.242778,891
8,97.0,0.566667,-0.125,3600
9,231.0,0.298413,-0.238095,710


In [6]:
# Bucket 'shares' column into categories so we can measure popularity
bins = [0, 500, 1000, 100000, 1000000]
labels = ["Less Shareable", "Somewhat Shareable", "Highly Shareable", "Extremely Shareable"]
condensed_online_news_df["Popularity"] = pd.cut(condensed_online_news_df[" shares"], bins, labels=labels)
condensed_online_news_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity,shares,Popularity
0,219.0,0.378636,-0.35,593,Somewhat Shareable
1,255.0,0.286915,-0.11875,711,Somewhat Shareable
2,211.0,0.495833,-0.466667,1500,Highly Shareable
3,531.0,0.385965,-0.369697,1200,Highly Shareable
4,1072.0,0.411127,-0.220192,505,Somewhat Shareable
5,370.0,0.35061,-0.195,855,Somewhat Shareable
6,960.0,0.402039,-0.224479,556,Somewhat Shareable
7,989.0,0.42772,-0.242778,891,Somewhat Shareable
8,97.0,0.566667,-0.125,3600,Highly Shareable
9,231.0,0.298413,-0.238095,710,Somewhat Shareable


In [15]:
del condensed_online_news_df[" shares"]
condensed_online_news_df.head(10)

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity,Popularity
0,219.0,0.378636,-0.35,Somewhat Shareable
1,255.0,0.286915,-0.11875,Somewhat Shareable
2,211.0,0.495833,-0.466667,Highly Shareable
3,531.0,0.385965,-0.369697,Highly Shareable
4,1072.0,0.411127,-0.220192,Somewhat Shareable
5,370.0,0.35061,-0.195,Somewhat Shareable
6,960.0,0.402039,-0.224479,Somewhat Shareable
7,989.0,0.42772,-0.242778,Somewhat Shareable
8,97.0,0.566667,-0.125,Highly Shareable
9,231.0,0.298413,-0.238095,Somewhat Shareable


## Split the Data into Training and Testing Sets

In [16]:
# Create our features
X = condensed_online_news_df.drop("Popularity", axis=1)

X = pd.get_dummies(X)
# Create our target
y = condensed_online_news_df["Popularity"]
X

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity
0,219.0,0.378636,-0.350000
1,255.0,0.286915,-0.118750
2,211.0,0.495833,-0.466667
3,531.0,0.385965,-0.369697
4,1072.0,0.411127,-0.220192
...,...,...,...
39639,346.0,0.333791,-0.260000
39640,328.0,0.374825,-0.211111
39641,442.0,0.307273,-0.356439
39642,682.0,0.236851,-0.205246


In [17]:
X.describe()

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity
count,39644.0,39644.0,39644.0
mean,546.514731,0.353825,-0.259524
std,471.107508,0.104542,0.127726
min,0.0,0.0,-1.0
25%,246.0,0.306244,-0.328383
50%,409.0,0.358755,-0.253333
75%,716.0,0.411428,-0.186905
max,8474.0,1.0,0.0


In [18]:
# Check the balance of our target values
y.value_counts()

Highly Shareable       27162
Somewhat Shareable     11335
Less Shareable          1089
Extremely Shareable       58
Name: Popularity, dtype: int64

In [19]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

## Balanced Random Forest Classifer

In [20]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=128, random_state=1) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [21]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,5,7,1,1
Actual 1,2076,1693,1683,1339
Actual 2,66,64,80,62
Actual 3,790,667,766,611


In [22]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, predictions)

0.2790393448289168

In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                           pre       rec       spe        f1       geo       iba       sup

Extremely Shareable       0.00      0.36      0.70      0.00      0.50      0.24        14
   Highly Shareable       0.70      0.25      0.76      0.37      0.44      0.18      6791
     Less Shareable       0.03      0.29      0.75      0.06      0.47      0.21       272
 Somewhat Shareable       0.30      0.22      0.80      0.25      0.42      0.16      2834

        avg / total       0.56      0.24      0.77      0.33      0.43      0.18      9911



In [24]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.34269224427148587, ' n_tokens_content'),
 (0.33632212801009714, ' avg_positive_polarity'),
 (0.32098562771841693, ' avg_negative_polarity')]