# Predicting Article Popularity Based on Reading Time and User Responses
### We used Linear Regression, a Random Forest Regressor, and a Deep Neural Network to predict the number of claps an article receives, based on features like reading time and user responses.

# Project Objective:
The goal of this project is to predict the popularity of Medium articles by estimating the number of claps they receive.

### 1- Initial Model:
We started by using a Linear Regression model to analyze the relationship between features such as reading time, responses, and the number of claps.
### 2- Improved Model:
Next, we applied a Random Forest Regressor, which provided better performance and higher prediction accuracy.
### 3- Final Model:
Finally, we used a Deep Neural Network (DNN) to further improve the results and leverage its ability to capture complex patterns in the data.

# Project Workflow and Results:
The project involved data preprocessing, model training, and performance evaluation using Mean Absolute Error (MAE). The results demonstrated that the DNN outperformed the other models in prediction accuracy.
### by: kyrollos

# 1- Data Preprocessing - Exploration

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Capture seaborn's current colour palette; later cells pass it as `color=cp`.
cp = sns.color_palette()
# Apply seaborn's 'darkgrid' theme, then matplotlib's 'ggplot' style sheet on top.
sns.set_style('darkgrid')
plt.style.use('ggplot')
# Show every row when displaying DataFrames/Series. The fully qualified key is
# used because a bare 'max_rows' pattern can match more than one pandas option
# (e.g. 'styler.render.max_rows'), which makes set_option raise OptionError.
pd.set_option('display.max_rows', None)

# Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error , r2_score

# Deep Learning


In [None]:
# Load the Medium articles dataset, parsing `date` as day-first datetimes.
data = pd.read_csv(
    '/kaggle/input/medium-articles-dataset/medium_data.csv',
    parse_dates=['date'],
    dayfirst=True,
)

# Order the articles chronologically and renumber the index.
data = data.sort_values('date').reset_index(drop=True)

# Keep only the articles whose clap count lies between the 10th and 95th
# percentiles (bounds inclusive) to trim the extremes of the target.
low, high = data["claps"].quantile([0.1, 0.95])
within_bounds = data["claps"].between(low, high)
data = data[within_bounds]

data.head()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics for the `claps` column.
data["claps"].describe()

In [None]:
# Distribution of clap counts across articles.
claps = data["claps"]

plt.figure(figsize=(8, 4))
plt.hist(claps, color='skyblue', edgecolor='blue')
plt.show()

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
# Number of articles per publication.
data["publication"].value_counts()

In [None]:
# Bar chart of the number of articles per publication, with the count printed
# on top of each bar. The counts are computed once and reused for the labels.
publication_counts = data.publication.value_counts()

plt.figure(figsize=(8, 4))
ax = publication_counts.plot(kind='bar', color=cp)
plt.xticks(rotation=15)

labels = publication_counts.values
plt.bar_label(ax.containers[0], labels, label_type='edge')
plt.show()

In [None]:
# Distribution of article reading times (40 bins) with a KDE curve overlaid.
plt.figure(figsize=(20, 4))
sns.histplot(
    data=data,
    x='reading_time',
    bins=40,
    kde=True,
    color='blue',
    edgecolor="black",
)
plt.show()

In [None]:
# Total claps per publication date. When several articles share a date their
# claps are added together, so the line shows daily totals rather than
# individual article claps.
# `claps` is selected before aggregating so that only this column is summed,
# instead of summing every column of the frame and discarding the rest.
daily_claps = data.groupby('date')['claps'].sum()

plt.figure(figsize=(20, 4))
daily_claps.plot(kind='line')
plt.show()

In [None]:
# Average reading time of the articles published on each date.
# `reading_time` is selected before aggregating: taking the mean of the whole
# frame would also try to average the non-numeric columns, which recent pandas
# versions reject with a TypeError.
daily_reading_time = data.groupby('date')['reading_time'].mean()

plt.figure(figsize=(20, 4))
daily_reading_time.plot(kind='line')
plt.show()

In [None]:
# Per-publication totals of the numeric columns. numeric_only=True keeps the
# text and datetime columns out of the aggregation, where a sum is meaningless
# (and, for datetimes, not supported by pandas).
data.groupby('publication').sum(numeric_only=True)

In [None]:
# Number of articles per publication (same counts as shown earlier).
data.publication.value_counts()

In [None]:
# Total reading time accumulated by each publication. The column is selected
# before aggregating so that only `reading_time` is summed.
data.groupby('publication')['reading_time'].sum()

In [None]:
# Bar chart of the total reading time per publication, with each total printed
# on top of its bar. The aggregation is done once, on the `reading_time` column
# only, and reused for both the bars and their labels.
reading_time_totals = data.groupby('publication')['reading_time'].sum()

plt.figure(figsize=(10, 4))
ax = reading_time_totals.plot(kind='bar', title='Totol Reading Time distribution', color=cp)
plt.xticks(rotation=15)
labels = reading_time_totals.values
plt.bar_label(ax.containers[0], labels=labels, label_type='edge')
plt.show()

In [None]:
# Bar chart of the total number of responses per publication, with each total
# printed on top of its bar. The aggregation is done once, on the `responses`
# column only, and reused for both the bars and their labels.
responses_totals = data.groupby('publication')['responses'].sum()

plt.figure(figsize=(10, 4))
ax = responses_totals.plot(kind='bar', title='Totol Responses distribution', color=cp)
plt.xticks(rotation=15)
labels = responses_totals.values
plt.bar_label(ax.containers[0], labels=labels, label_type='edge')
plt.show()

In [None]:
# Bar chart of the total claps per publication, with each total printed on top
# of its bar. The aggregation is done once, on the `claps` column only, and
# reused for both the bars and their labels.
claps_totals = data.groupby('publication')['claps'].sum()

plt.figure(figsize=(10, 4))
ax = claps_totals.plot(kind='bar', title='Totol Claps distribution', color=cp)
plt.xticks(rotation=15)
labels = claps_totals.values
plt.bar_label(ax.containers[0], labels=labels, label_type='edge')
plt.show()

In [None]:
# Compute each numeric column's correlation with `claps`, strongest first.
# Only numeric columns are kept: corr() cannot handle the text columns, and
# recent pandas versions raise instead of silently dropping them.
correlation = (
    data.select_dtypes(include='number')
    .corr()['claps']
    .sort_values(ascending=False)
)

# Plot the coefficients as a bar chart (`claps` itself appears with value 1).
correlation.plot(kind='bar', color=cp)
plt.title('Feature Correlation with Claps')
plt.ylabel('Correlation Coefficient')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# 2- Feature Selection

In [None]:
# Choose the model inputs and the target:
# 'reading_time' and 'responses' are the predictors, 'claps' is what we predict.
features = ['reading_time', 'responses']
X = data.loc[:, features]   # feature matrix
y = data.loc[:, 'claps']    # target vector

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the size of the training feature matrix and preview its first rows.
print( "X shape :", X_train.shape)
X_train.head(4)

In [None]:
# Report the size of the training target vector and preview its first values.
print( "y shape :", y_train.shape)
y_train.head(4)

# 3- ML Model Training

### Model 1 : LinearRegression 

In [None]:
# Fit an ordinary least-squares linear model on the training split.
model = LinearRegression()
model.fit(X_train,y_train)

In [None]:
# LinearRegression: predict claps for the training rows and preview the first five.
y_pred_training = model.predict(X_train)
y_pred_training[:5]

In [None]:
# LinearRegression: report the error on the training split and on the held-out
# test split. Training metrics alone cannot show how well the model generalises.
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))

r2 = r2_score(y_train, y_pred_training)
print("R² score:", round(r2, 2))

# Same metrics on rows the model has not seen during fitting.
y_pred_test = model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", round(mae_test, 2))
print("Test R² score:", round(r2_score(y_test, y_pred_test), 2))

In [None]:
# Print the fitted linear equation.
intercept = round(model.intercept_)
coefficient =  model.coef_

# The '+' format flag renders each coefficient with its own sign, so a negative
# coefficient prints as "-1.23" rather than "+-1". Two decimals keep
# coefficients smaller than 0.5 from being shown as 0.
print(f" claps = {intercept} {coefficient[0]:+.2f} * reading_time {coefficient[1]:+.2f} * responses ")

### Model 2 : RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): GridSearchCV is imported but not used in the visible cells.
from sklearn.model_selection import GridSearchCV

# Fit a 100-tree random forest on the training split; the fixed seed makes the
# result repeatable. This rebinds `model`, replacing the linear regression.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# RandomForestRegressor: predict claps for the training rows and preview the first five.
y_pred_training = model.predict(X_train)
y_pred_training[:5]

In [None]:
# RandomForestRegressor: report the error on the training split and on the
# held-out test split. A forest can fit its training rows very closely, so the
# training score alone overstates how well it predicts unseen articles.
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))

r2 = r2_score(y_train, y_pred_training)
print("R² score:", round(r2, 2))

# Same metrics on rows the model has not seen during fitting.
y_pred_test = model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", round(mae_test, 2))
print("Test R² score:", round(r2_score(y_test, y_pred_test), 2))

In [None]:
# Training-set predictions against the actual clap counts. Points lying on the
# dashed green diagonal are predicted exactly.
claps_range = [y_train.min(), y_train.max()]

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_train, y=y_pred_training, color="blue")
plt.plot(claps_range, claps_range, '--g')  # ideal line: predicted == actual
plt.title("Actual vs Predicted Claps")
plt.xlabel("Actual Claps")
plt.ylabel("Predicted Claps")
plt.show()

# 4- import libraries  - DNN

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Show the dimensions of every train/test split before building the network.
for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name} shape:', split.shape)

# 5- Deep Learning - Create Model - DNN

In [None]:
# Fully connected regression network over the two input features.
model = Sequential([
    keras.layers.Input(shape=(2,)),    # two input features per sample
    Dense(128, activation='relu'),     # hidden layer 1: 128 units, ReLU
    Dense(64, activation='relu'),      # hidden layer 2: 64 units, ReLU
    Dense(32, activation='relu'),      # hidden layer 3: 32 units, ReLU
    keras.layers.Dropout(0.1),         # dropout (rate 0.1) for regularization
    Dense(1, activation='linear'),     # single linear output for regression
])
    

In [None]:
# Optimise mean absolute error with Adam.
model.compile(loss='mae', optimizer='adam')

# Stop once the validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)

# Train for up to 50 epochs, holding out 20% of the training data for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    callbacks=[early_stopping],
    verbose=1,
)


# 6- improve the model using StandardScaler - DNN

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Learn the mean and standard deviation from the training split only, then
# apply those same statistics to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Start from a freshly initialised copy of the architecture. Calling fit() on
# the existing model would continue from the weights already trained on the
# unscaled inputs rather than training on the scaled data from scratch.
model = keras.models.clone_model(model)
model.compile(optimizer='adam', loss='mae')

# Train for up to 50 epochs with 20% of the training data held out for
# validation; stop early after 10 epochs without improvement and restore the
# best weights.
model.fit(
    X_train_scaled, y_train,
    epochs=50,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)],
    verbose=1
)


# 7- prediction accuracy - DNN

In [None]:
from sklearn.metrics import mean_absolute_error

# Score the network on the scaled held-out test split.
y_test_pred = model.predict(X_test_scaled)

test_mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
print("Test MAE:", round(test_mae, 2))


In [None]:
# Test-set predictions against the actual clap counts. Points lying on the
# dashed green diagonal are predicted exactly.
test_range = [y_test.min(), y_test.max()]

sns.scatterplot(x=y_test, y=y_test_pred.flatten(), color="blue")
plt.plot(test_range, test_range, '--g')  # ideal line: predicted == actual
plt.title("Actual vs Predicted Claps")
plt.xlabel("Actual Claps")
plt.ylabel("Predicted Claps")
plt.show()

In [None]:
# Actual and predicted claps for every test sample, drawn in test-set row order.
actual_claps = y_test.values

plt.figure(figsize=(22, 3))
plt.title("Actual vs Predicted Claps")
plt.plot(actual_claps, label="Actual Claps")
plt.plot(y_test_pred, label="Predicted Claps")
plt.legend()
plt.show()
