<h1 style="color:orange; text-align:left;size:10">User Behavior Analysis for Optimizing Engagement on Social Media Platforms </h1>


In [None]:
import pandas as pd

# Location of the CSV file that holds the user activity records.
file_path = "Social_Media_User_Activity_Dataset.csv"

# Read the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to check the structure and contents.
df.head(5)


In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Use the ggplot style for the figures drawn from here on.
plt.style.use("ggplot")

# Activity counters analysed as engagement metrics.
engagement_metrics = ["Posts_Created", "Comments_Made", "Messages_Sent", "Likes_Given", "Shares_Made"]

# Total of each metric over all rows of the dataset.
engagement_data = df[engagement_metrics].sum()

# Bar chart of the totals, one bar per metric.
plt.figure(figsize=(10, 5))
sns.barplot(x=engagement_data.index, y=engagement_data.values)
plt.xlabel("Engagement Metrics")
plt.ylabel("Total Count")
plt.title("Overall User Engagement on Social Media Platform")
plt.xticks(rotation=30)
plt.show()

# Descriptive statistics per metric; computed once and reused for the printout
# instead of calling describe() a second time.
df_engagement_stats = df[engagement_metrics].describe()
print("Engagement Statistics:\n", df_engagement_stats)

In [None]:
# Count the null entries in every column.
missing_values = df.isna().sum()
# Descriptive statistics of the numeric columns, used as a first look for extreme values.
summary_statistics = df.describe()
# Show both results as the cell output.
(missing_values, summary_statistics)

In [None]:
# Bar chart: average engagement per content type.
# Mean of each engagement metric within every preferred content type.
avg_engagement_by_content = df.groupby("Preferred_Content_Type")[
    ["Posts_Created", "Comments_Made", "Messages_Sent", "Likes_Given", "Shares_Made"]
].mean()

# Create the figure once and pass its axes to pandas. Calling plt.figure() before
# DataFrame.plot() would leave an extra, empty figure because plot() opens its own
# figure when no axes are supplied.
fig, ax = plt.subplots(figsize=(12, 6))
avg_engagement_by_content.plot(kind="bar", ax=ax)
ax.set_xlabel("Preferred Content Type")
ax.set_ylabel("Average Engagement Metrics")
ax.set_title("Average Engagement Metrics by Preferred Content Type")
plt.xticks(rotation=30)
ax.legend(title="Engagement Metrics")
plt.show()


In [None]:

# Grouped bar chart of the mean activity counts for each user category.
activity_metrics = ['Posts_Created', 'Comments_Made', 'Messages_Sent', 'Likes_Given', 'Shares_Made']

# One row per user category, one column per activity metric.
df_grouped = df.groupby('User_Category')[activity_metrics].mean()

df_grouped.plot.bar(figsize=(10, 6), edgecolor='black')

# Titles and axis labels.
plt.title("Average User Activity Metrics by User Category")
plt.xlabel("User Category")
plt.ylabel("Average Count")

# Tilt the category names and add a dashed horizontal grid.
plt.xticks(rotation=45)
plt.legend(title="Activity Type")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Reload necessary libraries after execution state reset
import pandas as pd
import numpy as np
# Remove duplicate rows.
df = df.drop_duplicates()

# Replace missing values (if any).
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas deprecates and which does
# not update the frame when Copy-on-Write is enabled.
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].median())  # median for numeric data

for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; skip such
    # columns instead of failing on modes[0].
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])  # most frequent value for categorical data

# Numeric columns that are checked for outliers.
columns_to_check = ["Posts_Created", "Comments_Made", "Messages_Sent",
                    "Likes_Given", "Shares_Made", "Time_Spent_per_Day (minutes)"]

# Detect outliers with the IQR (interquartile range) method.
Q1 = df[columns_to_check].quantile(0.25)
Q3 = df[columns_to_check].quantile(0.75)
IQR = Q3 - Q1

# Drop rows in which any checked column lies more than 1.5 * IQR outside [Q1, Q3].
df_cleaned = df[~((df[columns_to_check] < (Q1 - 1.5 * IQR)) | (df[columns_to_check] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Show a sample of the cleaned data.
df_cleaned.head()



In [None]:
# Print the column labels currently present in df.
print(df.columns)


<h1 style="color:orange; text-align:left;size:10">Missing Values</h1>


In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
print("Columns in dataset before processing:\n", df.columns)
# Strip leading/trailing whitespace from the column labels.
df.columns = df.columns.str.strip()

# Missing values: count per column and show them as a heatmap.
missing_values = df.isnull().sum()
print("\nMissing values before imputation:\n", missing_values)
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.show()

# Mean-fill the time-spent column. The filled column is assigned back explicitly
# because fillna(inplace=True) on a column selection is chained assignment, which
# pandas deprecates and which does not update the frame under Copy-on-Write.
if 'Time_Spent_per_Day' in df.columns and 'Time_Spent_per_Day (minutes)' in df.columns:
    # NOTE(review): fills with the mean of the '(minutes)' column; confirm both
    # columns are meant to hold the same quantity.
    df['Time_Spent_per_Day'] = df['Time_Spent_per_Day'].fillna(df['Time_Spent_per_Day (minutes)'].mean())
elif 'Time_Spent_per_Day (minutes)' in df.columns:
    # From here on the column is called 'Time_Spent_per_Day'.
    df = df.rename(columns={'Time_Spent_per_Day (minutes)': 'Time_Spent_per_Day'})
    df['Time_Spent_per_Day'] = df['Time_Spent_per_Day'].fillna(df['Time_Spent_per_Day'].mean())

# Impute any remaining gaps in the two columns from the 5 nearest neighbours.
imputer = KNNImputer(n_neighbors=5)
if 'Time_Spent_per_Day' in df.columns and 'Likes_Given' in df.columns:
    df[['Time_Spent_per_Day', 'Likes_Given']] = imputer.fit_transform(df[['Time_Spent_per_Day', 'Likes_Given']])

# Outliers (IQR): keep rows whose time spent lies within 1.5 * IQR of the quartiles.
if 'Time_Spent_per_Day' in df.columns:
    Q1 = df['Time_Spent_per_Day'].quantile(0.25)
    Q3 = df['Time_Spent_per_Day'].quantile(0.75)
    IQR = Q3 - Q1
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df = df[~((df['Time_Spent_per_Day'] < (Q1 - 1.5 * IQR)) |
              (df['Time_Spent_per_Day'] > (Q3 + 1.5 * IQR)))].copy()

# Scaling: min-max scale the two columns.
scaler = MinMaxScaler()
if 'Time_Spent_per_Day' in df.columns and 'Likes_Given' in df.columns:
    df[['Time_Spent_per_Day', 'Likes_Given']] = scaler.fit_transform(df[['Time_Spent_per_Day', 'Likes_Given']])

print("\n✅ Data processing completed successfully!")
print("Columns in dataset after processing:\n", df.columns)

<h1 style="color:orange; text-align:left;size:10">Confusion Matrix</h1>


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Hard-coded example labels and predictions (ten samples, two classes).
y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0])
y_pred_rf = np.array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0])   # Random Forest
y_pred_svm = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0])  # SVM

# One confusion matrix per set of predictions, both against the same true labels.
cm_rf, cm_svm = (confusion_matrix(y_true, predicted) for predicted in (y_pred_rf, y_pred_svm))
def plot_confusion_matrix(cm, title, labels=("Highly Active", "Moderately Active")):
    """Draw a confusion matrix as an annotated heatmap and show the figure.

    Args:
        cm: Confusion matrix of integer counts (cells are annotated with the
            'd' integer format). Rows are labelled "Actual", columns "Predicted".
        title: Text used as the plot title.
        labels: Tick labels applied to both axes, in the same order as the
            rows/columns of ``cm``. Defaults to the two class names that were
            previously hard-coded, so existing calls behave as before.
            NOTE(review): confirm that this default order matches the class
            order of the matrices passed in.
    """
    tick_labels = list(labels)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.show()
# Draw one heatmap per model: Random Forest first, then SVM.
for matrix, heading in (
    (cm_rf, "Confusion Matrix - Random Forest"),
    (cm_svm, "Confusion Matrix - SVM"),
):
    plot_confusion_matrix(matrix, heading)


In [None]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardize the activity columns used for clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(
    df[['Posts_Created', 'Comments_Made', 'Messages_Sent', 'Likes_Given',
        'Shares_Made', 'Active_Days_per_Week']]
)

# K-Means with three clusters and a fixed seed; store the label of every row.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['KMeans_Cluster'] = kmeans.fit_predict(X_scaled)

# DBSCAN on the same scaled features; store the label of every row.
dbscan = DBSCAN(eps=1.5, min_samples=10)
df['DBSCAN_Cluster'] = dbscan.fit_predict(X_scaled)

# Project the scaled features onto two principal components for plotting
# and keep both components as columns of the dataframe.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df['PCA1'], df['PCA2'] = X_pca[:, 0], X_pca[:, 1]

# One scatter plot per clustering result, drawn in PCA space:
# K-Means first, then DBSCAN.
for cluster_col, palette_name, plot_title in (
    ('KMeans_Cluster', 'viridis', "K-Means Clustering Visualization"),
    ('DBSCAN_Cluster', 'tab10', "DBSCAN Clustering Visualization"),
):
    plt.figure(figsize=(10, 5))
    sns.scatterplot(x=df['PCA1'], y=df['PCA2'], hue=df[cluster_col], palette=palette_name, alpha=0.7)
    plt.title(plot_title)
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend(title="Cluster")
    plt.show()

<h1 style="color:orange; text-align:left;size:10">User Behavior Analysis for Optimizing Engagement on Social Media Platforms </h1>


In [None]:
# Resolve the name of the time-spent column. An earlier cell renames
# 'Time_Spent_per_Day (minutes)' to 'Time_Spent_per_Day', so hard-coding the
# original label raises KeyError once that cell has run.
time_spent_col = ('Time_Spent_per_Day' if 'Time_Spent_per_Day' in df.columns
                  else 'Time_Spent_per_Day (minutes)')
correct_numeric_cols = ['Posts_Created', 'Comments_Made', 'Messages_Sent', 'Likes_Given',
                        'Shares_Made', time_spent_col, 'Active_Days_per_Week']

# Convert every selected column to numeric; values that cannot be parsed become NaN.
df[correct_numeric_cols] = df[correct_numeric_cols].apply(pd.to_numeric, errors='coerce')

# Quartiles and interquartile range per column, for outlier detection.
Q1 = df[correct_numeric_cols].quantile(0.25)
Q3 = df[correct_numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Per-column count of values lying more than 1.5 * IQR outside [Q1, Q3].
outliers = ((df[correct_numeric_cols] < (Q1 - 1.5 * IQR)) | (df[correct_numeric_cols] > (Q3 + 1.5 * IQR))).sum()

# Summary statistics.
summary_stats = df[correct_numeric_cols].describe()

# Histograms of the numeric columns.
df[correct_numeric_cols].hist(bins=20, figsize=(12, 8), grid=False, color='skyblue', edgecolor='black')
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.show()

# Horizontal box plots for spotting outliers visually.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df[correct_numeric_cols], orient='h', palette="coolwarm")
plt.title("Boxplot of Numerical Features")
plt.show()

# Show the statistics and the outlier counts as the cell output.
summary_stats, outliers


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Turn the two categorical columns into integer codes. The same encoder object is
# refitted for each column; User_Category (the target) is encoded last.
label_encoder = LabelEncoder()
for categorical_col in ("Preferred_Content_Type", "User_Category"):
    df[categorical_col] = label_encoder.fit_transform(df[categorical_col])

# Target and feature matrix: everything except the identifier and the target.
y = df["User_Category"]
X = df.drop(["User_ID", "User_Category"], axis=1)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest with 100 trees.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Support vector classifier with a linear kernel.
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

# Accuracy of each model on the held-out part.
rf_acc = accuracy_score(y_test, rf_preds)
svm_acc = accuracy_score(y_test, svm_preds)

# Per-class precision/recall/F1 reports.
rf_report = classification_report(y_test, rf_preds)
svm_report = classification_report(y_test, svm_preds)

# Show all four results as the cell output.
(rf_acc, svm_acc, rf_report, svm_report)





In [None]:
# Work on a copy of the cleaned data so the encoding below leaves df_cleaned untouched.
df_encoded = df_cleaned.copy()

# Columns stored with object dtype are treated as categorical.
categorical_columns = df_encoded.select_dtypes(include=['object']).columns

# Label-encode every categorical column and keep the fitted encoder per column.
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col])
    label_encoders[col] = encoder

# Pairwise correlations, now that every column is numeric.
correlation_matrix = df_encoded.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of User Activity Data (Encoded)")
plt.show()

# Preview the encoded data to verify the conversion.
df_encoded.head()


In [None]:
# Numeric columns included in the analysis.
numeric_cols = ['Posts_Created', 'Comments_Made', 'Messages_Sent', 'Likes_Given',
                'Shares_Made', 'Time_Spent_per_Day', 'Active_Days_per_Week']

# Convert every selected column to numeric; values that cannot be parsed become NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Quartiles and interquartile range per column, for outlier detection.
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Per-column count of values lying more than 1.5 * IQR outside [Q1, Q3].
outliers = ((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).sum()

# Summary statistics.
summary_stats = df[numeric_cols].describe()

# Histograms of the numeric columns. DataFrame.hist() creates its own figure, so
# no plt.figure() call precedes it; that would only leave an extra empty figure.
df[numeric_cols].hist(bins=20, figsize=(12, 8), grid=False, color='skyblue', edgecolor='black')
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.show()

# Horizontal box plots for spotting outliers visually.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df[numeric_cols], orient='h', palette="coolwarm")
plt.title("Boxplot of Numerical Features")
plt.show()

# Show the statistics and the outlier counts as the cell output.
summary_stats, outliers


In [None]:
# Model names and their accuracies expressed as percentages.
models = ["Random Forest", "SVM"]
accuracies = [rf_acc * 100, svm_acc * 100]

# Bar chart with one bar per model.
plt.figure(figsize=(8, 5))
plt.bar(models, accuracies, color=['blue', 'green'])

# Keep the zoomed-in 80 % lower bound when both models are comfortably above it;
# otherwise start a little below the weakest model so its bar stays visible.
# The upper bound leaves room for the value labels drawn above the bars.
lowest_accuracy = min(accuracies)
y_lower = 80 if lowest_accuracy >= 85 else max(0.0, lowest_accuracy - 5)
plt.ylim(y_lower, 103)

plt.ylabel("Accuracy (%)")
plt.title("Model Comparison: Random Forest vs SVM")

# Write each accuracy just above its bar.
for bar_position, accuracy in enumerate(accuracies):
    plt.text(bar_position, accuracy + 0.5, f"{accuracy:.1f}%", ha='center', fontsize=12)
plt.show()


In [None]:
# Reload the dataset from disk, replacing the processed df with the raw data.
df = pd.read_csv(file_path)

# Show the column names. A bare expression in the middle of a cell is evaluated
# and discarded, so it is printed explicitly.
print(df.columns)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# The identifier column is not needed for modeling.
df = df.drop('User_ID', axis=1)

# Label-encode the categorical columns, keeping one fitted encoder per column.
label_encoders = {}
for col in ['Preferred_Content_Type', 'User_Category']:
    le = LabelEncoder()
    label_encoders[col] = le
    df[col] = le.fit_transform(df[col])

# Target (y) and features (X).
y = df['User_Category']
X = df.drop('User_Category', axis=1)

# Stratified split: 80% train, 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardized copies of the features, used by the SVM only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest, trained on the unscaled features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# SVM with an RBF kernel, trained on the standardized features.
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train)
svm_preds = svm_model.predict(X_test_scaled)

# Accuracy on the test part.
rf_accuracy = accuracy_score(y_test, rf_preds)
svm_accuracy = accuracy_score(y_test, svm_preds)

# Classification reports labelled with the original category names.
class_names = label_encoders['User_Category'].classes_
rf_report = classification_report(y_test, rf_preds, target_names=class_names)
svm_report = classification_report(y_test, svm_preds, target_names=class_names)

# Show all four results as the cell output.
(rf_accuracy, svm_accuracy, rf_report, svm_report)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of each model on the test set.
rf_cm = confusion_matrix(y_test, rf_preds)
svm_cm = confusion_matrix(y_test, svm_preds)

# Draw both matrices the same way: Random Forest first, then SVM.
# The tick labels are the original category names stored in the fitted encoder.
for matrix, heading in (
    (rf_cm, "Confusion Matrix - Random Forest"),
    (svm_cm, "Confusion Matrix - SVM"),
):
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=label_encoders['User_Category'].classes_,
        yticklabels=label_encoders['User_Category'].classes_,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(heading)
    plt.show()
