# In [None]:
# Data Analysis Project

# Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to all plots created afterwards
sns.set(style="whitegrid")

# Read the source CSV into the DataFrame used by the rest of the script
csv_path = 'data/your_data.csv'  # Adjust path if needed
data = pd.read_csv(csv_path)

# Data Understanding
# info() writes its report to stdout itself, so it is not wrapped in print()
print("Dataset Info:")
data.info()

# Descriptive statistics as returned by describe()
print("\nStatistical Summary:")
summary_stats = data.describe()
print(summary_stats)

# Null count per column, reported only for columns that have at least one null
print("\nMissing Values Count:")
missing_values = data.isna().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])

# Data Cleaning
# 1. Handling Missing Values
# Example: Filling missing values with the median for numerical columns.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on data['column_name']: that form is a chained
# in-place operation on a selected column, which emits a FutureWarning in
# pandas 2.x and leaves `data` unchanged once Copy-on-Write is enabled.
column_median = data['column_name'].median()  # Adjust column_name
data['column_name'] = data['column_name'].fillna(column_median)

# Example: Dropping rows with missing values (if applicable)
# NOTE: this removes every row that still has a missing value in any column.
data.dropna(inplace=True)

# 2. Removing Duplicates
# Count the duplicated rows first so the number can be reported before removal
duplicates = data.duplicated().sum()
print(f'Duplicate rows: {duplicates}')
data.drop_duplicates(inplace=True)

# 3. Outlier Detection (if needed)
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
engagement = data['engagement']  # Adjust column name for engagement
Q1 = engagement.quantile(0.25)
Q3 = engagement.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows that are not flagged as outliers
is_outlier = (engagement < lower_fence) | (engagement > upper_fence)
data = data[~is_outlier]

# Exploratory Data Analysis (EDA)
# Distribution of Engagement: 30-bin histogram with a KDE curve overlaid
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(data['engagement'], bins=30, kde=True)  # Adjust column name
# Label the axes object returned by seaborn (the current axes of the figure above)
hist_ax.set_title('Distribution of Engagement')
hist_ax.set_xlabel('Engagement')
hist_ax.set_ylabel('Frequency')
plt.show()

# Correlation Matrix
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises when the frame contains
# string columns (this script also uses a 'category' column). Selecting the
# numeric columns first works on both older and newer pandas versions.
numeric_data = data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(12, 8))
# annot/fmt print each coefficient in its cell with two decimals
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Comparing Engagement by Category
# One box per category value, showing the spread of engagement within it
plt.figure(figsize=(12, 6))
box_ax = sns.boxplot(data=data, x='category', y='engagement')  # Adjust column names
box_ax.set_title('Engagement by Category')
box_ax.set_xlabel('Category')
box_ax.set_ylabel('Engagement')
# Rotate the category tick labels by 45 degrees
plt.xticks(rotation=45)
plt.show()

# Summary of Findings (You can print this out)
# NOTE: this is fixed template text; nothing in it is computed from `data`.
_finding_lines = (
    '1. The distribution of engagement is positively skewed, indicating that a few posts have significantly high engagement.',
    '2. High engagement is observed in categories such as "X" and "Y", which suggests that specific content types perform better.',
    '3. There are strong correlations between engagement and metrics such as likes, shares, and other relevant features.',
)
# The text starts and ends with a newline, so it prints with blank lines around it
findings_summary = "\n" + "\n".join(_finding_lines) + "\n"

print(findings_summary)

# Recommendations (You can print this out)
# NOTE: this is fixed template text; nothing in it is computed from `data`.
_recommendation_lines = (
    'Based on the findings from the analysis, the following recommendations are made:',
    '- Content Strategy: Focus on creating more content within categories that exhibit higher engagement levels. This may include analyzing successful posts within those categories to inform future content creation.',
    '- Optimal Posting Times: Conduct further analysis to determine the best times and days to post content. Timing can significantly influence engagement rates.',
    '- Engagement Enhancement: Experiment with content types that generate higher likes and shares. Consider leveraging user-generated content, polls, or interactive formats to increase engagement.',
    '- Regular Monitoring: Establish a routine for monitoring engagement metrics. Regular analysis can help quickly identify trends and adapt strategies accordingly.',
)
# The text starts and ends with a newline, so it prints with blank lines around it
recommendations_summary = "\n" + "\n".join(_recommendation_lines) + "\n"

print(recommendations_summary)