In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [None]:

# Seed both random number generators used below so that a fresh
# "Restart Kernel -> Run All" reproduces exactly the same synthetic dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the categories a post can belong to
categories = ['Food', 'Travel', 'Fashion', 'Fitness', 'Music', 'Culture', 'Family', 'Health']

# Number of data points (one row per consecutive calendar day)
n = 500

# Create the data dictionary:
#   Date     - n consecutive days starting 2021-01-01
#   Category - one category per row, picked with the stdlib `random` module
#   Likes    - integers drawn from [0, 10000); the upper bound is exclusive
data = {
    'Date': pd.date_range('2021-01-01', periods=n),
    'Category': [random.choice(categories) for _ in range(n)],
    'Likes': np.random.randint(0, 10000, size=n)
}

# Create a pandas DataFrame from the data
df = pd.DataFrame(data)


In [None]:
# Display the first few rows of the DataFrame
print(df.head())

# Get some basic statistics about the numeric/datetime columns
print(df.describe())

# Check the data types and non-null counts of each column.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# Histogram (with a KDE overlay) showing how like counts are distributed
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Likes', kde=True, ax=ax)
ax.set_title('Distribution of Likes')
ax.set_xlabel('Likes')
ax.set_ylabel('Frequency')
plt.show()

# Box plot comparing the spread of likes across categories
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Category', y='Likes', ax=ax)
ax.set_title('Likes per Category')
# Tilt the category labels so they do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Line plot of like counts against date to look for trends over time
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Date'], df['Likes'])
ax.set(title='Likes Over Time', xlabel='Date', ylabel='Likes')
plt.show()


In [None]:
# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Drop rows with missing values.
# Reassign instead of mutating in place so re-running the cell is safe.
df = df.dropna()

# Report how many rows are exact duplicates of an earlier row
print(f"Number of duplicate rows: {df.duplicated().sum()}")

# Drop duplicate rows (keeps the first occurrence of each)
df = df.drop_duplicates()

# Check data types and convert if needed
print(df.dtypes)


# Outlier detection and handling using the 1.5 * IQR rule:
# keep only rows whose Likes fall within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1, Q3 = df['Likes'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Boolean mask of rows inside the fences (both ends inclusive)
within_fences = df['Likes'].between(lower_bound, upper_bound)
df = df.loc[within_fences]

# Display the cleaned DataFrame
print(df.head())