In [None]:
'''
Project Overview: Instagram Stories Daily User Creation Patterns

Goal: Understand the distribution of stories created by users each day.
The project analyzes user storytelling behavior to optimize engagement strategies.

'''

In [2]:
# Importing necessary libraries for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the raw stories dataset from its CSV file into a DataFrame
stories_data = pd.read_csv(
    r'D:\Data Journey\Python-Summer-Party\DataSets\stories_data.csv'
)

# Quick look at the data: the first 10 rows, the column names, and the (rows, columns) shape
print(stories_data.head(n=10))
print(stories_data.columns.tolist())
print('Number of rows and columns is:', stories_data.shape)

    user_id  story_date  story_count
0  user_001    7/3/2024          3.0
1  user_001    7/3/2024          3.0
2  user_001   8/15/2024          5.0
3  user_001   9/10/2024          0.0
4  user_001   10/5/2024         20.0
5  user_001   7/15/2024          2.0
6  user_002    7/3/2024          4.0
7  user_002    7/4/2024          3.0
8  user_002         NaN          6.0
9  user_002  12/25/2024          1.0
['user_id', 'story_date', 'story_count']
Number of rows and columns is: (58, 3)


In [5]:
# Question One
# Clean the story_date column: parse it to datetime and drop the rows whose date cannot be used.
# errors='coerce' turns values that cannot be parsed (e.g. "invalid_date") into NaT (Not a Time);
# missing dates become NaT as well, so the dropna step removes both kinds of row.
# The parsed column is attached with .assign inside a single chain instead of being written
# in place: when this cell is re-run, stories_data is already the result of a dropna, and
# writing a column onto such a frame can raise pandas' SettingWithCopyWarning.
stories_data = (
    stories_data
    .assign(story_date=pd.to_datetime(stories_data['story_date'], errors='coerce'))
    .dropna(subset=['story_date'])
)
stories_data


Unnamed: 0,user_id,story_date,story_count
0,user_001,2024-07-03,3.0
1,user_001,2024-07-03,3.0
2,user_001,2024-08-15,5.0
3,user_001,2024-09-10,0.0
4,user_001,2024-10-05,20.0
5,user_001,2024-07-15,2.0
6,user_002,2024-07-03,4.0
7,user_002,2024-07-04,3.0
9,user_002,2024-12-25,1.0
10,user_002,2025-01-15,7.0


In [10]:
# Question Two
# 25th, 50th and 75th percentiles of the story_count values.
# Missing counts are removed first; a NaN in the input would turn every percentile into NaN.
# NOTE(review): the percentiles are taken over the rows of stories_data as they are; this cell
# does no grouping by user_id / story_date — confirm that each row represents one user-day.
cleaned_story_counts = stories_data['story_count'].dropna()

# One vectorised call returns the three cut points in the order requested
p25, p50, p75 = np.percentile(cleaned_story_counts, [25, 50, 75])  # p50 is the median

print("25th percentile:", p25)
print("50th percentile (median):", p50)
print("75th percentile:", p75)


25th percentile: 3.0
50th percentile (median): 4.0
75th percentile: 8.75


In [17]:
# Question Three
# Share of users that have at least one row with story_count above 10.
# Flag every row whose count exceeds 10, then collapse the flags per user:
# a user is True as soon as any one of their rows is flagged.
users_with_more_than_10 = (
    stories_data["story_count"]
    .gt(10)
    .groupby(stories_data["user_id"])
    .any()
)

# The grouped result holds one entry per user, so its length is the number of distinct users
total_users = len(users_with_more_than_10)

# Summing booleans counts the True entries, i.e. the users that met the condition
num_users_with_more_than_10 = users_with_more_than_10.sum()

# Scaled by 100, so the value printed below is a percentage
proportion = num_users_with_more_than_10 / total_users * 100

print("Proportion of users with >10 stories on any day:", proportion)



Proportion of users with >10 stories on any day: 36.36363636363637
