<a href="https://colab.research.google.com/github/JaafarBK02/CS122/blob/main/social_media_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sbs


# Read the raw social-media posts export and preview the first five rows
# to confirm the columns were parsed as expected.
df = pd.read_csv(filepath_or_buffer='sample_data/social_media_posts.csv')
df.head(n=5)


Unnamed: 0,user_id,post_id,post_content,likes,shares,post_date
0,1001,1,Loving the new iPhone! #Apple #Tech https://ap...,120.0,50.0,2024-02-20
1,1002,2,,95.0,30.0,2024-02-19
2,1003,3,Check out my new sneakers! #Nike #style 👟,,75.0,2024-02-18
3,1004,4,Can’t wait to travel again! ✈️ #wanderlust #tr...,180.0,65.0,2024-02-17
4,1005,5,What a disaster... didn’t work at all! 😞 #fail,50.0,20.0,2024-02-16


### Data Wrangling / Munging

In [5]:
# Missing-value handling.
# Report the per-column null counts first so the effect of the fills below
# can be compared against the starting point.
print("----- Missing values count -----")
print(df.isna().sum())

# likes: impute with the column median.
median_likes = df['likes'].median()
df['likes'] = df['likes'].fillna(value=median_likes)
print(f"\nFilled 'likes' with median value: {median_likes}")

# shares: impute with the column mean. The message shows the mean rounded for
# readability; the value written into the column is the unrounded mean.
mean_shares = df['shares'].mean()
df['shares'] = df['shares'].fillna(value=mean_shares)
print(f"Filled 'shares' with mean value: {round(mean_shares)}")

# post_content: substitute a fixed placeholder string for missing text.
df['post_content'] = df['post_content'].fillna(value='No Text')
print("Filled 'post_content' with 'No Text'")





----- Missing values count -----
user_id         0
post_id         0
post_content    0
likes           0
shares          0
post_date       0
dtype: int64

Filled 'likes' with median value: 210.0
Filled 'shares' with mean value: 87
Filled 'post_content' with 'No Text'


In [9]:
# Let's clean post_content
import re

#helpers
# Precompiled patterns used by clean_post_content.
URL_RE = re.compile(r'https?://\S+|www\.\S+')    # http(s):// links and bare www. links
TAG_RE = re.compile(r'[#@]\w+')                  # hashtags and mentions (Unicode-aware, like '#\w+')
EMO_RE = re.compile(r'[\U00010000-\U0010ffff]')  # code points above the BMP (covers most emojis)
PUNC_RE = re.compile(r'[^\w\s]')                 # anything that is neither a word char nor whitespace


def clean_post_content(text):
    """Return the plain-text portion of a post.

    Removes, in this order: URLs, hashtags/mentions, emojis (code points
    above the Basic Multilingual Plane), and any remaining punctuation or
    symbols. Runs of whitespace left behind by the removals are then
    collapsed to a single space and leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : object
        The raw post content. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but removable content
        was present.
    """
    text = str(text)
    # Order matters: URLs and tags must be removed while their punctuation
    # ('://', '#', '@') is still present, i.e. before the punctuation pass.
    for pattern in (URL_RE, TAG_RE, EMO_RE, PUNC_RE):
        text = pattern.sub('', text)
    # split() with no arguments splits on whitespace runs and drops empty
    # leading/trailing pieces, so this both collapses and strips.
    return ' '.join(text.split())


# Run the cleaner on every post, element by element, and keep the result
# next to the raw text in a new column.
df['clean_post_content'] = df['post_content'].map(clean_post_content)





Unnamed: 0,user_id,post_id,post_content,likes,shares,post_date,clean_post_content
0,1001,1,Loving the new iPhone! #Apple #Tech https://ap...,120.0,50.0,2024-02-20,Loving the new iPhone
1,1002,2,No Text,95.0,30.0,2024-02-19,No Text
2,1003,3,Check out my new sneakers! #Nike #style 👟,210.0,75.0,2024-02-18,Check out my new sneakers
3,1004,4,Can’t wait to travel again! ✈️ #wanderlust #tr...,180.0,65.0,2024-02-17,Cant wait to travel again
4,1005,5,What a disaster... didn’t work at all! 😞 #fail,50.0,20.0,2024-02-16,What a disaster didnt work at all
...,...,...,...,...,...,...,...
495,1116,496,Went to the beach with @seasidelovers 🏖️ #beac...,250.0,120.0,2022-10-13,Went to the beach with
496,1117,497,Had an awesome brunch with @weekend_brunchers ...,220.0,100.0,2022-10-12,Had an awesome brunch with
497,1118,498,Spent the day at the park with @outdoor_explor...,210.0,95.0,2022-10-11,Spent the day at the park with
498,1119,499,Went for a scenic drive with @roadtrip_fan 🚗 #...,230.0,105.0,2022-10-10,Went for a scenic drive with


In [14]:
# Last wrangling step: turn post_date into real datetime values.
# errors='coerce' converts entries that cannot be parsed into NaT instead of raising.

df['post_date'] = df['post_date'].pipe(pd.to_datetime, errors='coerce')

def extract_hashtags(text):
    """Return every hashtag found in ``text`` as a list of strings.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    The leading ``#`` is kept in each result, and tags are returned in order
    of appearance. ``text`` is converted with ``str()`` first, so non-string
    values are accepted; an empty list is returned when nothing matches.
    """
    as_string = str(text)
    return [match.group(0) for match in re.finditer(r'#\w+', as_string)]

# Collect the hashtags of each raw post (not the cleaned text, where they
# have already been removed) into a list-valued column, then display the frame.
df['Hashtags'] = df['post_content'].map(extract_hashtags)
df



Unnamed: 0,user_id,post_id,post_content,likes,shares,post_date,clean_post_content,Hashtags
0,1001,1,Loving the new iPhone! #Apple #Tech https://ap...,120.0,50.0,2024-02-20,Loving the new iPhone,"[#Apple, #Tech]"
1,1002,2,No Text,95.0,30.0,2024-02-19,No Text,[]
2,1003,3,Check out my new sneakers! #Nike #style 👟,210.0,75.0,2024-02-18,Check out my new sneakers,"[#Nike, #style]"
3,1004,4,Can’t wait to travel again! ✈️ #wanderlust #tr...,180.0,65.0,2024-02-17,Cant wait to travel again,"[#wanderlust, #travel]"
4,1005,5,What a disaster... didn’t work at all! 😞 #fail,50.0,20.0,2024-02-16,What a disaster didnt work at all,[#fail]
...,...,...,...,...,...,...,...,...
495,1116,496,Went to the beach with @seasidelovers 🏖️ #beac...,250.0,120.0,2022-10-13,Went to the beach with,"[#beachfun, #sunshine]"
496,1117,497,Had an awesome brunch with @weekend_brunchers ...,220.0,100.0,2022-10-12,Had an awesome brunch with,"[#brunchvibes, #foodie]"
497,1118,498,Spent the day at the park with @outdoor_explor...,210.0,95.0,2022-10-11,Spent the day at the park with,"[#naturelover, #familyfun]"
498,1119,499,Went for a scenic drive with @roadtrip_fan 🚗 #...,230.0,105.0,2022-10-10,Went for a scenic drive with,"[#roadtrip, #adventure]"
