# Instagram Data Analysis

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from wordcloud import WordCloud

# Render every figure in this notebook with Plotly's white template.
pio.templates.default = 'plotly_white'

# Location of the dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/instagram-view-data/Instagram data.csv'

# The CSV is decoded as latin-1 (presumably it is not valid UTF-8 -- confirm).
data = pd.read_csv(DATA_PATH, encoding='latin-1')

In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a table instead of a plain-text dump.
data.head()

In [None]:
# Inspect the dtype pandas inferred for each column when the CSV was loaded.
data.dtypes

In [None]:
# Print the frame summary reported by DataFrame.info().
data.info()

In [None]:
# Show the descriptive statistics computed by DataFrame.describe().
data.describe()

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = data.isnull().sum()
print(missing_per_column)

### Distribution of the Impressions Column

 >  Start by exploring the feature that contains data about reach.  
 > The Impressions column contains the data about the reach of an Instagram post.

In [None]:
# Histogram of the per-post 'Impressions' values (10 bins requested).
histogram_options = {
    'x': 'Impressions',
    'nbins': 10,
    'title': 'Distribution of Impressions',
}
fig = px.histogram(data, **histogram_options)
fig.show()

> In the histogram, most posts fall in the 0–5k impressions bin, followed by the 5k–10k bin.

### Number of impressions on each post over time: 

In [None]:
# Impressions of each post, plotted against the row index of `data`.
# The frame has no date column in view, so row order is the only available
# stand-in for time (presumably rows are chronological -- confirm).
fig = px.line(data,
              x=data.index,
              y='Impressions',
              title='Impressions per Post')

# Label the axes explicitly so the figure can be read on its own.
fig.update_layout(xaxis_title='Post index',
                  yaxis_title='Impressions')
fig.show()

### Metrics like Likes, Saves, and Follows from each post over time:

In [None]:
# One line per engagement metric, all sharing the same x axis.
fig = go.Figure()

for metric in ('Likes', 'Saves', 'Follows'):
    fig.add_trace(go.Scatter(x=data.index, y=data[metric], name=metric))

# The x values are the row index of `data`, not dates, so the axis is
# labelled accordingly.
fig.update_layout(title='Engagement Metrics per Post',
                  xaxis_title='Post index',
                  yaxis_title='Count')

fig.show()



###  Distribution of reach from different sources:

In [None]:
reach_sources = ['From Home', 'From Hashtags', 'From Explore', 'From Other']
# Total reach contributed by each source, summed over all posts.
reach_counts = data[reach_sources].sum().tolist()

# The pie is drawn from a small summary frame (one row per source) rather than
# from `data`, whose rows are posts and do not line up with the four totals.
# Named columns also give the hover text meaningful labels.
reach_summary = pd.DataFrame({'Source': reach_sources, 'Reach': reach_counts})

colors = ['#FFB6C1', '#87CEFA', '#90EE90', '#FFDAB9']

fig = px.pie(reach_summary,
             names='Source',
             values='Reach',
             title='Reach from Different Sources',
             color_discrete_sequence=colors)
fig.show()


### Distribution Of Engagement Sources: 

In [None]:
engagement_metrics = ['Saves', 'Comments', 'Shares', 'Likes']
# Total count of each engagement type, summed over all posts.
engagement_counts = data[engagement_metrics].sum().tolist()

# The pie is drawn from a small summary frame (one row per metric) rather than
# from `data`, whose rows are posts and do not line up with the four totals.
# Named columns also give the hover text meaningful labels.
engagement_summary = pd.DataFrame({'Metric': engagement_metrics,
                                   'Count': engagement_counts})

colors = ['#FFB6C1', '#87CEFA', '#90EE90', '#FFDAB9']

fig = px.pie(engagement_summary,
             names='Metric',
             values='Count',
             title='Engagement Sources',
             color_discrete_sequence=colors)
fig.show()

### Relationship between the number of profile visits and follows: 

In [None]:
# One point per post: profile visits on x, follows on y, with an ordinary
# least-squares trend line overlaid.
# NOTE(review): Plotly's 'ols' trendline needs statsmodels installed -- confirm
# it is available in the runtime.
scatter_options = dict(
    x='Profile Visits',
    y='Follows',
    trendline='ols',
    title='Profile Visits vs. Follows',
)
fig = px.scatter(data, **scatter_options)
fig.show()

### Hashtags used in the posts, visualised as a word cloud:

In [None]:
# Join every post's hashtags into one text blob. Posts without hashtags are
# skipped so that a missing value never shows up as the word "nan".
hashtags = ' '.join(data['Hashtags'].dropna().astype(str))

# A fixed seed keeps the word layout identical across re-runs.
wordcloud = WordCloud(random_state=42).generate(hashtags)

# Pass Plotly the rendered pixel array explicitly and hide the pixel
# coordinate axes, which carry no meaning for a word cloud.
fig = px.imshow(wordcloud.to_array(), title='Hashtags Word Cloud')
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()

*TODO: how to generate my own hashtags?*

### Correlation between all the features: 

In [None]:
# Correlate the numeric columns only. The frame also holds text columns
# (e.g. 'Hashtags'), and recent pandas versions raise when asked to correlate
# non-numeric data instead of silently dropping it.
corr_matrix = data.select_dtypes(include='number').corr()

# Diverging colour scale pinned to [-1, 1], the full range of a correlation
# coefficient.
fig = go.Figure(data=go.Heatmap(z=corr_matrix.values,
                                x=corr_matrix.columns,
                                y=corr_matrix.index,
                                colorscale='RdBu',
                                zmin=-1,
                                zmax=1))

fig.update_layout(title='Correlation Matrix',
                  xaxis_title='Features',
                  yaxis_title='Features')

fig.show()

### Distribution of hashtags to see which hashtag is used the most in all the posts: 

In [None]:
# Flatten the whitespace-separated 'Hashtags' column into one list of tags.
# Posts without hashtags are dropped first so that a missing value is not
# counted as a literal "nan" hashtag; the trailing dropna() removes the
# placeholder that explode() emits for posts whose hashtag text is blank.
all_hashtags = (
    data['Hashtags']
    .dropna()
    .astype(str)
    .str.split()
    .explode()
    .dropna()
    .tolist()
)

# Number of occurrences of each hashtag, most frequent first.
hashtags_distribution = (
    pd.Series(all_hashtags, dtype='object')
    .value_counts()
    .rename_axis('Hashtag')
    .reset_index(name='Count')
)

fig = px.bar(hashtags_distribution, x='Hashtag',
             y='Count', title='Distribution of hashtags')

fig.show()


### Distribution of likes and impressions received from the presence of each hashtag on the post: 

In [None]:
# Total likes and impressions of all posts on which each hashtag appears.
# Each post is expanded to one row per hashtag and then summed per hashtag.
# A hashtag repeated within a post is counted once per repetition.
# Posts without hashtags are dropped first so that a missing value is not
# treated as a literal "nan" hashtag. sort=False keeps the hashtags in order
# of first appearance.
hashtag_totals = (
    data[['Hashtags', 'Likes', 'Impressions']]
    .dropna(subset=['Hashtags'])
    .assign(Hashtag=lambda posts: posts['Hashtags'].astype(str).str.split())
    .explode('Hashtag')
    .dropna(subset=['Hashtag'])
    .groupby('Hashtag', sort=False)[['Likes', 'Impressions']]
    .sum()
    .reset_index()
)

# Hashtag -> total mappings, kept under their original names.
hashtag_likes = dict(zip(hashtag_totals['Hashtag'], hashtag_totals['Likes']))
hashtag_impressions = dict(zip(hashtag_totals['Hashtag'],
                               hashtag_totals['Impressions']))

# One two-column frame per metric, ready for plotting.
likes_distribution = hashtag_totals[['Hashtag', 'Likes']].copy()
impressions_distribution = hashtag_totals[['Hashtag', 'Impressions']].copy()

fig_likes = px.bar(likes_distribution, x='Hashtag', y='Likes',
                   title='Likes Distribution for Each Hashtag')

fig_impressions = px.bar(impressions_distribution, x='Hashtag',
                         y='Impressions',
                         title='Impressions Distribution for Each Hashtag')

fig_likes.show()
fig_impressions.show()

*I want to thank Aman Kharwal for this practice project, resources, and guidance.*