# Pandas Deep Dive: Techniques for Data Wrangling and Analysis

### Data from [Kaggle](https://www.kaggle.com/datasets/shreyasajal/linkedin-influencers-data)

### About Dataset:
This dataset contains LinkedIn influencers' post details along with other attributes (both post-dependent and post-independent) for each post. It can be used to analyze LinkedIn reach based on post content and related account details.


In [1]:
# imports and notebook-wide display settings

import pandas as pd
import numpy as np

# Limit how much of a frame is rendered in cell outputs:
# at most 50 columns, 500 rows, and 20 characters per cell value.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 20

## Data Inspection

In [None]:
# Load the LinkedIn influencers dataset from a CSV file in the working directory
df = pd.read_csv('influencers_data.csv')

In [3]:
# Overview of the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34012 entries, 0 to 34011
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         34012 non-null  int64  
 1   name               34012 non-null  object 
 2   headline           34012 non-null  object 
 3   location           31740 non-null  object 
 4   followers          33970 non-null  float64
 5   connections        25713 non-null  object 
 6   about              34012 non-null  object 
 7   time_spent         34011 non-null  object 
 8   content            31996 non-null  object 
 9   content_links      34012 non-null  object 
 10  media_type         26779 non-null  object 
 11  media_url          34012 non-null  object 
 12  num_hashtags       34012 non-null  int64  
 13  hashtag_followers  34012 non-null  int64  
 14  hashtags           34012 non-null  object 
 15  reactions          34012 non-null  int64  
 16  comments           340

In [52]:
# (rows, columns) of the frame.
# NOTE(review): the output recorded below reports 18 columns while df.info()
# above lists 19, so this cell appears to have been executed after a column
# had already been dropped - re-run the notebook top to bottom to refresh it.
df.shape

(34012, 18)

In [54]:
# Preview the first five rows
df.head()

Unnamed: 0,name,headline,location,followers,connections,about,time_spent,content,content_links,media_type,media_url,num_hashtags,hashtag_followers,hashtags,reactions,comments,views,votes
0,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman f...,1 day ago,Robert Lerman w...,[['https://www.l...,article,['https://www.ur...,4,0,[['#workbasedlea...,12,1,,
1,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman f...,1 week ago,National disabil...,[['https://www.l...,,[],0,0,[],11,0,,
2,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman f...,2 months ago,,[],,[],0,0,[],15,0,,
3,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman f...,2 months ago,Exploring in thi...,[['https://www.l...,article,['https://www.tl...,4,0,[['#careerplanni...,44,0,,
4,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman f...,2 months ago,I count myself f...,[['https://www.l...,article,['https://gritda...,3,0,[['#verifiedresu...,22,2,,


In [4]:
# Drop the leftover CSV index column by name. Dropping by position
# (df.columns[0]) with inplace=True is not re-runnable: executing the cell a
# second time would silently remove the next real column ('name').
# errors='ignore' makes a repeated run a harmless no-op.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,followers,num_hashtags,hashtag_followers,reactions,comments,views
count,33970.0,34012.0,34012.0,34012.0,34012.0,0.0
mean,1125922.0,2.099788,0.0,472.956486,26.977273,
std,3057750.0,3.517457,0.0,4163.929944,216.364372,
min,171.0,0.0,0.0,0.0,0.0,
25%,99148.0,0.0,0.0,7.0,0.0,
50%,408254.0,0.0,0.0,36.0,2.0,
75%,719334.0,3.0,0.0,143.0,14.0,
max,18289350.0,48.0,0.0,391498.0,32907.0,


In [6]:
# Every value in 'views' is missing (describe() reports a count of 0), so the
# column carries no information and is dropped.
# Reassigning instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without raising a KeyError.
df = df.drop(columns='views', errors='ignore')

In [7]:
# Store the free-text profile columns as categoricals to reduce memory usage
# (the same text is repeated on every post of an influencer, as df.head() shows)
df['about'] = df['about'].astype('category')
df['headline'] = df['headline'].astype('category')

In [8]:
# 'connections' is read as text and contains the literal value '500+'.
# Four equivalent ways of rewriting '500+' as '501' are shown below. Each one
# starts from the same untouched values, so every technique really performs
# the replacement instead of running on data that a previous line already fixed.
connections_raw = df['connections'].copy()

# Technique 1: boolean mask with .loc assignment
via_loc = connections_raw.copy()
via_loc.loc[via_loc == '500+'] = '501'

# Technique 2: Series.replace
via_replace = connections_raw.replace('500+', '501')

# Technique 3: element-wise apply
via_apply = connections_raw.apply(lambda x: '501' if x == '500+' else x)

# Technique 4: numpy.where (returns a bare array, so the index is re-attached)
via_where = pd.Series(
    np.where(connections_raw == '500+', '501', connections_raw),
    index=connections_raw.index,
)

# Sanity check: all four techniques must agree (missing values are compared as '')
for candidate in (via_replace, via_apply, via_where):
    assert candidate.fillna('').eq(via_loc.fillna('')).all()

# Convert the 'connections' column to a nullable integer dtype;
# anything that cannot be parsed as a number becomes missing
df['connections'] = pd.to_numeric(via_loc, errors='coerce').astype(pd.Int32Dtype())

## Examples of Data Manipulation with Pandas

In [None]:
# Filtering and Selection
filtered_df = df[df['followers'] > 1000]  # Boolean mask: rows where followers > 1000
queried_df = df.query('connections > 1500')  # Same kind of filter written as a query string
# 'content' has missing values; na=False makes them count as "no match", so the
# result is a clean boolean mask that can be used for indexing
loc_selected_df = df.loc[df['content'].str.contains('inspiring', case=False, na=False)]
# .iloc is positional, so it is paired with argmax() (a position) rather than
# idxmax() (an index label)
iloc_selected_df = df.iloc[df['comments'].argmax()]  # Row with the maximum comments
isin_df = df[df['name'].isin(['Influencer B', 'Influencer C'])]  # Rows whose name is in the list
# Compare element-wise first and reduce with any() afterwards; `.any() > 70`
# would compare a single boolean with 70 and always be False.
# 'votes' is coerced to numbers because it is not listed among the numeric
# columns in describe() - presumably it is stored as text; verify.
has_high_votes = pd.to_numeric(df['votes'], errors='coerce').gt(70).any()

# Conditional Replacements
# Set followers to 0 where followers <= 10. `where` KEEPS the values whose
# condition is True, so the condition is negated, and it is applied to the
# 'followers' column only (calling it on the whole frame would overwrite every
# column of the non-matching rows).
replaced_df = df.assign(followers=df['followers'].where(~(df['followers'] <= 10), 0))
# Blank out (set to NaN) every row whose connections exceed 300; rows with a
# missing connections value are treated as "not above 300" and left untouched
masked_df = df.mask((df['connections'] > 300).fillna(False))
replaced_vals_df = df.replace('Great post!', 'Awesome content')  # Replace 'Great post!' with 'Awesome content'
name_length_df = df['name'].str.len()  # Number of characters in each name
# Label every known connections value as 'Connected' (> 300) or 'Not Connected';
# rows with a missing connections value are left out of the result
connection_status_df = df['connections'].dropna().map(lambda x: 'Connected' if x > 300 else 'Not Connected')


# Grouping and Aggregation
by_influencer = df.groupby('name')
# One row per influencer: largest follower count and total reactions
grouped_agg_df = by_influencer.agg(followers=('followers', 'max'), reactions=('reactions', 'sum'))
# Same length as df: each post receives its influencer's mean comment count
grouped_transform_df = by_influencer['comments'].transform('mean')
# Keep only the posts of influencers whose summed followers exceed 1000
grouped_filter_df = by_influencer.filter(lambda group: group['followers'].sum() > 1000)

# Sorting and Ranking
# Order rows by 'votes', highest first
sorted_df = df.sort_values(by='votes', ascending=False)
# The two rows with the most comments
largest_df = df.nlargest(n=2, columns='comments')
# Rank the values of each column, with the largest value ranked first
ranked_df = df.rank(axis=0, ascending=False)

# Dropping and Deleting
# Remove the 'content' and 'votes' columns
dropped_cols_df = df.drop(['content', 'votes'], axis='columns')
# Remove the row labelled 0 (the first row of the default index)
dropped_rows_df = df.drop(0, axis='index')
# Keep the first occurrence of every fully duplicated row
deduplicated_df = df.drop_duplicates(keep='first')
# Remove every row that has at least one missing value
droppedna_df = df.dropna(axis='index', how='any')

# Boolean Indexing
# `~` negates the mask: keep the rows that do NOT have connections > 1500
inverted_condition_df = df[~df['connections'].gt(1500)]

# String Manipulation based on Condition
names = df['name']
# Names beginning with the prefix 'Influencer'
start_with_df = df[names.str.startswith('Influencer')]
# Names whose last character is 'A'
end_with_df = df[names.str.endswith('A')]
# Names whose first character is A, B or C (str.match anchors at the start)
regex_match_df = df[names.str.match('[A-C]')]

# Handling Null Values
dropna_subset_df = df.dropna(subset=['content_links'])  # Drop rows with missing values in 'content_links'
filledna_df = df.fillna({'votes': 0, 'media_type': 'None'})  # Fill missing 'votes' with 0 and 'media_type' with 'None'
# Interpolation is only meaningful for continuous numeric data. Calling
# interpolate() on the whole frame also hits the text/category columns, which
# recent pandas versions warn about or reject, so it is limited to the float
# columns; every other column is carried over unchanged.
float_cols = df.select_dtypes(include='float').columns
interpolated_df = df.copy()
interpolated_df[float_cols] = interpolated_df[float_cols].interpolate()  # Linear interpolation of missing values

# Datetime Conditions
from datetime import datetime
# The dataset has no date column, so the examples below are left commented out;
# they are kept only as a reference for how datetime filtering is written.
# df['date_column'] = [datetime(2023, 1, 1), datetime(2023, 1, 2), datetime(2023, 1, 3)]
# date_filtered_df = df[df['date_column'] > '2023-01-02']  # Filter rows based on datetime condition
# date_range_filtered_df = df[df['date_column'].between('2023-01-01', '2023-01-02')]  # Filter rows within a datetime range

# Combining Conditions
well_connected = df['connections'].gt(200)
# Logical AND: more than 1000 followers and more than 200 connections
combined_and_df = df[df['followers'].gt(1000) & well_connected]
# Logical OR: more than 750100000 followers or more than 200 connections
combined_or_df = df[df['followers'].gt(750100000) | well_connected]


