In [1]:
# Import Required Libraries
from decimal import Decimal

import numpy as np
import pandas as pd

In [2]:
# Read the raw influencer export into the working DataFrame
df = pd.read_csv(filepath_or_buffer="../data/instagram_influencers_raw.csv")

In [3]:
# Preview the first rows to see how the raw values are formatted
df.head()

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States


In [4]:
# Understand the dataset: column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   rank               200 non-null    int64 
 1   channel_info       200 non-null    object
 2   influence_score    200 non-null    int64 
 3   posts              200 non-null    object
 4   followers          200 non-null    object
 5   avg_likes          200 non-null    object
 6   60_day_eng_rate    200 non-null    object
 7   new_post_avg_like  200 non-null    object
 8   total_likes        200 non-null    object
 9   country            138 non-null    object
dtypes: int64(2), object(8)
memory usage: 15.8+ KB


In [5]:
# Summary statistics; only columns pandas already parsed as numeric are included
df.describe()

Unnamed: 0,rank,influence_score
count,200.0,200.0
mean,100.5,81.82
std,57.879185,8.878159
min,1.0,22.0
25%,50.75,80.0
50%,100.5,84.0
75%,150.25,86.0
max,200.0,93.0


In [6]:
# Rename Columns
# Rename by name rather than by position, so a raw file whose column order
# differs cannot be silently mislabelled. Columns not listed here keep their
# original names, and re-running the cell is a no-op.
df = df.rename(
    columns={
        "channel_info": "username",
        "60_day_eng_rate": "engagement_rate_60_days",
        "new_post_avg_like": "new_post_avg_likes",
    }
)

In [7]:
# Confirm the new column names
df.columns

Index(['rank', 'username', 'influence_score', 'posts', 'followers',
       'avg_likes', 'engagement_rate_60_days', 'new_post_avg_likes',
       'total_likes', 'country'],
      dtype='object')

In [8]:
# Count missing values per column before handling them
df.isna().sum()

rank                        0
username                    0
influence_score             0
posts                       0
followers                   0
avg_likes                   0
engagement_rate_60_days     0
new_post_avg_likes          0
total_likes                 0
country                    62
dtype: int64

In [9]:
# Replace missing country entries with an explicit "Unknown" label
df["country"] = df["country"].fillna(value="Unknown")

In [10]:
# Re-count missing values to confirm none remain after the fill
df.isna().sum()

rank                       0
username                   0
influence_score            0
posts                      0
followers                  0
avg_likes                  0
engagement_rate_60_days    0
new_post_avg_likes         0
total_likes                0
country                    0
dtype: int64

In [11]:
# List the distinct country labels to spot inconsistencies
df['country'].unique()

array(['Spain', 'United States', 'Unknown', 'Canada', 'Brazil',
       'Netherlands', 'United Kingdom', 'India', 'Uruguay', 'Turkey',
       'Indonesia', 'Colombia', 'France', 'Australia', 'Italy',
       'United Arab Emirates', 'Puerto Rico', "CÃ´te d'Ivoire",
       'Anguilla', 'Switzerland', 'Sweden', 'British Virgin Islands',
       'Czech Republic', 'Mexico', 'Germany', 'Russia'], dtype=object)

In [12]:
# Clean Country Column
# Only trim surrounding whitespace. The labels are already correctly cased,
# and str.title() would corrupt them: it upper-cases the letter after every
# non-letter character (e.g. "d'Ivoire" -> "D'Ivoire") and lower-cases the rest.
# The one mis-encoded label seen in the data (it looks like UTF-8 bytes read as
# Latin-1) is mapped back to its proper spelling; the mapping is a no-op when
# that label is absent.
df["country"] = (
    df["country"]
    .str.strip()
    .replace({"CÃ´te d'Ivoire": "Côte d'Ivoire"})
)

In [13]:
# Preview the frame after the country cleanup
df.head()

Unnamed: 0,rank,username,influence_score,posts,followers,avg_likes,engagement_rate_60_days,new_post_avg_likes,total_likes,country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,Unknown
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States


In [14]:
# Convert K / M / B values to Pure Numbers
def convert_to_number(value):
    """Convert an abbreviated count such as "3.3k", "475.8m" or "29.0b" to a float.

    Parameters
    ----------
    value : object
        A string holding a number with an optional trailing ``k`` (thousand),
        ``m`` (million) or ``b`` (billion) suffix, in any letter case and with
        optional surrounding whitespace. Non-string values are returned as-is.

    Returns
    -------
    float or object
        The expanded number as a float for string input; the untouched
        ``value`` for anything that is not a string.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed as a number.
    """
    if not isinstance(value, str):
        return value

    text = value.lower().strip()
    multiplier = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}.get(text[-1:])
    if multiplier is None:
        return float(text)

    # Drop only the trailing suffix; stripping every occurrence of the letter
    # would silently accept malformed input such as "1k2k".
    digits = text[:-1]
    parsed = float(digits)  # validates the text; raises ValueError if it is bad

    # Scale in decimal arithmetic: with binary floats 1.001 * 1000 evaluates to
    # 1000.9999999999999, which a later integer cast would truncate to 1000.
    try:
        return float(Decimal(digits) * multiplier)
    except ArithmeticError:
        # Decimal rejected or overflowed on text that float() accepted.
        return parsed * multiplier

In [15]:
# Expand the abbreviated counts in every column that uses k/m/b suffixes
cols_to_convert = ["posts", "followers", "avg_likes", "new_post_avg_likes", "total_likes"]

for col in cols_to_convert:
    df[col] = df[col].map(convert_to_number)

In [16]:
# Convert whole-number count columns to 64-bit integers.
# Round first: the values come from float multiplication, and a plain integer
# cast truncates, so a result like 1000.9999999999999 would become 1000.
# The dtype is spelled out as "int64" because astype(int) maps to a 32-bit
# integer on Windows with NumPy < 2, which cannot hold totals in the billions.
df['posts'] = df['posts'].round().astype("int64")
df['followers'] = df['followers'].round().astype("int64")
df['total_likes'] = df['total_likes'].round().astype("int64")

In [17]:
# Check the dtypes after the numeric conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   rank                     200 non-null    int64  
 1   username                 200 non-null    object 
 2   influence_score          200 non-null    int64  
 3   posts                    200 non-null    int64  
 4   followers                200 non-null    int64  
 5   avg_likes                200 non-null    float64
 6   engagement_rate_60_days  200 non-null    object 
 7   new_post_avg_likes       200 non-null    float64
 8   total_likes              200 non-null    int64  
 9   country                  200 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 15.8+ KB


In [18]:
# Verify the conversion on the first rows of the converted columns
df[cols_to_convert].head()

Unnamed: 0,posts,followers,avg_likes,new_post_avg_likes,total_likes
0,3300,475800000,8700000.0,6500000.0,29000000000
1,6900,366200000,8300000.0,5900000.0,57400000000
2,890,357300000,6800000.0,4400000.0,6000000000
3,1800,342700000,6200000.0,3300000.0,11500000000
4,6800,334100000,1900000.0,665300.0,12500000000


In [19]:
# Strip the percent sign and store the engagement rate as a float.
# The value stays in percentage points (e.g. "1.39%" becomes 1.39).
df["engagement_rate_60_days"] = (
    df["engagement_rate_60_days"].astype(str).str.replace("%", "", regex=False)
).astype(float)

In [20]:
# Treat any engagement rate that is still missing as 0
df["engagement_rate_60_days"] = df["engagement_rate_60_days"].fillna(value=0)

In [21]:
# Verify the result: values should now be plain floats without "%"
df["engagement_rate_60_days"].head(10)

0    1.39
1    1.62
2    1.24
3    0.97
4    0.20
5    0.88
6    1.20
7    0.76
8    0.35
9    0.59
Name: engagement_rate_60_days, dtype: float64

In [22]:
# Confirm the engagement rate column is now a numeric dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   rank                     200 non-null    int64  
 1   username                 200 non-null    object 
 2   influence_score          200 non-null    int64  
 3   posts                    200 non-null    int64  
 4   followers                200 non-null    int64  
 5   avg_likes                200 non-null    float64
 6   engagement_rate_60_days  200 non-null    float64
 7   new_post_avg_likes       200 non-null    float64
 8   total_likes              200 non-null    int64  
 9   country                  200 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 15.8+ KB


In [23]:
# Number of fully duplicated rows (every column identical to an earlier row)
df.duplicated().sum()

np.int64(0)

In [24]:
# Final look at the cleaned rows before export
df.head()

Unnamed: 0,rank,username,influence_score,posts,followers,avg_likes,engagement_rate_60_days,new_post_avg_likes,total_likes,country
0,1,cristiano,92,3300,475800000,8700000.0,1.39,6500000.0,29000000000,Spain
1,2,kyliejenner,91,6900,366200000,8300000.0,1.62,5900000.0,57400000000,United States
2,3,leomessi,90,890,357300000,6800000.0,1.24,4400000.0,6000000000,Unknown
3,4,selenagomez,93,1800,342700000,6200000.0,0.97,3300000.0,11500000000,United States
4,5,therock,91,6800,334100000,1900000.0,0.2,665300.0,12500000000,United States


In [25]:
# Final schema check before export
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   rank                     200 non-null    int64  
 1   username                 200 non-null    object 
 2   influence_score          200 non-null    int64  
 3   posts                    200 non-null    int64  
 4   followers                200 non-null    int64  
 5   avg_likes                200 non-null    float64
 6   engagement_rate_60_days  200 non-null    float64
 7   new_post_avg_likes       200 non-null    float64
 8   total_likes              200 non-null    int64  
 9   country                  200 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 15.8+ KB


In [26]:
# Save the cleaned data; index=False keeps the row index out of the file
df.to_csv(path_or_buf="../data/instagram_influencers_cleaned.csv", index=False)